diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6347,12 +6347,55 @@
             (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
               (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
             ssub))>;
-
 def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
           (i64 (EXTRACT_SUBREG
             (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
               (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
             dsub))>;
+
+def : Pat<(v16i8 (vector_insert v16i8:$src, (i32 (intOp (v8i8 V64:$Rm))), imm:$Immd)),
+          (INSvi8lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rm), hsub), 0)>;
+
+def : Pat<(v8i16 (vector_insert v8i16:$src, (i32 (intOp (v8i8 V64:$Rm))), imm:$Immd)),
+          (INSvi16lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rm), hsub), 0)>;
+def : Pat<(v8i16 (vector_insert v8i16:$src, (i32 (intOp (v16i8 V128:$Rm))), imm:$Immd)),
+          (INSvi16lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF),
+            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rm), hsub), 0)>;
+def : Pat<(v8i16 (vector_insert v8i16:$src, (i32 (intOp (v4i16 V64:$Rm))), imm:$Immd)),
+          (INSvi16lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rm), ssub), 0)>;
+def : Pat<(v8i16 (vector_insert v8i16:$src, (i32 (intOp (v8i16 V128:$Rm))), imm:$Immd)),
+          (INSvi16lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rm), ssub), 0)>;
+
+def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (intOp (v8i8 V64:$Rm))), imm:$Immd)),
+          (INSvi32lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rm), hsub), 0)>;
+def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (intOp (v16i8 V128:$Rm))), imm:$Immd)),
+          (INSvi32lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF),
+            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rm), hsub), 0)>;
+def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (intOp (v4i16 V64:$Rm))), imm:$Immd)),
+          (INSvi32lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rm), ssub), 0)>;
+def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (intOp (v8i16 V128:$Rm))), imm:$Immd)),
+          (INSvi32lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rm), ssub), 0)>;
+
+def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (intOp (v4i32 V128:$Rm))), imm:$Immd)),
+          (INSvi64lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rm), dsub), 0)>;
+
+def : Pat<(v16i8 (vector_insert v16i8:$src, (i32 (trunc (i64 (intOp (v4i32 V128:$Rm))))), imm:$Immd)),
+          (INSvi8lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rm), dsub), 0)>;
+def : Pat<(v8i16 (vector_insert v8i16:$src, (i32 (trunc (i64 (intOp (v4i32 V128:$Rm))))), imm:$Immd)),
+          (INSvi32lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rm), dsub), 0)>;
+def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (trunc (i64 (intOp (v4i32 V128:$Rm))))), imm:$Immd)),
+          (INSvi32lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rm), dsub), 0)>;
 }

 defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>;
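
; NOTE: Illustrative sketch, not part of the patch. The new patterns above fire when
; the scalar result of the across-lanes intrinsic (e.g. @llvm.aarch64.neon.uaddlv)
; feeds an insertelement, so the value can be moved lane-to-lane with INS instead of
; round-tripping through a GPR (fmov w8, s0 followed by mov.s v1[0], w8). The function
; name below is hypothetical; the intrinsic and types match the v8i16 source case.
define <4 x i32> @uaddlv_into_lane(<8 x i16> %a, <4 x i32> %b) {
  %sum = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
  %ins = insertelement <4 x i32> %b, i32 %sum, i64 0
  ret <4 x i32> %ins
}
declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
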
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -13,8 +13,7 @@
 ; CHECK-NEXT: movi.2d v0, #0000000000000000
 ; CHECK-NEXT: movi d1, #0000000000000000
 ; CHECK-NEXT: uaddlv.8h s0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov.s v1[0], w8
+; CHECK-NEXT: mov.s v1[0], v0[0]
 ; CHECK-NEXT: ucvtf.2s v0, v1
 ; CHECK-NEXT: str d0, [x0]
 ; CHECK-NEXT: ret
@@ -32,8 +31,7 @@
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: movi.2d v0, #0000000000000000
 ; CHECK-NEXT: uaddlv.8h s1, v0
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.s v0[0], w8
+; CHECK-NEXT: mov.s v0[0], v1[0]
 ; CHECK-NEXT: ucvtf.4s v0, v0
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
@@ -50,12 +48,11 @@
 ; CHECK-LABEL: _insert_vec_v16i32_uaddlv_from_v8i16:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: movi.2d v2, #0000000000000000
-; CHECK-NEXT: uaddlv.8h s1, v0
+; CHECK-NEXT: movi.2d v1, #0000000000000000
+; CHECK-NEXT: uaddlv.8h s2, v0
 ; CHECK-NEXT: stp q0, q0, [x0, #32]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.s v2[0], w8
-; CHECK-NEXT: ucvtf.4s v1, v2
+; CHECK-NEXT: mov.s v1[0], v2[0]
+; CHECK-NEXT: ucvtf.4s v1, v1
 ; CHECK-NEXT: stp q1, q0, [x0]
 ; CHECK-NEXT: ret
@@ -71,16 +68,15 @@
 ; CHECK-LABEL: _insert_vec_v23i32_uaddlv_from_v8i16:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: movi.2d v2, #0000000000000000
-; CHECK-NEXT: uaddlv.8h s1, v0
+; CHECK-NEXT: add x8, x0, #88
+; CHECK-NEXT: movi.2d v1, #0000000000000000
+; CHECK-NEXT: uaddlv.8h s2, v0
 ; CHECK-NEXT: stp q0, q0, [x0, #16]
 ; CHECK-NEXT: stp q0, q0, [x0, #48]
-; CHECK-NEXT: str d0, [x0, #80]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.s v2[0], w8
-; CHECK-NEXT: add x8, x0, #88
 ; CHECK-NEXT: st1.s { v0 }[2], [x8]
-; CHECK-NEXT: ucvtf.4s v1, v2
+; CHECK-NEXT: mov.s v1[0], v2[0]
+; CHECK-NEXT: str d0, [x0, #80]
+; CHECK-NEXT: ucvtf.4s v1, v1
 ; CHECK-NEXT: str q1, [x0]
 ; CHECK-NEXT: ret
@@ -98,8 +94,7 @@
 ; CHECK-NEXT: movi.2d v0, #0000000000000000
 ; CHECK-NEXT: movi d1, #0000000000000000
 ; CHECK-NEXT: uaddlv.16b h0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov.s v1[0], w8
+; CHECK-NEXT: mov.s v1[0], v0[0]
 ; CHECK-NEXT: ucvtf.2s v0, v1
 ; CHECK-NEXT: str d0, [x0]
 ; CHECK-NEXT: ret
@@ -118,8 +113,7 @@
 ; CHECK-NEXT: movi.2d v0, #0000000000000000
 ; CHECK-NEXT: movi d1, #0000000000000000
 ; CHECK-NEXT: uaddlv.8b h0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov.s v1[0], w8
+; CHECK-NEXT: mov.s v1[0], v0[0]
 ; CHECK-NEXT: ucvtf.2s v0, v1
 ; CHECK-NEXT: str d0, [x0]
 ; CHECK-NEXT: ret
@@ -138,8 +132,7 @@
 ; CHECK-NEXT: movi.2d v0, #0000000000000000
 ; CHECK-NEXT: movi d1, #0000000000000000
 ; CHECK-NEXT: uaddlv.4h s0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov.s v1[0], w8
+; CHECK-NEXT: mov.s v1[0], v0[0]
 ; CHECK-NEXT: ucvtf.2s v0, v1
 ; CHECK-NEXT: str d0, [x0]
 ; CHECK-NEXT: ret
@@ -157,12 +150,11 @@
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: movi.2d v1, #0000000000000000
 ; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: movi.2d v3, #0000000000000000
-; CHECK-NEXT: uaddlv.4s d2, v1
+; CHECK-NEXT: movi.2d v2, #0000000000000000
+; CHECK-NEXT: uaddlv.4s d3, v1
 ; CHECK-NEXT: str d1, [x0, #16]
-; CHECK-NEXT: fmov x8, d2
-; CHECK-NEXT: mov.d v3[0], x8
-; CHECK-NEXT: ucvtf.2d v2, v3
+; CHECK-NEXT: mov.d v2[0], v3[0]
+; CHECK-NEXT: ucvtf.2d v2, v2
 ; CHECK-NEXT: fcvtn v2.2s, v2.2d
 ; CHECK-NEXT: mov.d v2[1], v0[0]
 ; CHECK-NEXT: str q2, [x0]
@@ -181,8 +173,7 @@
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: movi.2d v0, #0000000000000000
 ; CHECK-NEXT: uaddlv.4s d1, v0
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: mov.d v0[0], x8
+; CHECK-NEXT: mov.d v0[0], v1[0]
 ; CHECK-NEXT: ucvtf.2d v0, v0
 ; CHECK-NEXT: fcvtn v0.2s, v0.2d
 ; CHECK-NEXT: str d0, [x0]
@@ -203,8 +194,7 @@
 ; CHECK-NEXT: str wzr, [x0, #16]
 ; CHECK-NEXT: movi d0, #0000000000000000
 ; CHECK-NEXT: uaddlv.4s d2, v1
-; CHECK-NEXT: fmov x8, d2
-; CHECK-NEXT: mov.d v1[0], x8
+; CHECK-NEXT: mov.d v1[0], v2[0]
 ; CHECK-NEXT: ucvtf.2d v1, v1
 ; CHECK-NEXT: fcvtn v1.2s, v1.2d
 ; CHECK-NEXT: mov.d v1[1], v0[0]
@@ -226,8 +216,7 @@
 ; CHECK-NEXT: stp xzr, xzr, [x0, #16]
 ; CHECK-NEXT: movi d0, #0000000000000000
 ; CHECK-NEXT: uaddlv.8h s1, v1
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.h v0[0], w8
+; CHECK-NEXT: mov.h v0[0], v1[0]
 ; CHECK-NEXT: ushll.4s v0, v0, #0
 ; CHECK-NEXT: ucvtf.4s v0, v0
 ; CHECK-NEXT: str q0, [x0]
@@ -246,11 +235,10 @@
 ; CHECK-LABEL: _insert_vec_v3i16_uaddlv_from_v8i16:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: movi.2d v1, #0000000000000000
+; CHECK-NEXT: add x8, x0, #8
 ; CHECK-NEXT: movi d0, #0000000000000000
 ; CHECK-NEXT: uaddlv.8h s1, v1
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.h v0[0], w8
-; CHECK-NEXT: add x8, x0, #8
+; CHECK-NEXT: mov.h v0[0], v1[0]
 ; CHECK-NEXT: ushll.4s v0, v0, #0
 ; CHECK-NEXT: ucvtf.4s v0, v0
 ; CHECK-NEXT: st1.s { v0 }[2], [x8]
@@ -271,12 +259,11 @@
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: movi.2d v1, #0000000000000000
 ; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: movi.2d v3, #0000000000000000
-; CHECK-NEXT: uaddlv.4h s2, v1
+; CHECK-NEXT: movi.2d v2, #0000000000000000
+; CHECK-NEXT: uaddlv.4h s3, v1
 ; CHECK-NEXT: stp q1, q1, [x0, #32]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov.s v3[0], w8
-; CHECK-NEXT: ucvtf.2d v2, v3
+; CHECK-NEXT: mov.s v2[0], v3[0]
+; CHECK-NEXT: ucvtf.2d v2, v2
 ; CHECK-NEXT: fcvtn v2.2s, v2.2d
 ; CHECK-NEXT: mov.d v2[1], v0[0]
 ; CHECK-NEXT: stp q2, q1, [x0]
@@ -298,8 +285,7 @@
 ; CHECK-NEXT: movi d0, #0000000000000000
 ; CHECK-NEXT: uaddlv.8b h2, v1
 ; CHECK-NEXT: stp q1, q1, [x0, #32]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov.b v0[0], w8
+; CHECK-NEXT: mov.b v0[0], v2[0]
 ; CHECK-NEXT: zip1.8b v0, v0, v0
 ; CHECK-NEXT: bic.4h v0, #255, lsl #8
 ; CHECK-NEXT: ushll.4s v0, v0, #0
@@ -323,8 +309,7 @@
 ; CHECK-NEXT: stp xzr, xzr, [x0, #16]
 ; CHECK-NEXT: movi d0, #0000000000000000
 ; CHECK-NEXT: uaddlv.8b h1, v1
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.h v0[0], w8
+; CHECK-NEXT: mov.h v0[0], v1[0]
 ; CHECK-NEXT: bic.4h v0, #255, lsl #8
 ; CHECK-NEXT: ushll.4s v0, v0, #0
 ; CHECK-NEXT: ucvtf.4s v0, v0
@@ -348,8 +333,7 @@
 ; CHECK-NEXT: movi d0, #0000000000000000
 ; CHECK-NEXT: stp xzr, xzr, [x0, #32]
 ; CHECK-NEXT: uaddlv.4h s1, v1
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.h v0[0], w8
+; CHECK-NEXT: mov.h v0[0], v1[0]
 ; CHECK-NEXT: ushll.4s v0, v0, #0
 ; CHECK-NEXT: ucvtf.4s v0, v0
 ; CHECK-NEXT: str q0, [x0]
@@ -370,8 +354,7 @@
 ; CHECK-NEXT: movi.2d v0, #0000000000000000
 ; CHECK-NEXT: stp xzr, xzr, [x0, #16]
 ; CHECK-NEXT: uaddlv.4s d1, v0
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: mov.s v0[0], w8
+; CHECK-NEXT: mov.s v0[0], v1[0]
 ; CHECK-NEXT: ucvtf.4s v0, v0
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
@@ -389,12 +372,11 @@
 ; CHECK-LABEL: _insert_vec_v16i32_uaddlv_from_v4i32:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: movi.2d v2, #0000000000000000
-; CHECK-NEXT: uaddlv.4s d1, v0
+; CHECK-NEXT: movi.2d v1, #0000000000000000
+; CHECK-NEXT: uaddlv.4s d2, v0
 ; CHECK-NEXT: stp q0, q0, [x0, #32]
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: mov.s v2[0], w8
-; CHECK-NEXT: ucvtf.4s v1, v2
+; CHECK-NEXT: mov.s v1[0], v2[0]
+; CHECK-NEXT: ucvtf.4s v1, v1
 ; CHECK-NEXT: stp q1, q0, [x0]
 ; CHECK-NEXT: ret
@@ -413,8 +395,7 @@
 ; CHECK-NEXT: movi.2d v0, #0000000000000000
 ; CHECK-NEXT: movi d1, #0000000000000000
 ; CHECK-NEXT: uaddlv.4s d0, v0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: mov.h v1[0], w8
+; CHECK-NEXT: mov.s v1[0], v0[0]
 ; CHECK-NEXT: ushll.4s v0, v1, #0
 ; CHECK-NEXT: ucvtf.4s v0, v0
 ; CHECK-NEXT: str q0, [x0]
@@ -436,8 +417,7 @@
 ; CHECK-NEXT: movi d0, #0000000000000000
 ; CHECK-NEXT: uaddlv.4s d2, v1
 ; CHECK-NEXT: stp q1, q1, [x0, #32]
-; CHECK-NEXT: fmov x8, d2
-; CHECK-NEXT: mov.h v0[0], w8
+; CHECK-NEXT: mov.s v0[0], v2[0]
 ; CHECK-NEXT: ushll.4s v0, v0, #0
 ; CHECK-NEXT: ucvtf.4s v0, v0
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -459,8 +439,7 @@
 ; CHECK-NEXT: stp xzr, xzr, [x0, #16]
 ; CHECK-NEXT: movi d0, #0000000000000000
 ; CHECK-NEXT: uaddlv.4s d1, v1
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: mov.h v0[0], w8
+; CHECK-NEXT: mov.s v0[0], v1[0]
 ; CHECK-NEXT: bic.4h v0, #255, lsl #8
 ; CHECK-NEXT: ushll.4s v0, v0, #0
 ; CHECK-NEXT: ucvtf.4s v0, v0
@@ -483,8 +462,7 @@
 ; CHECK-NEXT: movi d0, #0000000000000000
 ; CHECK-NEXT: uaddlv.4s d2, v1
 ; CHECK-NEXT: stp q1, q1, [x0, #32]
-; CHECK-NEXT: fmov x8, d2
-; CHECK-NEXT: mov.b v0[0], w8
+; CHECK-NEXT: mov.b v0[0], v2[0]
 ; CHECK-NEXT: zip1.8b v0, v0, v0
 ; CHECK-NEXT: bic.4h v0, #255, lsl #8
 ; CHECK-NEXT: ushll.4s v0, v0, #0
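
; NOTE: Illustrative sketch, not part of the patch. The trunc patterns added in the
; AArch64InstrInfo.td hunk above cover the v4i32 source, where the intrinsic returns
; an i64 that the IR truncates before the insert; the CHECK updates for the
; *_uaddlv_from_v4i32 tests reflect that case. The function name is hypothetical.
define <4 x i32> @uaddlv_trunc_into_lane(<4 x i32> %a, <4 x i32> %b) {
  %sum = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> %a)
  %t = trunc i64 %sum to i32
  %ins = insertelement <4 x i32> %b, i32 %t, i64 0
  ret <4 x i32> %ins
}
declare i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32>)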