Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17133,11 +17133,11 @@
   SDLoc DL(N);
 
   EVT VT = InVec.getValueType();
-  unsigned NumElts = VT.getVectorNumElements();
   auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
 
   // Insert into out-of-bounds element is undefined.
-  if (IndexC && IndexC->getZExtValue() >= VT.getVectorNumElements())
+  if (IndexC && VT.isFixedLengthVector() &&
+      IndexC->getZExtValue() >= VT.getVectorNumElements())
     return DAG.getUNDEF(VT);
 
   // Remove redundant insertions:
@@ -17150,12 +17150,21 @@
     // If this is variable insert to undef vector, it might be better to splat:
     // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
     if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) {
-      SmallVector<SDValue, 8> Ops(NumElts, InVal);
-      return DAG.getBuildVector(VT, DL, Ops);
+      if (VT.isScalableVector())
+        return DAG.getSplatVector(VT, DL, InVal);
+      else {
+        SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), InVal);
+        return DAG.getBuildVector(VT, DL, Ops);
+      }
     }
     return SDValue();
   }
 
+  if (VT.isScalableVector())
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+
   // We must know which element is being inserted for folds below here.
   unsigned Elt = IndexC->getZExtValue();
   if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5615,8 +5615,11 @@
     llvm_unreachable("should use getVectorShuffle constructor!");
   case ISD::INSERT_VECTOR_ELT: {
     ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N3);
-    // INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF
-    if (N3C && N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
+    // INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF, except
+    // for scalable vectors where we will generate appropriate code to
+    // deal with out-of-bounds cases correctly.
+    if (N3C && N1.getValueType().isFixedLengthVector() &&
+        N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
       return getUNDEF(VT);
 
     // Undefined index can be assumed out-of-bounds, so that's UNDEF too.
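
The DAGCombiner change above only fires for a variable-index insert into undef when the target opts in via TLI.shouldSplatInsEltVarIndex(). A minimal IR sketch of the case it handles (the function name is illustrative, not part of the patch):

    ; A variable-index insert into undef is a splat of %val in every lane. For a
    ; scalable result type this must be emitted as a single SPLAT_VECTOR node
    ; (DAG.getSplatVector); expanding to a BUILD_VECTOR of
    ; VT.getVectorNumElements() copies is only meaningful for fixed-length types.
    define <vscale x 4 x i32> @splat_via_insert(i32 %val, i32 %idx) {
      %v = insertelement <vscale x 4 x i32> undef, i32 %val, i32 %idx
      ret <vscale x 4 x i32> %v
    }

The new early `return SDValue()` for scalable vectors exists for the same reason: every fold after it (combineInsertEltToShuffle and friends) reasons about a compile-time-known element count.
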
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1767,6 +1767,77 @@
   // 16-element contiguous store
   defm : st1<ST1B, ST1B_IMM, nxv16i8, AArch64st1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
 
+  def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)),
+            (INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
+  def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)),
+            (INSERT_SUBREG (nxv8i16 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
+  def : Pat<(nxv4i32 (vector_insert (nxv4i32 (undef)), (i32 FPR32:$src), 0)),
+            (INSERT_SUBREG (nxv4i32 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
+  def : Pat<(nxv2i64 (vector_insert (nxv2i64 (undef)), (i64 FPR64:$src), 0)),
+            (INSERT_SUBREG (nxv2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+
+  // Insert scalar into vector[0]
+  def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), (i32 GPR32:$src), 0)),
+            (CPY_ZPmR_B ZPR:$vec, (PTRUE_B 1), GPR32:$src)>;
+  def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), (i32 GPR32:$src), 0)),
+            (CPY_ZPmR_H ZPR:$vec, (PTRUE_H 1), GPR32:$src)>;
+  def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), (i32 GPR32:$src), 0)),
+            (CPY_ZPmR_S ZPR:$vec, (PTRUE_S 1), GPR32:$src)>;
+  def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), (i64 GPR64:$src), 0)),
+            (CPY_ZPmR_D ZPR:$vec, (PTRUE_D 1), GPR64:$src)>;
+
+  def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), 0)),
+            (SEL_ZPZZ_H (PTRUE_H 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), ZPR:$vec)>;
+  def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), 0)),
+            (SEL_ZPZZ_S (PTRUE_S 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), ZPR:$vec)>;
+  def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), 0)),
+            (SEL_ZPZZ_D (PTRUE_D 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), ZPR:$vec)>;
+
+  // Insert scalar into vector with scalar index
+  def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), GPR32:$src, GPR64:$index)),
+            (CPY_ZPmR_B ZPR:$vec,
+                        (CMPEQ_PPzZZ_B (PTRUE_B 31),
+                                       (INDEX_II_B 0, 1),
+                                       (DUP_ZR_B (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+                        GPR32:$src)>;
+  def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), GPR32:$src, GPR64:$index)),
+            (CPY_ZPmR_H ZPR:$vec,
+                        (CMPEQ_PPzZZ_H (PTRUE_H 31),
+                                       (INDEX_II_H 0, 1),
+                                       (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+                        GPR32:$src)>;
+  def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), GPR32:$src, GPR64:$index)),
+            (CPY_ZPmR_S ZPR:$vec,
+                        (CMPEQ_PPzZZ_S (PTRUE_S 31),
+                                       (INDEX_II_S 0, 1),
+                                       (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+                        GPR32:$src)>;
+  def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), GPR64:$src, GPR64:$index)),
+            (CPY_ZPmR_D ZPR:$vec,
+                        (CMPEQ_PPzZZ_D (PTRUE_D 31),
+                                       (INDEX_II_D 0, 1),
+                                       (DUP_ZR_D GPR64:$index)),
+                        GPR64:$src)>;
+
+  // Insert FP scalar into vector with scalar index
+  def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)),
+            (CPY_ZPmV_H ZPR:$vec,
+                        (CMPEQ_PPzZZ_H (PTRUE_H 31),
+                                       (INDEX_II_H 0, 1),
+                                       (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+                        $src)>;
+  def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)),
+            (CPY_ZPmV_S ZPR:$vec,
+                        (CMPEQ_PPzZZ_S (PTRUE_S 31),
+                                       (INDEX_II_S 0, 1),
+                                       (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+                        $src)>;
+  def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), GPR64:$index)),
+            (CPY_ZPmV_D ZPR:$vec,
+                        (CMPEQ_PPzZZ_D (PTRUE_D 31),
+                                       (INDEX_II_D 0, 1),
+                                       (DUP_ZR_D $index)),
+                        $src)>;
 }
 
 let Predicates = [HasSVE, HasMatMulInt8] in {
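
The variable-index patterns select the target lane by materialising a step vector with INDEX, splatting the index with DUP, and comparing the two with CMPEQ; the resulting one-lane predicate then drives a merging CPY. The constant-lane-0 patterns instead use the cheaper PTRUE immediate 1, which encodes the VL1 pattern (31 encodes ALL), so only element 0 is merged. A sketch of the expected variable-index sequence, in the same IR-plus-expected-asm style as the new test below (register assignments are illustrative, not guaranteed):

    define <vscale x 4 x i32> @insert_varlane(<vscale x 4 x i32> %v, i32 %s, i64 %idx) {
    ; With %v in z0, %s in w0 and %idx in x1, roughly:
    ;   index z1.s, #0, #1             // z1 = { 0, 1, 2, ... }
    ;   mov   z2.s, w1                 // splat low 32 bits of the index
    ;   ptrue p0.s                     // all-lanes governing predicate
    ;   cmpeq p0.s, p0/z, z1.s, z2.s   // at most one lane set
    ;   mov   z0.s, p0/m, w0           // merge %s into the selected lane
      %r = insertelement <vscale x 4 x i32> %v, i32 %s, i64 %idx
      ret <vscale x 4 x i32> %r
    }
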
Index: llvm/test/CodeGen/AArch64/sve-insert-element.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-insert-element.ll
@@ -0,0 +1,135 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 16 x i8> @test_lane0_16xi8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_lane0_16xi8
+; CHECK: mov [[REG:.*]], #30
+; CHECK: mov z0.b, p{{[0-7]}}/m, [[REG]]
+  %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 0
+  ret <vscale x 16 x i8> %b
+}
+
+define <vscale x 8 x i16> @test_lane0_8xi16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: test_lane0_8xi16
+; CHECK: mov [[REG:.*]], #30
+; CHECK: mov z0.h, p{{[0-7]}}/m, [[REG]]
+  %b = insertelement <vscale x 8 x i16> %a, i16 30, i32 0
+  ret <vscale x 8 x i16> %b
+}
+
+define <vscale x 4 x i32> @test_lane0_4xi32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: test_lane0_4xi32
+; CHECK: mov [[REG:.*]], #30
+; CHECK: mov z0.s, p{{[0-7]}}/m, [[REG]]
+  %b = insertelement <vscale x 4 x i32> %a, i32 30, i32 0
+  ret <vscale x 4 x i32> %b
+}
+
+define <vscale x 2 x i64> @test_lane0_2xi64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: test_lane0_2xi64
+; CHECK: mov w[[REG:.*]], #30
+; CHECK: mov z0.d, p{{[0-7]}}/m, x[[REG]]
+  %b = insertelement <vscale x 2 x i64> %a, i64 30, i32 0
+  ret <vscale x 2 x i64> %b
+}
+
+define <vscale x 2 x double> @test_lane0_2xf64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: test_lane0_2xf64
+; CHECK: fmov d[[REG:[0-9]+]], #1.00000000
+; CHECK: mov z0.d, p{{[0-7]}}/m, z[[REG]].d
+  %b = insertelement <vscale x 2 x double> %a, double 1.0, i32 0
+  ret <vscale x 2 x double> %b
+}
+
+define <vscale x 4 x float> @test_lane0_4xf32(<vscale x 4 x float> %a) {
+; CHECK-LABEL: test_lane0_4xf32
+; CHECK: fmov s[[REG:[0-9]+]], #1.00000000
+; CHECK: mov z0.s, p{{[0-7]}}/m, z[[REG]].s
+  %b = insertelement <vscale x 4 x float> %a, float 1.0, i32 0
+  ret <vscale x 4 x float> %b
+}
+
+define <vscale x 8 x half> @test_lane0_8xf16(<vscale x 8 x half> %a) {
+; CHECK-LABEL: test_lane0_8xf16
+; CHECK: fmov h[[REG:[0-9]+]], #1.00000000
+; CHECK: mov z0.h, p{{[0-7]}}/m, z[[REG]].h
+  %b = insertelement <vscale x 8 x half> %a, half 1.0, i32 0
+  ret <vscale x 8 x half> %b
+}
+
+; Undefined lane insert
+define <vscale x 2 x i64> @test_lane4_2xi64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: test_lane4_2xi64
+; CHECK: mov w[[IDXREG:.*]], #4
+; CHECK: index z[[CMPVEC:[0-9]+]].d, #0, #1
+; CHECK: mov z[[IDXVEC:[0-9]+]].d, x[[IDXREG]]
+; CHECK: cmpeq p[[PRED:[0-9]+]].d, p{{[0-7]}}/z, z[[CMPVEC]].d, z[[IDXVEC]].d
+; CHECK: mov w[[VALREG:.*]], #30
+; CHECK: mov z0.d, p[[PRED]]/m, x[[VALREG]]
+  %b = insertelement <vscale x 2 x i64> %a, i64 30, i32 4
+  ret <vscale x 2 x i64> %b
+}
+
+; Undefined lane insert
+define <vscale x 8 x half> @test_lane9_8xf16(<vscale x 8 x half> %a) {
+; CHECK-LABEL: test_lane9_8xf16
+; CHECK: mov w[[IDXREG:.*]], #9
+; CHECK: index z[[CMPVEC:[0-9]+]].h, #0, #1
+; CHECK: mov z[[IDXVEC:[0-9]+]].h, w[[IDXREG]]
+; CHECK: cmpeq p[[PRED:[0-9]+]].h, p{{[0-7]}}/z, z[[CMPVEC]].h, z[[IDXVEC]].h
+; CHECK: fmov h[[VALREG:[0-9]+]], #1.00000000
+; CHECK: mov z0.h, p[[PRED]]/m, h[[VALREG]]
+  %b = insertelement <vscale x 8 x half> %a, half 1.0, i32 9
+  ret <vscale x 8 x half> %b
+}
+
+define <vscale x 16 x i8> @test_lane1_16xi8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_lane1_16xi8
+; CHECK: mov w[[IDXREG:.*]], #1
+; CHECK: index z[[CMPVEC:[0-9]+]].b, #0, #1
+; CHECK: mov z[[IDXVEC:[0-9]+]].b, w[[IDXREG]]
+; CHECK: cmpeq p[[PRED:[0-9]+]].b, p{{[0-7]}}/z, z[[CMPVEC]].b, z[[IDXVEC]].b
+; CHECK: mov w[[VALREG:.*]], #30
+; CHECK: mov z0.b, p[[PRED]]/m, w[[VALREG]]
+  %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 1
+  ret <vscale x 16 x i8> %b
+}
+
+define <vscale x 16 x i8> @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_16xi8
+; CHECK: index z[[CMPVEC:[0-9]+]].b, #0, #1
+; CHECK: mov z[[IDXVEC:[0-9]+]].b, w[[IDXREG:[0-9]+]]
+; CHECK: cmpeq p[[PRED:[0-9]+]].b, p{{[0-7]}}/z, z[[CMPVEC]].b, z[[IDXVEC]].b
+; CHECK: mov w[[VALREG:.*]], #30
+; CHECK: mov z0.b, p[[PRED]]/m, w[[VALREG]]
+  %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 %x
+  ret <vscale x 16 x i8> %b
+}
+
+
+; Redundant lane insert
+define <vscale x 4 x i32> @extract_insert_4xi32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: extract_insert_4xi32
+; CHECK-NOT: mov w{{.*}}, #30
+; CHECK-NOT: mov z0.d
+  %b = extractelement <vscale x 4 x i32> %a, i32 2
+  %c = insertelement <vscale x 4 x i32> %a, i32 %b, i32 2
+  ret <vscale x 4 x i32> %c
+}
+
+define <vscale x 8 x i16> @test_lane6_undef_8xi16(i16 %a) {
+; CHECK-LABEL: test_lane6_undef_8xi16
+; CHECK: mov w[[IDXREG:.*]], #6
+; CHECK: index z[[CMPVEC:.*]].h, #0, #1
+; CHECK: mov z[[IDXVEC:[0-9]+]].h, w[[IDXREG]]
+; CHECK: cmpeq p[[PRED:.*]].h, p{{.*}}/z, z[[CMPVEC]].h, z[[IDXVEC]].h
+; CHECK: mov z0.h, p[[PRED]]/m, w0
+  %b = insertelement <vscale x 8 x i16> undef, i16 %a, i32 6
+  ret <vscale x 8 x i16> %b
+}
+
+define <vscale x 16 x i8> @test_lane0_undef_16xi8(i8 %a) {
+; CHECK-LABEL: test_lane0_undef_16xi8
+; CHECK: fmov s0, w0
+  %b = insertelement <vscale x 16 x i8> undef, i8 %a, i32 0
+  ret <vscale x 16 x i8> %b
+}
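
A consequence of disabling the out-of-bounds fold, exercised by test_lane4_2xi64 above: when the constant lane lies beyond the runtime vector length (vscale of 1 or 2, i.e. a 128- or 256-bit register, for lane 4 of <vscale x 2 x i64>), the cmpeq produces an all-false predicate and the merging mov writes no lane, so the input vector comes back unchanged; with vscale of 3 or more the lane exists and is written. A sketch duplicating the shape of that test, with an illustrative function name:

    define <vscale x 2 x i64> @oob_lane_sketch(<vscale x 2 x i64> %a) {
      ; Lane 4 is only written when the cmpeq-built predicate has a set bit,
      ; i.e. when 2 * vscale > 4; otherwise %a is returned unmodified.
      %b = insertelement <vscale x 2 x i64> %a, i64 30, i32 4
      ret <vscale x 2 x i64> %b
    }
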