diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2766,6 +2766,11 @@
     return false;
   }
 
+  virtual bool isInsertSubvectorLegal(EVT ResVT, EVT SrcVT,
+                                      unsigned Index) const {
+    return true;
+  }
+
   /// Try to convert an extract element of a vector binary operation into an
   /// extract element followed by a scalar operation.
   virtual bool shouldScalarizeBinop(SDValue VecOp) const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18306,7 +18306,7 @@
 
 /// Convert a disguised subvector insertion into a shuffle:
 SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
   assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT &&
-         "Expected extract_vector_elt");
+         "Expected insert_vector_elt");
   SDValue InsertVal = N->getOperand(1);
   SDValue Vec = N->getOperand(0);
@@ -18468,15 +18468,11 @@
     return SDValue();
   }
 
-  if (VT.isScalableVector())
-    return SDValue();
-
-  unsigned NumElts = VT.getVectorNumElements();
-
   // We must know which element is being inserted for folds below here.
   unsigned Elt = IndexC->getZExtValue();
-  if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
-    return Shuf;
+  if (VT.isFixedLengthVector())
+    if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
+      return Shuf;
 
   // Canonicalize insert_vector_elt dag nodes.
   // Example:
@@ -18502,6 +18498,42 @@
   if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
     return SDValue();
 
+  unsigned NumElts = VT.getVectorMinNumElements();
+  bool ScalableOut = false;
+  EVT FixedVT = VT;
+  if (VT.isScalableVector()) {
+    // BUILD_VECTOR does not currently support scalable vectors. Insert into
+    // BUILD_VECTOR through an INSERT_SUBVECTOR. The motivation for this is to
+    // allow conversions from a fixed-length vector to a scalable vector to
+    // become a no-op where the fixed-length vector originates from a
+    // subregister of the scalable register.
+    FixedVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), NumElts);
+
+    // Can't make an insert_subvector of this.
+    if (!TLI.isInsertSubvectorLegal(VT, FixedVT, 0) ||
+        !TLI.isTypeLegal(FixedVT) || !TLI.isTypeLegal(VT) ||
+        !TLI.isOperationLegalOrCustom(ISD::INSERT_SUBVECTOR, VT))
+      return SDValue();
+
+    if (InVec.isUndef()) {
+      // InVec was undef.
+      InVec = DAG.getUNDEF(FixedVT);
+    } else if (InVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+               InVec.getOperand(0).isUndef() &&
+               InVec.getOperand(1).getOpcode() == ISD::BUILD_VECTOR &&
+               InVec.getOperand(1).hasOneUse() &&
+               InVec.getConstantOperandVal(2) == 0 &&
+               Elt < InVec.getOperand(1)
+                         .getValueType()
+                         .getVectorMinNumElements()) {
+      // InVec was (insert_subvector undef (build_vector {...}) 0).
+      InVec = InVec.getOperand(1);
+      // InVec is now (build_vector {...}).
+    }
+    // Below, the insertion is treated as if it targets a FixedVT BUILD_VECTOR.
+    ScalableOut = true;
+  }
+
   // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
   // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
   // vector elements.
@@ -18526,8 +18558,16 @@
     Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal;
   }
 
-  // Return the new vector
-  return DAG.getBuildVector(VT, DL, Ops);
+  SDValue Ret = DAG.getBuildVector(FixedVT, DL, Ops);
+  if (ScalableOut) {
+    // There is no scalable build_vector, so use
+    // (insert_subvector undef (build_vector {...}) 0) instead.
+    SDValue Zero =
+        DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()));
+    Ret =
+        DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Ret, Zero);
+  }
+  return Ret;
 }
 
 SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -639,6 +639,17 @@
   bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                unsigned Index) const override;
 
+  bool isInsertSubvectorLegal(EVT ResVT, EVT SrcVT,
+                              unsigned Index) const override {
+    if (ResVT.isScalableVector() && SrcVT.isFixedLengthVector()) {
+      // A fixed-length insert into a scalable vector is only legal if the
+      // scalable result occupies a full vector granule.
+      return ResVT.getSizeInBits().getKnownMinSize() ==
+             AArch64::SVEBitsPerBlock;
+    }
+    return true;
+  }
+
   bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
                             bool MathUsed) const override {
     // Using overflow ops for overflow checks only should beneficial on
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -418,26 +418,51 @@
 
 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
                                                  IntrinsicInst &II) {
-  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
-  if (!Pg)
-    return None;
+  assert(II.getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
+         "Expected SVE DUP!");
 
-  if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
-    return None;
+  auto *Ty = cast<ScalableVectorType>(II.getType());
+  assert(Ty && "non-scalable result of scalable intrinsic");
 
-  const auto PTruePattern =
-      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
-  if (PTruePattern != AArch64SVEPredPattern::vl1)
+  // Only consider dup producing <vscale x 2 x double>.
+  if (!Ty || !Ty->getScalarType()->isDoubleTy() || Ty->getMinNumElements() != 2)
     return None;
 
-  // The intrinsic is inserting into lane zero so use an insert instead.
+  LLVMContext &Ctx = II.getContext();
+  IRBuilder<> Builder(Ctx);
+  Builder.SetInsertPoint(&II);
+
   auto *IdxTy = Type::getInt64Ty(II.getContext());
-  auto *Insert = InsertElementInst::Create(
-      II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
-  Insert->insertBefore(&II);
-  Insert->takeName(&II);
-
-  return IC.replaceInstUsesWith(II, Insert);
+  auto m_PTrue = [](auto Pat) {
+    return m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(Pat);
+  };
+  auto m_Dup = [](auto PassThru, auto Pg, auto Val) {
+    return m_Intrinsic<Intrinsic::aarch64_sve_dup>(PassThru, Pg, Val);
+  };
+
+  auto VL2 = m_PTrue(m_SpecificInt(AArch64SVEPredPattern::vl2));
+  auto VL1 = m_PTrue(m_SpecificInt(AArch64SVEPredPattern::vl1));
+  Value *Out = nullptr, *PassThru, *Elem0, *Elem1;
+  if (match(&II, m_Dup(m_Value(PassThru), VL1, m_Value(Elem0)))) {
+    // (dup vec VL1 elem0) => (insertelement vec elem0 0)
+    Out = PassThru;
+    Out = Builder.CreateInsertElement(PassThru, Elem0,
+                                      ConstantInt::get(IdxTy, 0));
+  } else if (match(&II, m_Dup(m_Dup(m_Value(PassThru), VL2, m_Value(Elem1)),
+                              VL1, m_Value(Elem0)))) {
+    // (dup (dup vec VL2 elem1) VL1 elem0) =>
+    //   (insertelement (insertelement vec elem1 1) elem0 0)
+    Out = PassThru;
+    Out = Builder.CreateInsertElement(PassThru, Elem1,
+                                      ConstantInt::get(IdxTy, 1));
+    Out = Builder.CreateInsertElement(Out, Elem0,
+                                      ConstantInt::get(IdxTy, 0));
+  }
+
+  if (!Out)
+    return None;
+  Out->takeName(&II);
+  return IC.replaceInstUsesWith(II, Out);
 }
 
 static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-insert-elt.ll b/llvm/test/CodeGen/AArch64/dag-combine-insert-elt.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dag-combine-insert-elt.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+attributes #0 = {"target-features"="+sve"}
+
+define <vscale x 2 x double> @two_inserts(<vscale x 2 x double> %in) #0 {
+; CHECK-LABEL: two_inserts:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov v0.2d, #1.00000000
+; CHECK-NEXT:    ret
+  %b = insertelement <vscale x 2 x double> undef, double 0.0, i32 0
+  %a = insertelement <vscale x 2 x double> undef, double 1.0, i32 1
+  ret <vscale x 2 x double> %a
+}
+
+
+define <vscale x 2 x double> @neon_to_sve(<2 x double> %in) #0 {
+; CHECK-LABEL: neon_to_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ret
+  %e0 = extractelement <2 x double> %in, i32 0
+  %e1 = extractelement <2 x double> %in, i32 1
+  %b = insertelement <vscale x 2 x double> undef, double %e0, i32 0
+  %a = insertelement <vscale x 2 x double> undef, double %e1, i32 1
+  ret <vscale x 2 x double> %a
+}
+
+; Function Attrs: nofree norecurse nosync nounwind readnone uwtable willreturn mustprogress vscale_range(4,4)
+define dso_local <vscale x 4 x float> @float32x4_t_to_svfloat32_t(<4 x float> %nv) local_unnamed_addr #0 {
+; CHECK-LABEL: float32x4_t_to_svfloat32_t:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ret
+entry:
+  %lane0_floats = shufflevector <4 x float> %nv, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %lane1_floats = shufflevector <4 x float> %nv, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+
+  %0 = bitcast <2 x float> %lane0_floats to <1 x double>
+  %1 = bitcast <2 x float> %lane1_floats to <1 x double>
+
+  %lane0 = extractelement <1 x double> %0, i32 0
+  %lane1 = extractelement <1 x double> %1, i32 0
+
+  %b = insertelement <vscale x 2 x double> undef, double %lane1, i32 1
+  %a = insertelement <vscale x 2 x double> %b, double %lane0, i32 0
+
+  %out = bitcast <vscale x 2 x double> %a to <vscale x 4 x float>
+  ret <vscale x 4 x float> %out
+}
+
+declare <vscale x 2 x double> @llvm.experimental.vector.insert.nxv2f64.v2f64(<vscale x 2 x double>, <2 x double>, i64 immarg)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-element.ll b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
--- a/llvm/test/CodeGen/AArch64/sve-insert-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
@@ -12,6 +12,7 @@
   ret <vscale x 16 x i8> %b
 }
 
+
 define <vscale x 8 x i16> @test_lane0_8xi16(<vscale x 8 x i16> %a) {
 ; CHECK-LABEL: test_lane0_8xi16:
 ; CHECK:       // %bb.0:
@@ -56,6 +57,58 @@
   ret <vscale x 2 x double> %b
 }
 
+define <vscale x 2 x double> @test_lane01_2xf64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: test_lane01_2xf64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov x9, #4636737291354636288
+; CHECK-NEXT:    mov x10, #70368744177664
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.d, vl1
+; CHECK-NEXT:    movk x10, #16473, lsl #48
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    fmov d3, x9
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z1.d, z2.d
+; CHECK-NEXT:    mov z0.d, p1/m, z3.d
+; CHECK-NEXT:    fmov d1, x10
+; CHECK-NEXT:    mov z0.d, p0/m, d1
+; CHECK-NEXT:    ret
+  %b = insertelement <vscale x 2 x double> %a, double 101.0, i32 1
+  %c = insertelement <vscale x 2 x double> %b, double 100.0, i32 0
+  ret <vscale x 2 x double> %c
+}
+
+define <vscale x 2 x double> @test_lane012_2xf64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: test_lane012_2xf64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #4636737291354636288
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    ptrue p1.d, vl1
+; CHECK-NEXT:    index z2.d, #0, #1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z0.d, p1/m, z1.d
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z2.d, z1.d
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    mov x8, #70368744177664
+; CHECK-NEXT:    movk x8, #16473, lsl #48
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z2.d, z1.d
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    mov x8, #140737488355328
+; CHECK-NEXT:    movk x8, #16473, lsl #48
+; CHECK-NEXT:    mov z0.d, p0/m, d1
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    mov z0.d, p1/m, d1
+; CHECK-NEXT:    ret
+  %b = insertelement <vscale x 2 x double> %a, double 102.0, i32 2
+  %c = insertelement <vscale x 2 x double> %b, double 101.0, i32 1
+  %d = insertelement <vscale x 2 x double> %c, double 100.0, i32 0
+  ret <vscale x 2 x double> %d
+}
+
 define <vscale x 4 x float> @test_lane0_4xf32(<vscale x 4 x float> %a) {
 ; CHECK-LABEL: test_lane0_4xf32:
 ; CHECK:       // %bb.0:
@@ -155,12 +208,7 @@
 define <vscale x 8 x i16> @test_lane6_undef_8xi16(i16 %a) {
 ; CHECK-LABEL: test_lane6_undef_8xi16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #6
-; CHECK-NEXT:    index z0.h, #0, #1
-; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p0/m, w0
+; CHECK-NEXT:    dup v0.8h, w0
 ; CHECK-NEXT:    ret
   %b = insertelement <vscale x 8 x i16> undef, i16 %a, i32 6
   ret <vscale x 8 x i16> %b
diff --git a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
--- a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
@@ -8,10 +8,9 @@
 define <vscale x 4 x i32> @test_post_ld1_insert(i32* %a, i32** %ptr, i64 %inc) {
 ; CHECK-LABEL: test_post_ld1_insert:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    add x9, x0, x2, lsl #2
-; CHECK-NEXT:    str x9, [x1]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    add x8, x0, x2, lsl #2
+; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    ret
   %load = load i32, i32* %a
   %ins = insertelement <vscale x 4 x i32> undef, i32 %load, i32 0
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll
@@ -1,11 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -instcombine < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
 define <vscale x 16 x i8> @dup_insertelement_0(<vscale x 16 x i8> %v, i8 %s) #0 {
 ; CHECK-LABEL: @dup_insertelement_0(
-; CHECK: %insert = insertelement <vscale x 16 x i8> %v, i8 %s, i64 0
-; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+; CHECK-NEXT:    [[INSERT:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[V:%.*]], <vscale x 16 x i1> [[PG]], i8 [[S:%.*]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[INSERT]]
+;
   %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
   %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
   ret <vscale x 16 x i8> %insert
@@ -13,9 +16,10 @@
 
 define <vscale x 16 x i8> @dup_insertelement_1(<vscale x 16 x i8> %v, i8 %s) #0 {
 ; CHECK-LABEL: @dup_insertelement_1(
-; CHECK: %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
-; CHECK-NEXT: %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
-; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+; CHECK-NEXT:    [[INSERT:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[V:%.*]], <vscale x 16 x i1> [[PG]], i8 [[S:%.*]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[INSERT]]
+;
   %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
   %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
   ret <vscale x 16 x i8> %insert
@@ -23,16 +27,19 @@
 
 define <vscale x 16 x i8> @dup_insertelement_x(<vscale x 16 x i8> %v, i8 %s, <vscale x 16 x i1> %pg) #0 {
 ; CHECK-LABEL: @dup_insertelement_x(
-; CHECK: %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
-; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+; CHECK-NEXT:    [[INSERT:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[V:%.*]], <vscale x 16 x i1> [[PG:%.*]], i8 [[S:%.*]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[INSERT]]
+;
   %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
   ret <vscale x 16 x i8> %insert
 }
 
 define <vscale x 8 x i16> @dup_insertelement_0_convert(<vscale x 8 x i16> %v, i16 %s) #0 {
 ; CHECK-LABEL: @dup_insertelement_0_convert(
-; CHECK: %insert = insertelement <vscale x 8 x i16> %v, i16 %s, i64 0
-; CHECK-NEXT: ret <vscale x 8 x i16> %insert
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
+; CHECK-NEXT:    [[INSERT:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> [[V:%.*]], <vscale x 8 x i1> [[PG]], i16 [[S:%.*]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[INSERT]]
+;
   %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg)
   %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
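
Illustrative sketch (not part of the patch; node spellings abbreviated): the new DAGCombiner path rewrites a chain of scalable INSERT_VECTOR_ELT nodes at constant indices into a fixed-length BUILD_VECTOR placed at index 0 of the scalable result, roughly

    insert_vector_elt (insert_vector_elt undef:nxv2f64, f64 %x, 0), f64 %y, 1
      -->
    insert_subvector undef:nxv2f64, (build_vector f64 %x, f64 %y):v2f64, 0

On AArch64 the v2f64 BUILD_VECTOR materialises in a NEON q register, which aliases the low 128 bits of the corresponding SVE z register, so the insert_subvector at index 0 lowers to a no-op subregister copy; that is what the `// kill: def $q0 killed $q0 def $z0` lines in the new tests show. The AArch64 isInsertSubvectorLegal override limits this to the case where the scalable result's minimum size is exactly one 128-bit granule.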