diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1268,10 +1268,10 @@
 def SVPTRUE_PAT : SInst<"svptrue_pat_{d}", "PI", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue">;
 def SVPTRUE     : SInst<"svptrue_{d}", "P", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsAppendSVALL]>;
 
-def SVDUPQ_B8  : SInst<"svdupq[_n]_{d}", "Pssssssssssssssss", "Pc", MergeNone>;
-def SVDUPQ_B16 : SInst<"svdupq[_n]_{d}", "Pssssssss", "Ps", MergeNone>;
-def SVDUPQ_B32 : SInst<"svdupq[_n]_{d}", "Pssss", "Pi", MergeNone>;
-def SVDUPQ_B64 : SInst<"svdupq[_n]_{d}", "Pss", "Pl", MergeNone>;
+def SVDUPQ_B8  : SInst<"svdupq[_n]_{d}", "Pssssssssssssssss", "Pc", MergeNone, "aarch64_sve_dupq_b8">;
+def SVDUPQ_B16 : SInst<"svdupq[_n]_{d}", "Pssssssss", "Ps", MergeNone, "aarch64_sve_dupq_b16">;
+def SVDUPQ_B32 : SInst<"svdupq[_n]_{d}", "Pssss", "Pi", MergeNone, "aarch64_sve_dupq_b32">;
+def SVDUPQ_B64 : SInst<"svdupq[_n]_{d}", "Pss", "Pl", MergeNone, "aarch64_sve_dupq_b64">;
 
 def SVDUP_N_B : SInst<"svdup[_n]_{d}", "Ps", "PcPsPiPl", MergeNone>;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -8956,8 +8956,12 @@
     Ops[1] = Builder.CreateCall(Sel, {Ops[0], Ops[1], SplatZero});
   }
 
-  Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic,
-                                 getSVEOverloadTypes(TypeFlags, Ty, Ops));
+  Function *F;
+  if (Intrinsic::isOverloaded(Builtin->LLVMIntrinsic))
+    F = CGM.getIntrinsic(Builtin->LLVMIntrinsic,
+                         getSVEOverloadTypes(TypeFlags, Ty, Ops));
+  else
+    F = CGM.getIntrinsic(Builtin->LLVMIntrinsic);
   Value *Call = Builder.CreateCall(F, Ops);
 
   // Predicate results must be converted to svbool_t.
@@ -9031,10 +9035,6 @@
     return EmitSVEPredicateCast(Dup, cast<llvm::ScalableVectorType>(Ty));
   }
 
-  case SVE::BI__builtin_sve_svdupq_n_b8:
-  case SVE::BI__builtin_sve_svdupq_n_b16:
-  case SVE::BI__builtin_sve_svdupq_n_b32:
-  case SVE::BI__builtin_sve_svdupq_n_b64:
   case SVE::BI__builtin_sve_svdupq_n_u8:
   case SVE::BI__builtin_sve_svdupq_n_s8:
   case SVE::BI__builtin_sve_svdupq_n_u64:
@@ -9051,21 +9051,13 @@
     // ld1rq to materialize a vector.
     unsigned NumOpnds = Ops.size();
 
-    bool IsBoolTy =
-        cast<llvm::ScalableVectorType>(Ty)->getElementType()->isIntegerTy(1);
-
-    // For svdupq_n_b* the element type of is an integer of type 128/numelts,
-    // so that the compare can use the width that is natural for the expected
-    // number of predicate lanes.
     llvm::Type *EltTy = Ops[0]->getType();
-    if (IsBoolTy)
-      EltTy = IntegerType::get(getLLVMContext(), SVEBitsPerBlock / NumOpnds);
 
     Address Alloca = CreateTempAlloca(llvm::ArrayType::get(EltTy, NumOpnds),
                                       CharUnits::fromQuantity(16));
     for (unsigned I = 0; I < NumOpnds; ++I)
       Builder.CreateDefaultAlignedStore(
-          IsBoolTy ? Builder.CreateZExt(Ops[I], EltTy) : Ops[I],
+          Ops[I],
           Builder.CreateGEP(Alloca.getElementType(), Alloca.getPointer(),
                             {Builder.getInt64(0), Builder.getInt64(I)}));
 
@@ -9077,18 +9069,7 @@
     Value *Alloca0 = Builder.CreateGEP(
         Alloca.getElementType(), Alloca.getPointer(),
         {Builder.getInt64(0), Builder.getInt64(0)});
-    Value *LD1RQ = Builder.CreateCall(F, {Pred, Alloca0});
-
-    if (!IsBoolTy)
-      return LD1RQ;
-
-    // For svdupq_n_b* we need to add an additional 'cmpne' with '0'.
-    F = CGM.getIntrinsic(NumOpnds == 2 ?
Intrinsic::aarch64_sve_cmpne - : Intrinsic::aarch64_sve_cmpne_wide, - OverloadedTy); - Value *Call = - Builder.CreateCall(F, {Pred, LD1RQ, EmitSVEDupX(Builder.getInt64(0))}); - return EmitSVEPredicateCast(Call, cast(Ty)); + return Builder.CreateCall(F, {Pred, Alloca0}); } case SVE::BI__builtin_sve_svpfalse_b: diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c @@ -289,20 +289,8 @@ bool x12, bool x13, bool x14, bool x15) { // CHECK-LABEL: test_svdupq_n_b8 - // CHECK-DAG: %[[ALLOCA:.*]] = alloca [16 x i8], align 16 - // CHECK-DAG: %[[X0:.*]] = zext i1 %x0 to i8 - // CHECK-DAG: %[[X15:.*]] = zext i1 %x15 to i8 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store i8 %[[X0]], i8* %[[BASE]], align 16 - // - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[ALLOCA]], i64 0, i64 15 - // CHECK: store i8 %[[X15]], i8* %[[GEP]], align 1 - // CHECK-NOT: store - // CHECK: %[[PTRUE:.*]] = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv16i8( %[[PTRUE]], i8* nonnull %[[BASE]]) - // CHECK: %[[ZERO:.*]] = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) - // CHECK: %[[CMP:.*]] = call @llvm.aarch64.sve.cmpne.wide.nxv16i8( %[[PTRUE]], %[[LOAD]], %[[ZERO]]) - // CHECK: ret %[[CMP]] + // CHECK: %[[DUP:.*]] = call @llvm.aarch64.sve.dupq.b8(i1 %x0, i1 %x1, i1 %x2, i1 %x3, i1 %x4, i1 %x5, i1 %x6, i1 %x7, i1 %x8, i1 %x9, i1 %x10, i1 %x11, i1 %x12, i1 %x13, i1 %x14, i1 %x15) + // CHECK: ret %[[DUP]] return SVE_ACLE_FUNC(svdupq,_n,_b8,)(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); } @@ -310,20 +298,8 @@ bool x4, bool x5, bool x6, bool x7) { // CHECK-LABEL: test_svdupq_n_b16 - // CHECK-DAG: %[[ALLOCA:.*]] = alloca [8 x i16], align 16 - // CHECK-DAG: %[[X0:.*]] = zext i1 %x0 to i16 - // CHECK-DAG: %[[X7:.*]] = zext i1 %x7 to i16 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [8 x i16], [8 x i16]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store i16 %[[X0]], i16* %[[BASE]], align 16 - // - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [8 x i16], [8 x i16]* %[[ALLOCA]], i64 0, i64 7 - // CHECK: store i16 %[[X7]], i16* %[[GEP]], align 2 - // CHECK-NOT: store - // CHECK: %[[PTRUE:.*]] = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv8i16( %{{.*}}, i16* nonnull %[[BASE]]) - // CHECK: %[[ZERO:.*]] = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) - // CHECK: %[[CMP:.*]] = call @llvm.aarch64.sve.cmpne.wide.nxv8i16( %{{.*}}, %[[LOAD]], %[[ZERO]]) - // CHECK: %[[CAST:.*]] = call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %[[CMP]]) + // CHECK: %[[DUP:.*]] = call @llvm.aarch64.sve.dupq.b16(i1 %x0, i1 %x1, i1 %x2, i1 %x3, i1 %x4, i1 %x5, i1 %x6, i1 %x7) + // CHECK: %[[CAST:.*]] = call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %[[DUP]]) // CHECK: ret %[[CAST]] return SVE_ACLE_FUNC(svdupq,_n,_b16,)(x0, x1, x2, x3, x4, x5, x6, x7); } @@ -331,20 +307,8 @@ svbool_t test_svdupq_n_b32(bool x0, bool x1, bool x2, bool x3) { // CHECK-LABEL: test_svdupq_n_b32 - // CHECK-DAG: %[[ALLOCA:.*]] = alloca [4 x i32], align 16 - // CHECK-DAG: %[[X0:.*]] = zext i1 %x0 to i32 - // CHECK-DAG: %[[X3:.*]] = zext i1 %x3 to i32 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [4 x i32], [4 x i32]* %[[ALLOCA]], 
i64 0, i64 0 - // CHECK-DAG: store i32 %[[X0]], i32* %[[BASE]], align 16 - // - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [4 x i32], [4 x i32]* %[[ALLOCA]], i64 0, i64 3 - // CHECK: store i32 %[[X3]], i32* %[[GEP]], align 4 - // CHECK-NOT: store - // CHECK: %[[PTRUE:.*]] = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv4i32( %{{.*}}, i32* nonnull %[[BASE]]) - // CHECK: %[[ZERO:.*]] = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) - // CHECK: %[[INTRINSIC:.*]] = call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %{{.*}}, %[[LOAD]], %[[ZERO]]) - // CHECK: %[[CAST:.*]] = call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %[[INTRINSIC]]) + // CHECK: %[[DUP:.*]] = call @llvm.aarch64.sve.dupq.b32(i1 %x0, i1 %x1, i1 %x2, i1 %x3) + // CHECK: %[[CAST:.*]] = call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %[[DUP]]) // CHECK: ret %[[CAST]] return SVE_ACLE_FUNC(svdupq,_n,_b32,)(x0, x1, x2, x3); } @@ -352,19 +316,8 @@ svbool_t test_svdupq_n_b64(bool x0, bool x1) { // CHECK-LABEL: test_svdupq_n_b64 - // CHECK-DAG: %[[ALLOCA:.*]] = alloca [2 x i64], align 16 - // CHECK-DAG: %[[X0:.*]] = zext i1 %x0 to i64 - // CHECK-DAG: %[[X1:.*]] = zext i1 %x1 to i64 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store i64 %[[X0]], i64* %[[BASE]], align 16 - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 1 - // CHECK: store i64 %[[X1]], i64* %[[GEP]], align 8 - // CHECK-NOT: store - // CHECK: %[[PTRUE:.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv2i64( %{{.*}}, i64* nonnull %[[BASE]]) - // CHECK: %[[ZERO:.*]] = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) - // CHECK: %[[INTRINSIC:.*]] = call @llvm.aarch64.sve.cmpne.nxv2i64( %{{.*}}, %[[LOAD]], %[[ZERO]]) - // CHECK: %[[CAST:.*]] = call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %[[INTRINSIC]]) + // CHECK: %[[DUP:.*]] = call @llvm.aarch64.sve.dupq.b64(i1 %x0, i1 %x1) + // CHECK: %[[CAST:.*]] = call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %[[DUP]]) // CHECK: ret %[[CAST]] return SVE_ACLE_FUNC(svdupq,_n,_b64,)(x0, x1); } diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1150,6 +1150,30 @@ : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMVectorElementType<0>], [IntrNoMem]>; + class AdvSIMD_SVE_DUPQ_B64_Intrinsic + : DefaultAttrsIntrinsic<[llvm_nxv2i1_ty], + [llvm_i1_ty, llvm_i1_ty], + [IntrNoMem]>; + + class AdvSIMD_SVE_DUPQ_B32_Intrinsic + : DefaultAttrsIntrinsic<[llvm_nxv4i1_ty], + [llvm_i1_ty, llvm_i1_ty, llvm_i1_ty, llvm_i1_ty], + [IntrNoMem]>; + + class AdvSIMD_SVE_DUPQ_B16_Intrinsic + : DefaultAttrsIntrinsic<[llvm_nxv8i1_ty], + [llvm_i1_ty, llvm_i1_ty, llvm_i1_ty, llvm_i1_ty, + llvm_i1_ty, llvm_i1_ty, llvm_i1_ty, llvm_i1_ty], + [IntrNoMem]>; + + class AdvSIMD_SVE_DUPQ_B8_Intrinsic + : DefaultAttrsIntrinsic<[llvm_nxv16i1_ty], + [llvm_i1_ty, llvm_i1_ty, llvm_i1_ty, llvm_i1_ty, + llvm_i1_ty, llvm_i1_ty, llvm_i1_ty, llvm_i1_ty, + llvm_i1_ty, llvm_i1_ty, llvm_i1_ty, llvm_i1_ty, + llvm_i1_ty, llvm_i1_ty, llvm_i1_ty, llvm_i1_ty], + [IntrNoMem]>; + class AdvSIMD_SVE_DUPQ_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, @@ -1591,8 +1615,12 @@ // Scalar to vector operations // -def int_aarch64_sve_dup : AdvSIMD_SVE_DUP_Intrinsic; -def int_aarch64_sve_dup_x : 
AdvSIMD_SVE_DUP_Unpred_Intrinsic;
+def int_aarch64_sve_dup      : AdvSIMD_SVE_DUP_Intrinsic;
+def int_aarch64_sve_dup_x    : AdvSIMD_SVE_DUP_Unpred_Intrinsic;
+def int_aarch64_sve_dupq_b64 : AdvSIMD_SVE_DUPQ_B64_Intrinsic;
+def int_aarch64_sve_dupq_b32 : AdvSIMD_SVE_DUPQ_B32_Intrinsic;
+def int_aarch64_sve_dupq_b16 : AdvSIMD_SVE_DUPQ_B16_Intrinsic;
+def int_aarch64_sve_dupq_b8  : AdvSIMD_SVE_DUPQ_B8_Intrinsic;
 
 def int_aarch64_sve_index : AdvSIMD_SVE_Index_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -943,6 +943,7 @@
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDUPQ(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp,
                               bool OverrideNEON = false) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1257,6 +1257,7 @@
       setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom);
 
+      setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
       setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
       setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
@@ -3899,6 +3900,11 @@
     return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
                        Op.getOperand(2), Data, Op.getOperand(1));
   }
+  case Intrinsic::aarch64_sve_dupq_b64:
+  case Intrinsic::aarch64_sve_dupq_b32:
+  case Intrinsic::aarch64_sve_dupq_b16:
+  case Intrinsic::aarch64_sve_dupq_b8:
+    return LowerDUPQ(Op, DAG);
   case Intrinsic::aarch64_sve_dupq_lane:
     return LowerDUPQLane(Op, DAG);
   case Intrinsic::aarch64_sve_convert_from_svbool:
@@ -9244,6 +9250,62 @@
   return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
 }
 
+SDValue AArch64TargetLowering::LowerDUPQ(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+
+  unsigned BitWidth = AArch64::SVEBitsPerBlock / (Op.getNumOperands() - 1);
+
+  EVT VT = Op.getValueType();
+  EVT EltVT = MVT::getIntegerVT(BitWidth);
+  EVT CmpVT = VT.changeVectorElementType(EltVT);
+
+  SmallVector<Constant *, 16> ConstantVec;
+  SmallVector<unsigned, 16> NonConstantOps;
+  for (unsigned I = 1; I < Op.getNumOperands(); ++I) {
+    auto *Operand = dyn_cast<ConstantSDNode>(Op.getOperand(I));
+    if (!Operand)
+      NonConstantOps.push_back(I - 1);
+    unsigned Value = Operand ? Operand->getZExtValue() : 0;
+    ConstantVec.push_back(Constant::getIntegerValue(
+        EltVT.getTypeForEVT(*DAG.getContext()), APInt(BitWidth, Value)));
+  }
+  Constant *C = ConstantVector::get(ConstantVec);
+  SDValue CPool = DAG.getConstantPool(C, MVT::i64, Align(4));
+
+  SDValue Chain = DAG.getEntryNode();
+  SDValue Pred = getPTrue(DAG, DL, VT, AArch64SVEPredPattern::all);
+
+  SDValue CmpOp;
+  if (NonConstantOps.empty()) {
+    CmpOp = DAG.getNode(AArch64ISD::LD1RQ_MERGE_ZERO, DL, {CmpVT, MVT::Other},
+                        {Chain, Pred, CPool});
+  } else {
+    EVT LoadVT =
+        EVT::getVectorVT(*DAG.getContext(), EltVT, Op.getNumOperands() - 1);
+    SDValue Load = DAG.getLoad(LoadVT, DL, Chain, CPool, MachinePointerInfo());
+    for (unsigned I = 0; I < NonConstantOps.size(); ++I) {
+      unsigned Idx = NonConstantOps[I];
+      SDValue Operand =
+          DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Op.getOperand(Idx + 1));
+      Load = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, LoadVT, Load, Operand,
+                         DAG.getConstant(Idx, DL, MVT::i64));
+    }
+    SDValue InsSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::nxv2i64,
+                                    DAG.getUNDEF(MVT::nxv2i64), Load,
+                                    DAG.getConstant(0, DL, MVT::i64));
+    SDNode *Dup =
+        DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, InsSubVec,
+                           DAG.getTargetConstant(0, DL, MVT::i64));
+    CmpOp = DAG.getNode(ISD::BITCAST, DL, CmpVT, SDValue(Dup, 0));
+  }
+
+  SDValue Splat =
+      DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, DAG.getConstant(0, DL, EltVT));
+
+  return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred, CmpOp, Splat,
+                     DAG.getCondCode(ISD::SETNE));
+}
+
 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
                                              SelectionDAG &DAG) const {
   SDLoc DL(Op);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -390,6 +390,60 @@
   return IC.replaceInstUsesWith(II, Insert);
 }
 
+static Optional<Instruction *> instCombineSVEDupQ(InstCombiner &IC,
+                                                  IntrinsicInst &II) {
+  LLVMContext &Ctx = II.getContext();
+  IRBuilder<> Builder(Ctx);
+  Builder.SetInsertPoint(&II);
+
+  unsigned NumOperands = II.getNumArgOperands();
+  unsigned PredicateBits = 0;
+
+  // Expand intrinsic operands to a 16-bit byte level predicate
+  for (unsigned I = 0; I < NumOperands; ++I) {
+    auto *Arg = dyn_cast<ConstantInt>(II.getArgOperand(I));
+    if (!Arg)
+      return None;
+    if (Arg->getZExtValue() != 0)
+      PredicateBits |= 1 << (I * (16 / NumOperands));
+  }
+
+  // If all bits are zero bail early with an empty predicate
+  if (PredicateBits == 0) {
+    auto *PFalse = Constant::getNullValue(II.getType());
+    PFalse->takeName(&II);
+    return IC.replaceInstUsesWith(II, PFalse);
+  }
+
+  // Calculate largest predicate type used (where byte predicate is largest)
+  unsigned Mask = 8;
+  for (unsigned I = 0; I < 16; ++I)
+    if ((PredicateBits & (1 << I)) != 0)
+      Mask |= (I % 8);
+
+  unsigned PredSize = Mask & -Mask;
+  auto *PredType = ScalableVectorType::get(
+      Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
+
+  // Ensure all relevant bits are set
+  for (unsigned I = 0; I < 16; I += PredSize)
+    if ((PredicateBits & (1 << I)) == 0)
+      return None;
+
+  auto *PTruePat =
+      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
+  auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
+                                        {PredType}, {PTruePat});
+  auto *ConvertToSVBool = Builder.CreateIntrinsic(
+      Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
+  auto *ConvertFromSVBool =
Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, + {II.getType()}, {ConvertToSVBool}); + + ConvertFromSVBool->takeName(&II); + return IC.replaceInstUsesWith(II, ConvertFromSVBool); +} + static Optional instCombineSVELast(InstCombiner &IC, IntrinsicInst &II) { Value *Pg = II.getArgOperand(0); @@ -498,6 +552,11 @@ return instCombineConvertFromSVBool(IC, II); case Intrinsic::aarch64_sve_dup: return instCombineSVEDup(IC, II); + case Intrinsic::aarch64_sve_dupq_b64: + case Intrinsic::aarch64_sve_dupq_b32: + case Intrinsic::aarch64_sve_dupq_b16: + case Intrinsic::aarch64_sve_dupq_b8: + return instCombineSVEDupQ(IC, II); case Intrinsic::aarch64_sve_rdffr: return instCombineRDFFR(IC, II); case Intrinsic::aarch64_sve_lasta: diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-dupq.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-dupq.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-dupq.ll @@ -0,0 +1,188 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -mattr=+sve -asm-verbose=0 < %s | FileCheck %s + +define @dupq_b8() { +; CHECK-LABEL: LCPI0_0: +; CHECK: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 1 +; CHECK-LABEL: dupq_b8: +; CHECK: adrp x8, .LCPI0_0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_0 +; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x8] +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.dupq.b8(i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, + i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true) + ret %out +} + +define @dupq_b16() { +; CHECK-LABEL: LCPI1_0: +; CHECK: .hword 0 +; CHECK-NEXT: .hword 0 +; CHECK-NEXT: .hword 1 +; CHECK-NEXT: .hword 1 +; CHECK-NEXT: .hword 1 +; CHECK-NEXT: .hword 0 +; CHECK-NEXT: .hword 0 +; CHECK-NEXT: .hword 0 +; CHECK-LABEL: dupq_b16: +; CHECK: adrp x8, .LCPI1_0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add x8, x8, :lo12:.LCPI1_0 +; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x8] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.dupq.b16(i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false) + ret %out +} + +define @dupq_b32() { +; CHECK-LABEL: LCPI2_0: +; CHECK: .word 0 +; CHECK-NEXT: .word 0 +; CHECK-NEXT: .word 1 +; CHECK-NEXT: .word 1 +; CHECK-LABEL: dupq_b32: +; CHECK: adrp x8, .LCPI2_0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add x8, x8, :lo12:.LCPI2_0 +; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x8] +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.dupq.b32(i1 false, i1 false, i1 true, i1 true) + ret %out +} + +define @dupq_b64() { +; CHECK-LABEL: LCPI3_0: +; CHECK: .xword 1 +; CHECK-NEXT: .xword 1 +; CHECK-LABEL: dupq_b64: +; CHECK: adrp x8, .LCPI3_0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, x8, :lo12:.LCPI3_0 +; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x8] +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.dupq.b64(i1 true, i1 true) + ret %out +} + +define @dupq_b8_nonconst(i1 %a, i1 %b) { +; CHECK-LABEL: LCPI4_0: +; CHECK: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; 
CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 0 +; CHECK-LABEL: dupq_b8_nonconst: +; CHECK: adrp x8, .LCPI4_0 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: and w9, w1, #0x1 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov v0.b[12], w8 +; CHECK-NEXT: mov v0.b[15], w9 +; CHECK-NEXT: mov z0.q, q0 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.dupq.b8(i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, + i1 false, i1 false, i1 false, i1 true, i1 %a, i1 false, i1 false, i1 %b) + ret %out +} + +define @dupq_b16_nonconst(i1 %a, i1 %b) { +; CHECK-LABEL: LCPI5_0: +; CHECK: .hword 0 +; CHECK-NEXT: .hword 0 +; CHECK-NEXT: .hword 1 +; CHECK-NEXT: .hword 0 +; CHECK-NEXT: .hword 1 +; CHECK-NEXT: .hword 0 +; CHECK-NEXT: .hword 0 +; CHECK-NEXT: .hword 0 +; CHECK-LABEL: dupq_b16_nonconst: +; CHECK: adrp x8, .LCPI5_0 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: and w9, w1, #0x1 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov v0.h[3], w9 +; CHECK-NEXT: mov z0.q, q0 +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.dupq.b16(i1 false, i1 %a, i1 true, i1 %b, i1 true, i1 false, i1 false, i1 false) + ret %out +} + +define @dupq_b32_nonconst(i1 %a, i1 %b) { +; CHECK-LABEL: LCPI6_0: +; CHECK: .word 0 +; CHECK-NEXT: .word 0 +; CHECK-NEXT: .word 1 +; CHECK-NEXT: .word 0 +; CHECK-LABEL: dupq_b32_nonconst: +; CHECK: adrp x8, .LCPI6_0 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: and w9, w1, #0x1 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: mov v0.s[3], w9 +; CHECK-NEXT: mov z0.q, q0 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.dupq.b32(i1 false, i1 %a, i1 true, i1 %b) + ret %out +} + +define @dupq_b64_nonconst(i1 %a) { +; CHECK-LABEL: LCPI7_0: +; CHECK: .xword 1 +; CHECK-NEXT: .xword 0 +; CHECK-LABEL: dupq_b64_nonconst: +; CHECK: adrp x8, .LCPI7_0 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI7_0] +; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov z0.q, q0 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.dupq.b64(i1 true, i1 %a) + ret %out +} + +declare @llvm.aarch64.sve.dupq.b8(i1, i1, i1, i1, i1, i1, i1, i1, i1, i1, i1, i1, i1, i1, i1, i1) +declare @llvm.aarch64.sve.dupq.b16(i1, i1, i1, i1, i1, i1, i1, i1) +declare @llvm.aarch64.sve.dupq.b32(i1, i1, i1, i1) +declare @llvm.aarch64.sve.dupq.b64(i1, i1) + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dupq.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dupq.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dupq.ll @@ -0,0 +1,195 @@ +; RUN: opt -S -instcombine < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; DUPQ b8 + +define @dupq_b_0() #0 { +; CHECK-LABEL: @dupq_b_0( +; CHECK: ret zeroinitializer + %1 = tail call @llvm.aarch64.sve.dupq.b8(i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, + i1 false, i1 false, i1 false, i1 false, i1 false, i1 
false, i1 false, i1 false) + ret %1 +} + +define @dupq_b_d() #0 { +; CHECK-LABEL: @dupq_b_d( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %1) +; CHECK-NEXT: ret %2 + %1 = tail call @llvm.aarch64.sve.dupq.b8(i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, + i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false) + ret %1 +} + +define @dupq_b_w() #0 { +; CHECK-LABEL: @dupq_b_w( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %1) +; CHECK-NEXT: ret %2 + %1 = tail call @llvm.aarch64.sve.dupq.b8(i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, + i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false) + ret %1 +} + +define @dupq_b_h() #0 { +; CHECK-LABEL: @dupq_b_h( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %1) +; CHECK-NEXT: ret %2 + %1 = tail call @llvm.aarch64.sve.dupq.b8(i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, + i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false) + ret %1 +} + +define @dupq_b_b() #0 { +; CHECK-LABEL: @dupq_b_b( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.dupq.b8(i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, + i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true) + ret %1 +} + +; DUPQ b16 + +define @dupq_h_0() #0 { +; CHECK-LABEL: @dupq_h_0( +; CHECK: ret zeroinitializer + %1 = tail call @llvm.aarch64.sve.dupq.b16(i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false) + ret %1 +} + +define @dupq_h_d() #0 { +; CHECK-LABEL: @dupq_h_d( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %1) +; CHECK-NEXT: %3 = call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( %2) +; CHECK-NEXT: ret %3 + %1 = tail call @llvm.aarch64.sve.dupq.b16(i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false) + ret %1 +} + +define @dupq_h_w() #0 { +; CHECK-LABEL: @dupq_h_w( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %1) +; CHECK-NEXT: %3 = call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( %2) +; CHECK-NEXT: ret %3 + %1 = tail call @llvm.aarch64.sve.dupq.b16(i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false) + ret %1 +} + +define @dupq_h_h() #0 { +; CHECK-LABEL: @dupq_h_h( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.dupq.b16(i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true) + ret %1 +} + +; DUPQ b32 + +define @dupq_w_0() #0 { +; CHECK-LABEL: @dupq_w_0( +; CHECK: ret zeroinitializer + %1 = tail call @llvm.aarch64.sve.dupq.b32(i1 false, i1 false, i1 false, i1 false) + ret %1 +} + +define @dupq_w_d() #0 { +; CHECK-LABEL: @dupq_w_d( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %1) +; CHECK-NEXT: %3 = call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %2) +; CHECK-NEXT: ret %3 + %1 = tail call @llvm.aarch64.sve.dupq.b32(i1 true, i1 false, i1 true, i1 false) + ret %1 +} + +define 
@dupq_w_w() #0 { +; CHECK-LABEL: @dupq_w_w( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.dupq.b32(i1 true, i1 true, i1 true, i1 true) + ret %1 +} + +; DUPQ b64 + +define @dupq_d_0() #0 { +; CHECK-LABEL: @dupq_d_0( +; CHECK: ret zeroinitializer + %1 = tail call @llvm.aarch64.sve.dupq.b64(i1 false, i1 false) + ret %1 +} +define @dupq_d_d() #0 { +; CHECK-LABEL: @dupq_d_d( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.dupq.b64(i1 true, i1 true) + ret %1 +} + +; Cases that cannot be converted + +define @dupq_neg1() #0 { +; CHECK-LABEL: @dupq_neg1( +; CHECK-NOT: ptrue +; CHECK-NOT: zeroinitializer +; CHECK: ret + %1 = tail call @llvm.aarch64.sve.dupq.b64(i1 true, i1 false) + ret %1 +} + +define @dupq_neg2() #0 { +; CHECK-LABEL: @dupq_neg2( +; CHECK-NOT: ptrue +; CHECK-NOT: zeroinitializer +; CHECK: ret + %1 = tail call @llvm.aarch64.sve.dupq.b32(i1 true, i1 false, i1 false, i1 true) + ret %1 +} + +define @dupq_neg3() #0 { +; CHECK-LABEL: @dupq_neg3( +; CHECK-NOT: ptrue +; CHECK-NOT: zeroinitializer +; CHECK: ret + %1 = tail call @llvm.aarch64.sve.dupq.b32(i1 false, i1 true, i1 false, i1 true) + ret %1 +} + +define @dupq_neg4() #0 { +; CHECK-LABEL: @dupq_neg4( +; CHECK-NOT: ptrue +; CHECK-NOT: zeroinitializer +; CHECK: ret + %1 = tail call @llvm.aarch64.sve.dupq.b32(i1 true, i1 true, i1 false, i1 false) + ret %1 +} + +define @dupq_neg5() #0 { +; CHECK-LABEL: @dupq_neg5( +; CHECK-NOT: ptrue +; CHECK-NOT: zeroinitializer +; CHECK: ret + %1 = tail call @llvm.aarch64.sve.dupq.b32(i1 false, i1 false, i1 false, i1 true) + ret %1 +} + +define @dupq_neg6(i1 %a) #0 { +; CHECK-LABEL: @dupq_neg6( +; CHECK-NOT: ptrue +; CHECK-NOT: zeroinitializer +; CHECK: ret + %1 = tail call @llvm.aarch64.sve.dupq.b32(i1 true, i1 true, i1 true, i1 %a) + ret %1 +} + +declare @llvm.aarch64.sve.dupq.b8(i1, i1, i1, i1, i1, i1, i1, i1, i1, i1, i1, i1, i1, i1, i1, i1) +declare @llvm.aarch64.sve.dupq.b16(i1, i1, i1, i1, i1, i1, i1, i1) +declare @llvm.aarch64.sve.dupq.b32(i1, i1, i1, i1) +declare @llvm.aarch64.sve.dupq.b64(i1, i1) + +attributes #0 = { "target-features"="+sve" }
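
Note (not part of the patch): a minimal C-level sketch of the source pattern this change targets, assuming SVE is enabled (e.g. -march=armv8-a+sve) and the patch is applied. An ACLE call such as the one below is expected to be emitted as a single llvm.aarch64.sve.dupq.b8 call (per the CGBuiltin.cpp and acle_sve_dupq.c hunks above); because every argument is constant and the set lanes repeat at 16-bit granularity, the InstCombine added in AArch64TargetTransformInfo.cpp should then fold it to a ptrue, as exercised by dupq_b_h in sve-intrinsic-opts-dupq.ll. The function name make_pred is illustrative only.

    #include <arm_sve.h>

    // All-constant predicate with every even byte lane set, i.e. the same
    // pattern as an 8-lane (16-bit element) ptrue.
    svbool_t make_pred(void) {
      return svdupq_n_b8(1, 0, 1, 0, 1, 0, 1, 0,
                         1, 0, 1, 0, 1, 0, 1, 0);
    }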