Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -118,7 +118,6 @@
   SmallVector<Type *, 4> ParamTys;
   SmallVector<const Value *, 4> Arguments;
   FastMathFlags FMF;
-  ElementCount VF = ElementCount::getFixed(1);
   // If ScalarizationCost is UINT_MAX, the cost of scalarizing the
   // arguments and the return value will be computed based on types.
   unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
@@ -155,7 +154,6 @@
   Intrinsic::ID getID() const { return IID; }
   const IntrinsicInst *getInst() const { return II; }
   Type *getReturnType() const { return RetTy; }
-  ElementCount getVectorFactor() const { return VF; }
   FastMathFlags getFlags() const { return FMF; }
   unsigned getScalarizationCost() const { return ScalarizationCost; }
   const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1210,12 +1210,9 @@
     Type *RetTy = ICA.getReturnType();
-    ElementCount VF = ICA.getVectorFactor();
     ElementCount RetVF =
         (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
                              : ElementCount::getFixed(1));
-    assert((RetVF.isScalar() || VF.isScalar()) &&
-           "VF > 1 and RetVF is a vector type");
     const IntrinsicInst *I = ICA.getInst();
     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
     FastMathFlags FMF = ICA.getFlags();
@@ -1225,15 +1222,13 @@
     case Intrinsic::cttz:
       // FIXME: If necessary, this should go in target-specific overrides.
-      if (VF.isScalar() && RetVF.isScalar() &&
-          getTLI()->isCheapToSpeculateCttz())
+      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
        return TargetTransformInfo::TCC_Basic;
       break;
 
     case Intrinsic::ctlz:
       // FIXME: If necessary, this should go in target-specific overrides.
-      if (VF.isScalar() && RetVF.isScalar() &&
-          getTLI()->isCheapToSpeculateCtlz())
+      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
        return TargetTransformInfo::TCC_Basic;
       break;
 
@@ -1241,16 +1236,14 @@
       return thisT()->getMemcpyCost(ICA.getInst());
 
     case Intrinsic::masked_scatter: {
-      assert(VF.isScalar() && "Can't vectorize types here.");
       const Value *Mask = Args[3];
       bool VarMask = !isa<Constant>(Mask);
       Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
       return thisT()->getGatherScatterOpCost(Instruction::Store,
-                                             Args[0]->getType(), Args[1],
+                                             ICA.getArgTypes()[0], Args[1],
                                              VarMask, Alignment, CostKind, I);
     }
     case Intrinsic::masked_gather: {
-      assert(VF.isScalar() && "Can't vectorize types here.");
       const Value *Mask = Args[2];
       bool VarMask = !isa<Constant>(Mask);
       Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
@@ -1346,32 +1339,20 @@
       return BaseT::getIntrinsicInstrCost(ICA, CostKind);
 
     // Assume that we need to scalarize this intrinsic.
-    SmallVector<Type *, 4> Types;
-    for (const Value *Op : Args) {
-      Type *OpTy = Op->getType();
-      assert(VF.isScalar() || !OpTy->isVectorTy());
-      Types.push_back(VF.isScalar()
-                          ? OpTy
-                          : FixedVectorType::get(OpTy, VF.getKnownMinValue()));
-    }
-
-    if (VF.isVector() && !RetTy->isVoidTy())
-      RetTy = FixedVectorType::get(RetTy, VF.getKnownMinValue());
-
     // Compute the scalarization overhead based on Args for a vector
-    // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
-    // CostModel will pass a vector RetTy and VF is 1.
+    // intrinsic.
     unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
-    if (RetVF.isVector() || VF.isVector()) {
+    if (RetVF.isVector()) {
       ScalarizationCost = 0;
       if (!RetTy->isVoidTy())
         ScalarizationCost +=
             getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
       ScalarizationCost +=
-          getOperandsScalarizationOverhead(Args, VF.getKnownMinValue());
+          getOperandsScalarizationOverhead(Args, RetVF.getKnownMinValue());
     }
 
-    IntrinsicCostAttributes Attrs(IID, RetTy, Types, FMF, ScalarizationCost, I);
+    IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF,
+                                  ScalarizationCost, I);
     return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
   }
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -77,26 +77,34 @@
   ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
 }
 
+static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
+  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
+    return Elt;
+  return VectorType::get(Elt, VF);
+}
+
 IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
                                                  const CallBase &CI,
-                                                 ElementCount Factor)
-    : RetTy(CI.getType()), IID(Id), VF(Factor) {
+                                                 ElementCount VF)
+    : RetTy(MaybeVectorizeType(CI.getType(), VF)), IID(Id) {
 
-  assert(!Factor.isScalable() && "Scalable vectors are not yet supported");
+  assert(!VF.isScalable() && "Scalable vectors are not yet supported");
   if (auto *FPMO = dyn_cast<FPMathOperator>(&CI))
     FMF = FPMO->getFastMathFlags();
 
   Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
   FunctionType *FTy = CI.getCalledFunction()->getFunctionType();
-  ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
+  std::transform(FTy->param_begin(), FTy->param_end(),
+                 std::back_inserter(ParamTys),
+                 [VF](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
 }
 
 IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
                                                  const CallBase &CI,
-                                                 ElementCount Factor,
+                                                 ElementCount VF,
                                                  unsigned ScalarCost)
-    : RetTy(CI.getType()), IID(Id), VF(Factor), ScalarizationCost(ScalarCost) {
+    : RetTy(MaybeVectorizeType(CI.getType(), VF)), IID(Id),
+      ScalarizationCost(ScalarCost) {
 
   if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
     FMF = FPMO->getFastMathFlags();
@@ -104,7 +112,8 @@
   Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
   FunctionType *FTy = CI.getCalledFunction()->getFunctionType();
-  ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
+  std::transform(FTy->param_begin(), FTy->param_end(),
+                 std::back_inserter(ParamTys),
+                 [VF](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
 }
 
 IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -713,39 +713,27 @@
   if (ICA.isTypeBasedOnly())
     return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
 
   Type *RetTy = ICA.getReturnType();
-  unsigned VF = ICA.getVectorFactor().getFixedValue();
   unsigned RetVF =
       (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
                            : 1);
-  assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
   const IntrinsicInst *I = ICA.getInst();
   const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
   FastMathFlags FMF = ICA.getFlags();
   // Assume that we need to scalarize this intrinsic.
-  SmallVector<Type *, 4> Types;
-  for (const Value *Op : Args) {
-    Type *OpTy = Op->getType();
-    assert(VF == 1 || !OpTy->isVectorTy());
-    Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
-  }
-
-  if (VF > 1 && !RetTy->isVoidTy())
-    RetTy = FixedVectorType::get(RetTy, VF);
 
   // Compute the scalarization overhead based on Args for a vector
   // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
   // CostModel will pass a vector RetTy and VF is 1.
   unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
-  if (RetVF > 1 || VF > 1) {
+  if (RetVF > 1) {
     ScalarizationCost = 0;
     if (!RetTy->isVoidTy())
       ScalarizationCost += getScalarizationOverhead(
           cast<FixedVectorType>(RetTy), true, false);
-    ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
+    ScalarizationCost += getOperandsScalarizationOverhead(Args, RetVF);
   }
 
-  IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF,
+  IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF,
                                 ScalarizationCost, I);
   return getIntrinsicInstrCost(Attrs, CostKind);
 }
@@ -766,9 +754,20 @@
   // TODO: Get more refined intrinsic costs?
   unsigned InstRate = getQuarterRateInstrCost(CostKind);
-  if (ICA.getID() == Intrinsic::fma) {
+
+  switch (ICA.getID()) {
+  case Intrinsic::fma:
     InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
                                    : getQuarterRateInstrCost(CostKind);
+    break;
+  case Intrinsic::uadd_sat:
+  case Intrinsic::usub_sat:
+  case Intrinsic::sadd_sat:
+  case Intrinsic::ssub_sat:
+    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
+    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
+      NElts = 1;
+    break;
   }
   return LT.first * NElts * InstRate;
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1531,21 +1531,16 @@
   case Intrinsic::usub_sat: {
     if (!ST->hasMVEIntegerOps())
       break;
-    // Get the Return type, either directly of from ICA.ReturnType and ICA.VF.
     Type *VT = ICA.getReturnType();
-    if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar())
-      VT = VectorType::get(VT, ICA.getVectorFactor());
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
     if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
         LT.second == MVT::v16i8) {
-      // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
+      // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
       // need to extend the type, as it uses shr(qadd(shl, shl)).
-      unsigned Instrs = LT.second.getScalarSizeInBits() ==
-                                ICA.getReturnType()->getScalarSizeInBits()
-                            ? 1
-                            : 4;
+      unsigned Instrs =
+          LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ?
1 : 4; return LT.first * ST->getMVEVectorCostFactor() * Instrs; } break; Index: llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll @@ -8,9 +8,9 @@ ; CHECK-COST-LABEL: sadd ; CHECK-COST: Found an estimated cost of 10 for VF 1 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) -; CHECK-COST: Found an estimated cost of 26 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) -; CHECK-COST: Found an estimated cost of 58 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) -; CHECK-COST: Found an estimated cost of 122 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) +; CHECK-COST: Found an estimated cost of 4 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) +; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) +; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) define void @saddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* nocapture noalias %pDst, i32 %blockSize) #0 { ; CHECK-LABEL: @saddsat( @@ -21,29 +21,38 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BLOCKSIZE]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 15 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934590 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934576 ; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]] ; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i16, i16* [[PSRC:%.*]], i64 [[N_VEC]] ; CHECK-NEXT: [[IND_END4:%.*]] = getelementptr i16, i16* [[PDST:%.*]], i64 [[N_VEC]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[OFFSET:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT9]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[INDEX]] -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[NEXT_GEP]] to <2 x i16>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP3]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x 
i16> [[WIDE_LOAD]], <2 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[NEXT_GEP5]] to <2 x i16>* -; CHECK-NEXT: store <2 x i16> [[TMP4]], <2 x i16>* [[TMP5]], align 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, i16* [[NEXT_GEP]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, <8 x i16>* [[TMP5]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD8]], <8 x i16> [[BROADCAST_SPLAT10]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[NEXT_GEP6]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* [[TMP8]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[NEXT_GEP6]], i64 8 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> [[TMP7]], <8 x i16>* [[TMP10]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] @@ -57,10 +66,10 @@ ; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i16* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PSRC_ADDR_08]], i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP7]], i16 [[OFFSET]]) +; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2 +; CHECK-NEXT: [[TMP13:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP12]], i16 [[OFFSET]]) ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i16, i16* [[PDST_ADDR_07]], i64 1 -; CHECK-NEXT: store i16 [[TMP8]], i16* [[PDST_ADDR_07]], align 2 +; CHECK-NEXT: store i16 [[TMP13]], i16* [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP2:!llvm.loop !.*]] @@ -90,10 +99,10 @@ ; CHECK-COST-LABEL: umin ; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) -; CHECK-COST: Found an estimated cost of 6 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) -; CHECK-COST: Found an estimated cost of 14 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) -; CHECK-COST: Found an estimated cost of 30 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) -; CHECK-COST: Found an estimated cost of 62 
for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) +; CHECK-COST: Found an estimated cost of 1 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) +; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) +; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) +; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) define void @umin(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocapture noalias %pDst, i32 %blockSize) #0 { ; CHECK-LABEL: @umin( @@ -107,78 +116,87 @@ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 7 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 15 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 31 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934576 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934560 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT6]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PSRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[NEXT_GEP]] to <16 x i8>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[NEXT_GEP2]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[TMP5]], align 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i64 16 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT7]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[NEXT_GEP3]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* [[TMP8]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[NEXT_GEP3]], i64 16 +; CHECK-NEXT: [[TMP10:%.*]] = 
bitcast i8* [[TMP9]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[TMP10]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]] -; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]] -; CHECK-NEXT: [[CAST_CRD7:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD7]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 8 -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: [[IND_END19:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END16:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]] +; CHECK-NEXT: [[CAST_CRD12:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END13:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD12]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 24 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[BLOCKSIZE]], -1 -; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP8]], 1 -; CHECK-NEXT: [[N_VEC4:%.*]] = and i64 [[TMP9]], 8589934584 -; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC4]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[BLOCKSIZE]], -1 +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP13]], 1 +; CHECK-NEXT: [[N_VEC9:%.*]] = and i64 [[TMP14]], 8589934584 +; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC9]] to i32 ; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]] -; CHECK-NEXT: [[IND_END10:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC4]] -; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC4]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT20]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC9]] +; CHECK-NEXT: [[IND_END18:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC9]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT25:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT25]], <8 x i8> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr 
i8, i8* [[PSRC]], i64 [[INDEX5]] -; CHECK-NEXT: [[NEXT_GEP18:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX5]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[NEXT_GEP17]] to <8 x i8>* -; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <8 x i8>, <8 x i8>* [[TMP10]], align 2 -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD19]], <8 x i8> [[BROADCAST_SPLAT21]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[NEXT_GEP18]] to <8 x i8>* -; CHECK-NEXT: store <8 x i8> [[TMP11]], <8 x i8>* [[TMP12]], align 2 -; CHECK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX5]], 8 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC4]] -; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] +; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP22:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX10]] +; CHECK-NEXT: [[NEXT_GEP23:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX10]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[NEXT_GEP22]] to <8 x i8>* +; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 2 +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD24]], <8 x i8> [[BROADCAST_SPLAT26]]) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[NEXT_GEP23]] to <8 x i8>* +; CHECK-NEXT: store <8 x i8> [[TMP16]], <8 x i8>* [[TMP17]], align 2 +; CHECK-NEXT: [[INDEX_NEXT11]] = add i64 [[INDEX10]], 8 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP9]], [[N_VEC4]] -; CHECK-NEXT: br i1 [[CMP_N15]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP14]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[CMP_N20]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END8]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi i8* [ [[IND_END10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi i8* [ [[IND_END13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i8* [ [[IND_END15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END16]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i8* [ [[IND_END18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL9]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ 
[[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL14]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL17]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PSRC_ADDR_08]], i64 1 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2 -; CHECK-NEXT: [[TMP15:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP14]], i8 [[OFFSET]]) +; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2 +; CHECK-NEXT: [[TMP20:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP19]], i8 [[OFFSET]]) ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, i8* [[PDST_ADDR_07]], i64 1 -; CHECK-NEXT: store i8 [[TMP15]], i8* [[PDST_ADDR_07]], align 2 +; CHECK-NEXT: store i8 [[TMP20]], i8* [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP6:!llvm.loop !.*]] Index: llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll +++ llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll @@ -157,7 +157,7 @@ ; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) ; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) ; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) -; CHECK-COST: Found an estimated cost of 1 for VF 32 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) +; CHECK-COST: Found an estimated cost of 4 for VF 32 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) define void @cttz(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocapture noalias %pDst, i32 %blockSize) #0 { ; CHECK-LABEL: @cttz( Index: llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll +++ llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll @@ -4,20 +4,22 @@ ; Regression test for a bug in the SLP vectorizer that was causing ; these rotates to be incorrectly combined into a vector rotate. -; The bug fix is at https://reviews.llvm.org/D85759. This test has -; been pre-committed to demonstrate the regressed behavior and provide -; a clear diff for the bug fix. 
-
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 define void @foo(<2 x i64> %x, <4 x i32> %y, i64* %out) #0 {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[X:%.*]], <2 x i64> [[X]], <2 x i64> [[TMP2]])
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[OUT:%.*]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 8
+; CHECK-NEXT:    [[A:%.*]] = extractelement <2 x i64> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[B:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2
+; CHECK-NEXT:    [[CONV6:%.*]] = zext i32 [[B]] to i64
+; CHECK-NEXT:    [[C:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[A]], i64 [[A]], i64 [[CONV6]])
+; CHECK-NEXT:    store i64 [[C]], i64* [[OUT:%.*]], align 8
+; CHECK-NEXT:    [[D:%.*]] = extractelement <2 x i64> [[X]], i32 1
+; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x i32> [[Y]], i32 3
+; CHECK-NEXT:    [[CONV17:%.*]] = zext i32 [[E]] to i64
+; CHECK-NEXT:    [[F:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[D]], i64 [[D]], i64 [[CONV17]])
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[OUT]], i32 1
+; CHECK-NEXT:    store i64 [[F]], i64* [[ARRAYIDX2]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a = extractelement <2 x i64> %x, i32 0
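
Illustrative sketch (not part of the patch): with the VF field removed from IntrinsicCostAttributes, a vectorizer asks for the cost of a widened intrinsic by handing the scalar call plus an ElementCount to the constructor, which widens the return and parameter types itself via MaybeVectorizeType, so no separate VF has to be threaded through getIntrinsicInstrCost. The helper name and the VF value below are assumptions made for the example; CI and TTI are supplied by the caller.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

// Hypothetical caller-side helper, not part of this change: cost of executing
// the scalar intrinsic call CI widened to VF lanes.
static auto getWidenedIntrinsicCost(const IntrinsicInst &CI, ElementCount VF,
                                    const TargetTransformInfo &TTI) {
  // The ElementCount-taking constructor vectorizes CI's return and parameter
  // types internally, so the cost query itself no longer needs a VF argument.
  IntrinsicCostAttributes Attrs(CI.getIntrinsicID(), CI, VF);
  return TTI.getIntrinsicInstrCost(Attrs,
                                   TargetTransformInfo::TCK_RecipThroughput);
}

A loop vectorizer can call such a helper once per candidate VF; since the attributes already carry vector types, getVectorFactor() becomes redundant, which is what allows its removal above.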