diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -118,44 +118,32 @@
   SmallVector<Type *, 4> ParamTys;
   SmallVector<const Value *, 4> Arguments;
   FastMathFlags FMF;
-  ElementCount VF = ElementCount::getFixed(1);
   // If ScalarizationCost is UINT_MAX, the cost of scalarizing the
   // arguments and the return value will be computed based on types.
   unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
 
 public:
-  IntrinsicCostAttributes(const IntrinsicInst &I);
+  IntrinsicCostAttributes(
+      Intrinsic::ID Id, const CallBase &CI,
+      unsigned ScalarizationCost = std::numeric_limits<unsigned>::max());
 
-  IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI);
-
-  IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
-                          ElementCount Factor);
-
-  IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
-                          ElementCount Factor, unsigned ScalarCost);
-
-  IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
-                          ArrayRef<Type *> Tys, FastMathFlags Flags);
-
-  IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
-                          ArrayRef<Type *> Tys, FastMathFlags Flags,
-                          unsigned ScalarCost);
-
-  IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
-                          ArrayRef<Type *> Tys, FastMathFlags Flags,
-                          unsigned ScalarCost,
-                          const IntrinsicInst *I);
-
-  IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
-                          ArrayRef<Type *> Tys);
+  IntrinsicCostAttributes(
+      Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys,
+      FastMathFlags Flags = FastMathFlags(), const IntrinsicInst *I = nullptr,
+      unsigned ScalarCost = std::numeric_limits<unsigned>::max());
 
   IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
                           ArrayRef<const Value *> Args);
 
+  IntrinsicCostAttributes(
+      Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
+      ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(),
+      const IntrinsicInst *I = nullptr,
+      unsigned ScalarCost = std::numeric_limits<unsigned>::max());
+
   Intrinsic::ID getID() const { return IID; }
   const IntrinsicInst *getInst() const { return II; }
   Type *getReturnType() const { return RetTy; }
-  ElementCount getVectorFactor() const { return VF; }
   FastMathFlags getFlags() const { return FMF; }
   unsigned getScalarizationCost() const { return ScalarizationCost; }
   const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1211,12 +1211,9 @@
     Type *RetTy = ICA.getReturnType();
 
-    ElementCount VF = ICA.getVectorFactor();
     ElementCount RetVF =
         (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
                              : ElementCount::getFixed(1));
-    assert((RetVF.isScalar() || VF.isScalar()) &&
-           "VF > 1 and RetVF is a vector type");
     const IntrinsicInst *I = ICA.getInst();
     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
     FastMathFlags FMF = ICA.getFlags();
@@ -1226,15 +1223,13 @@
     case Intrinsic::cttz:
       // FIXME: If necessary, this should go in target-specific overrides.
-      if (VF.isScalar() && RetVF.isScalar() &&
-          getTLI()->isCheapToSpeculateCttz())
+      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
         return TargetTransformInfo::TCC_Basic;
       break;
 
     case Intrinsic::ctlz:
       // FIXME: If necessary, this should go in target-specific overrides.
-      if (VF.isScalar() &&
-          getTLI()->isCheapToSpeculateCtlz())
+      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
         return TargetTransformInfo::TCC_Basic;
       break;
 
@@ -1242,16 +1237,14 @@
       return thisT()->getMemcpyCost(ICA.getInst());
 
     case Intrinsic::masked_scatter: {
-      assert(VF.isScalar() && "Can't vectorize types here.");
       const Value *Mask = Args[3];
       bool VarMask = !isa<Constant>(Mask);
       Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
       return thisT()->getGatherScatterOpCost(Instruction::Store,
-                                             Args[0]->getType(), Args[1],
+                                             ICA.getArgTypes()[0], Args[1],
                                              VarMask, Alignment, CostKind, I);
     }
     case Intrinsic::masked_gather: {
-      assert(VF.isScalar() && "Can't vectorize types here.");
       const Value *Mask = Args[2];
       bool VarMask = !isa<Constant>(Mask);
       Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
@@ -1289,13 +1282,13 @@
     case Intrinsic::vector_reduce_fmin:
     case Intrinsic::vector_reduce_umax:
     case Intrinsic::vector_reduce_umin: {
-      IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I);
+      IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
     }
     case Intrinsic::vector_reduce_fadd:
     case Intrinsic::vector_reduce_fmul: {
       IntrinsicCostAttributes Attrs(
-          IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, 1, I);
+          IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
     }
     case Intrinsic::fshl:
@@ -1347,32 +1340,20 @@
       return BaseT::getIntrinsicInstrCost(ICA, CostKind);
 
     // Assume that we need to scalarize this intrinsic.
-    SmallVector<Type *, 4> Types;
-    for (const Value *Op : Args) {
-      Type *OpTy = Op->getType();
-      assert(VF.isScalar() || !OpTy->isVectorTy());
-      Types.push_back(VF.isScalar()
-                          ? OpTy
-                          : FixedVectorType::get(OpTy, VF.getKnownMinValue()));
-    }
-
-    if (VF.isVector() && !RetTy->isVoidTy())
-      RetTy = FixedVectorType::get(RetTy, VF.getKnownMinValue());
 
     // Compute the scalarization overhead based on Args for a vector
-    // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
-    // CostModel will pass a vector RetTy and VF is 1.
+    // intrinsic.
     unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
-    if (RetVF.isVector() || VF.isVector()) {
+    if (RetVF.isVector()) {
       ScalarizationCost = 0;
       if (!RetTy->isVoidTy())
         ScalarizationCost +=
             getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
       ScalarizationCost +=
-          getOperandsScalarizationOverhead(Args, VF.getKnownMinValue());
+          getOperandsScalarizationOverhead(Args, RetVF.getKnownMinValue());
     }
 
-    IntrinsicCostAttributes Attrs(IID, RetTy, Types, FMF, ScalarizationCost, I);
+    IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
+                                  ScalarizationCost);
     return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
   }
 
@@ -1615,7 +1596,7 @@
       // SatMin -> Overflow && SumDiff >= 0
       unsigned Cost = 0;
       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
-                                    ScalarizationCostPassed);
+                                    nullptr, ScalarizationCostPassed);
       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
@@ -1636,7 +1617,7 @@
       unsigned Cost = 0;
       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
-                                    ScalarizationCostPassed);
+                                    nullptr, ScalarizationCostPassed);
       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -54,86 +54,26 @@
   return true;
 }
 
-IntrinsicCostAttributes::IntrinsicCostAttributes(const IntrinsicInst &I) :
-    II(&I), RetTy(I.getType()), IID(I.getIntrinsicID()) {
-
-  FunctionType *FTy = I.getCalledFunction()->getFunctionType();
-  ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
-  Arguments.insert(Arguments.begin(), I.arg_begin(), I.arg_end());
-  if (auto *FPMO = dyn_cast<FPMathOperator>(&I))
-    FMF = FPMO->getFastMathFlags();
-}
-
-IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
-                                                 const CallBase &CI) :
-    II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id) {
-
-  if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
-    FMF = FPMO->getFastMathFlags();
-
-  Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
-  FunctionType *FTy =
-      CI.getCalledFunction()->getFunctionType();
-  ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
-}
-
 IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
                                                  const CallBase &CI,
-                                                 ElementCount Factor)
-    : RetTy(CI.getType()), IID(Id), VF(Factor) {
-
-  assert(!Factor.isScalable() && "Scalable vectors are not yet supported");
-  if (auto *FPMO = dyn_cast<FPMathOperator>(&CI))
-    FMF = FPMO->getFastMathFlags();
-
-  Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
-  FunctionType *FTy =
-      CI.getCalledFunction()->getFunctionType();
-  ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
-}
-
-IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
-                                                 const CallBase &CI,
-                                                 ElementCount Factor,
-                                                 unsigned ScalarCost)
-    : RetTy(CI.getType()), IID(Id), VF(Factor), ScalarizationCost(ScalarCost) {
+                                                 unsigned ScalarizationCost)
+    : II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id),
+      ScalarizationCost(ScalarizationCost) {
 
   if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
     FMF = FPMO->getFastMathFlags();
 
   Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
-  FunctionType *FTy =
-      CI.getCalledFunction()->getFunctionType();
+  FunctionType *FTy = CI.getCalledFunction()->getFunctionType();
   ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
 }
 
-IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
-                                                 ArrayRef<Type *> Tys,
-                                                 FastMathFlags Flags) :
-    RetTy(RTy), IID(Id), FMF(Flags) {
-  ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
-}
-
-IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
-                                                 ArrayRef<Type *> Tys,
-                                                 FastMathFlags Flags,
-                                                 unsigned ScalarCost) :
-    RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
-  ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
-}
-
 IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
                                                  ArrayRef<Type *> Tys,
                                                  FastMathFlags Flags,
-                                                 unsigned ScalarCost,
-                                                 const IntrinsicInst *I) :
-    II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
-  ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
-}
-
-IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
-                                                 ArrayRef<Type *> Tys) :
-    RetTy(RTy), IID(Id) {
+                                                 const IntrinsicInst *I,
+                                                 unsigned ScalarCost)
+    : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
   ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
 }
 
@@ -147,6 +87,17 @@
     ParamTys.push_back(Arguments[Idx]->getType());
 }
 
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+                                                 ArrayRef<const Value *> Args,
+                                                 ArrayRef<Type *> Tys,
+                                                 FastMathFlags Flags,
+                                                 const IntrinsicInst *I,
+                                                 unsigned ScalarCost)
+    : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
+  ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
+  Arguments.insert(Arguments.begin(), Args.begin(), Args.end());
+}
+
 bool HardwareLoopInfo::isHardwareLoopCandidate(ScalarEvolution &SE,
                                                LoopInfo &LI, DominatorTree &DT,
                                                bool ForceNestedLoop,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -731,40 +731,28 @@
   if (ICA.isTypeBasedOnly())
     return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
 
   Type *RetTy = ICA.getReturnType();
-  unsigned VF = ICA.getVectorFactor().getFixedValue();
   unsigned RetVF =
       (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
                            : 1);
-  assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
   const IntrinsicInst *I = ICA.getInst();
   const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
   FastMathFlags FMF = ICA.getFlags();
 
   // Assume that we need to scalarize this intrinsic.
-  SmallVector<Type *, 4> Types;
-  for (const Value *Op : Args) {
-    Type *OpTy = Op->getType();
-    assert(VF == 1 || !OpTy->isVectorTy());
-    Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
-  }
-
-  if (VF > 1 && !RetTy->isVoidTy())
-    RetTy = FixedVectorType::get(RetTy, VF);
 
   // Compute the scalarization overhead based on Args for a vector
   // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
   // CostModel will pass a vector RetTy and VF is 1.
   unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
-  if (RetVF > 1 || VF > 1) {
+  if (RetVF > 1) {
     ScalarizationCost = 0;
     if (!RetTy->isVoidTy())
       ScalarizationCost +=
           getScalarizationOverhead(cast<FixedVectorType>(RetTy), true, false);
-    ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
+    ScalarizationCost += getOperandsScalarizationOverhead(Args, RetVF);
   }
 
-  IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF,
-                                ScalarizationCost, I);
+  IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
+                                ScalarizationCost);
   return getIntrinsicInstrCost(Attrs, CostKind);
 }
 
@@ -784,9 +772,20 @@
   // TODO: Get more refined intrinsic costs?
   unsigned InstRate = getQuarterRateInstrCost(CostKind);
-  if (ICA.getID() == Intrinsic::fma) {
+
+  switch (ICA.getID()) {
+  case Intrinsic::fma:
     InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
                                    : getQuarterRateInstrCost(CostKind);
+    break;
+  case Intrinsic::uadd_sat:
+  case Intrinsic::usub_sat:
+  case Intrinsic::sadd_sat:
+  case Intrinsic::ssub_sat:
+    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
+    if (any_of(ValidSatTys, [=](MVT M) { return M == LT.second; }))
+      NElts = 1;
+    break;
   }
 
   return LT.first * NElts * InstRate;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1550,21 +1550,16 @@
   case Intrinsic::usub_sat: {
     if (!ST->hasMVEIntegerOps())
       break;
-    // Get the Return type, either directly of from ICA.ReturnType and ICA.VF.
     Type *VT = ICA.getReturnType();
-    if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar())
-      VT = VectorType::get(VT, ICA.getVectorFactor());
 
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
     if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
         LT.second == MVT::v16i8) {
-      // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
+      // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
       // need to extend the type, as it uses shr(qadd(shl, shl)).
-      unsigned Instrs = LT.second.getScalarSizeInBits() ==
-                                ICA.getReturnType()->getScalarSizeInBits()
-                            ? 1
-                            : 4;
+      unsigned Instrs =
+          LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
       return LT.first * ST->getMVEVectorCostFactor() * Instrs;
     }
     break;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3828,10 +3828,27 @@
 InstructionCost
 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                    ElementCount VF) {
+  auto MaybeVectorizeType = [](Type *Elt, ElementCount VF) -> Type * {
+    if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
+      return Elt;
+    return VectorType::get(Elt, VF);
+  };
+
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   assert(ID && "Expected intrinsic call!");
-
-  IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
+  Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
+  FastMathFlags FMF;
+  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+    FMF = FPMO->getFastMathFlags();
+
+  SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
+  FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
+  SmallVector<Type *> ParamTys;
+  std::transform(FTy->param_begin(), FTy->param_end(),
+                 std::back_inserter(ParamTys),
+                 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
+
+  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
+                                    dyn_cast<IntrinsicInst>(CI));
   return TTI.getIntrinsicInstrCost(CostAttrs,
                                    TargetTransformInfo::TCK_RecipThroughput);
 }
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3417,7 +3417,16 @@
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
 
       // Calculate the cost of the scalar and vector calls.
-      IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount());
+      SmallVector<Type *, 4> VecTys;
+      for (Use &Arg : CI->args())
+        VecTys.push_back(
+            FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
+      FastMathFlags FMF;
+      if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
+        FMF = FPCI->getFastMathFlags();
+      SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
+      IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,
+                                        dyn_cast<IntrinsicInst>(CI));
       auto IntrinsicCost =
           TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
 
@@ -3428,11 +3437,6 @@
       auto LibCost = IntrinsicCost;
       if (!CI->isNoBuiltin() && VecFunc) {
         // Calculate the cost of the vector library call.
-        SmallVector<Type *, 4> VecTys;
-        for (Use &Arg : CI->args())
-          VecTys.push_back(
-              FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
-
         // If the corresponding vector call is cheaper, return its cost.
         LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
                                         TTI::TCK_RecipThroughput);
@@ -3798,7 +3802,7 @@
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
 
       // Calculate the cost of the scalar and vector calls.
- IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1); + IntrinsicCostAttributes CostAttrs(ID, *CI, 1); InstructionCost ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind); if (NeedToShuffleReuses) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll @@ -8,9 +8,9 @@ ; CHECK-COST-LABEL: sadd ; CHECK-COST: Found an estimated cost of 10 for VF 1 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) -; CHECK-COST: Found an estimated cost of 26 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) -; CHECK-COST: Found an estimated cost of 58 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) -; CHECK-COST: Found an estimated cost of 122 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) +; CHECK-COST: Found an estimated cost of 4 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) +; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) +; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) define void @saddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* nocapture noalias %pDst, i32 %blockSize) #0 { ; CHECK-LABEL: @saddsat( @@ -21,29 +21,38 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BLOCKSIZE]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 15 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934590 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934576 ; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]] ; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i16, i16* [[PSRC:%.*]], i64 [[N_VEC]] ; CHECK-NEXT: [[IND_END4:%.*]] = getelementptr i16, i16* [[PDST:%.*]], i64 [[N_VEC]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[OFFSET:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT9]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[INDEX]] -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[NEXT_GEP]] to 
<2 x i16>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP3]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[WIDE_LOAD]], <2 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[NEXT_GEP5]] to <2 x i16>* -; CHECK-NEXT: store <2 x i16> [[TMP4]], <2 x i16>* [[TMP5]], align 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, i16* [[NEXT_GEP]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, <8 x i16>* [[TMP5]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD8]], <8 x i16> [[BROADCAST_SPLAT10]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[NEXT_GEP6]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* [[TMP8]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[NEXT_GEP6]], i64 8 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> [[TMP7]], <8 x i16>* [[TMP10]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] @@ -57,10 +66,10 @@ ; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i16* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PSRC_ADDR_08]], i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP7]], i16 [[OFFSET]]) +; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2 +; CHECK-NEXT: [[TMP13:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP12]], i16 [[OFFSET]]) ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i16, i16* [[PDST_ADDR_07]], i64 1 -; CHECK-NEXT: store i16 [[TMP8]], i16* [[PDST_ADDR_07]], align 2 +; CHECK-NEXT: store i16 [[TMP13]], i16* [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP2:!llvm.loop !.*]] @@ -90,10 +99,10 @@ ; CHECK-COST-LABEL: umin ; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) -; CHECK-COST: Found an estimated cost of 6 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) -; CHECK-COST: Found an estimated cost of 14 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) -; 
CHECK-COST: Found an estimated cost of 30 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) -; CHECK-COST: Found an estimated cost of 62 for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) +; CHECK-COST: Found an estimated cost of 1 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) +; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) +; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) +; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset) define void @umin(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocapture noalias %pDst, i32 %blockSize) #0 { ; CHECK-LABEL: @umin( @@ -107,78 +116,87 @@ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 7 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 15 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 31 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934576 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934560 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT6]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PSRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[NEXT_GEP]] to <16 x i8>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[NEXT_GEP2]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[TMP5]], align 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i64 16 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT7]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[NEXT_GEP3]] to <16 x i8>* +; CHECK-NEXT: 
store <16 x i8> [[TMP6]], <16 x i8>* [[TMP8]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[NEXT_GEP3]], i64 16 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[TMP10]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]] -; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]] -; CHECK-NEXT: [[CAST_CRD7:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD7]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 8 -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: [[IND_END19:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END16:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]] +; CHECK-NEXT: [[CAST_CRD12:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END13:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD12]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 24 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[BLOCKSIZE]], -1 -; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP8]], 1 -; CHECK-NEXT: [[N_VEC4:%.*]] = and i64 [[TMP9]], 8589934584 -; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC4]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[BLOCKSIZE]], -1 +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP13]], 1 +; CHECK-NEXT: [[N_VEC9:%.*]] = and i64 [[TMP14]], 8589934584 +; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC9]] to i32 ; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]] -; CHECK-NEXT: [[IND_END10:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC4]] -; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC4]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT20]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC9]] +; CHECK-NEXT: [[IND_END18:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC9]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT25:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT25]], <8 x i8> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX5:%.*]] = 
phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX5]] -; CHECK-NEXT: [[NEXT_GEP18:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX5]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[NEXT_GEP17]] to <8 x i8>* -; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <8 x i8>, <8 x i8>* [[TMP10]], align 2 -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD19]], <8 x i8> [[BROADCAST_SPLAT21]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[NEXT_GEP18]] to <8 x i8>* -; CHECK-NEXT: store <8 x i8> [[TMP11]], <8 x i8>* [[TMP12]], align 2 -; CHECK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX5]], 8 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC4]] -; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] +; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP22:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX10]] +; CHECK-NEXT: [[NEXT_GEP23:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX10]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[NEXT_GEP22]] to <8 x i8>* +; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 2 +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD24]], <8 x i8> [[BROADCAST_SPLAT26]]) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[NEXT_GEP23]] to <8 x i8>* +; CHECK-NEXT: store <8 x i8> [[TMP16]], <8 x i8>* [[TMP17]], align 2 +; CHECK-NEXT: [[INDEX_NEXT11]] = add i64 [[INDEX10]], 8 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP9]], [[N_VEC4]] -; CHECK-NEXT: br i1 [[CMP_N15]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP14]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[CMP_N20]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END8]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi i8* [ [[IND_END10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi i8* [ [[IND_END13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i8* [ [[IND_END15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END16]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i8* [ [[IND_END18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = 
phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL9]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL14]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL17]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PSRC_ADDR_08]], i64 1 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2 -; CHECK-NEXT: [[TMP15:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP14]], i8 [[OFFSET]]) +; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2 +; CHECK-NEXT: [[TMP20:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP19]], i8 [[OFFSET]]) ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, i8* [[PDST_ADDR_07]], i64 1 -; CHECK-NEXT: store i8 [[TMP15]], i8* [[PDST_ADDR_07]], align 2 +; CHECK-NEXT: store i8 [[TMP20]], i8* [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP6:!llvm.loop !.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll --- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll @@ -157,7 +157,7 @@ ; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) ; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) ; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) -; CHECK-COST: Found an estimated cost of 1 for VF 32 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) +; CHECK-COST: Found an estimated cost of 4 for VF 32 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) define void @cttz(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocapture noalias %pDst, i32 %blockSize) #0 { ; CHECK-LABEL: @cttz( diff --git a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll --- a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll +++ b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll @@ -4,20 +4,22 @@ ; Regression test for a bug in the SLP vectorizer that was causing ; these rotates to be incorrectly combined into a vector rotate. -; The bug fix is at https://reviews.llvm.org/D85759. This test has -; been pre-committed to demonstrate the regressed behavior and provide -; a clear diff for the bug fix. 
- target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown" define void @foo(<2 x i64> %x, <4 x i32> %y, i64* %out) #0 { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[X:%.*]], <2 x i64> [[X]], <2 x i64> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[OUT:%.*]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 8 +; CHECK-NEXT: [[A:%.*]] = extractelement <2 x i64> [[X:%.*]], i32 0 +; CHECK-NEXT: [[B:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2 +; CHECK-NEXT: [[CONV6:%.*]] = zext i32 [[B]] to i64 +; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[A]], i64 [[A]], i64 [[CONV6]]) +; CHECK-NEXT: store i64 [[C]], i64* [[OUT:%.*]], align 8 +; CHECK-NEXT: [[D:%.*]] = extractelement <2 x i64> [[X]], i32 1 +; CHECK-NEXT: [[E:%.*]] = extractelement <4 x i32> [[Y]], i32 3 +; CHECK-NEXT: [[CONV17:%.*]] = zext i32 [[E]] to i64 +; CHECK-NEXT: [[F:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[D]], i64 [[D]], i64 [[CONV17]]) +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[OUT]], i32 1 +; CHECK-NEXT: store i64 [[F]], i64* [[ARRAYIDX2]], align 8 ; CHECK-NEXT: ret void ; %a = extractelement <2 x i64> %x, i32 0
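
As a usage sketch (not part of the patch): with the vector factor removed from IntrinsicCostAttributes, a client that wants the cost of a widened intrinsic call now vectorizes the return and parameter types itself and passes them, together with the scalar call's arguments, to the new argument-plus-type constructor. The helper below is hypothetical and only illustrates the call pattern; it mirrors the LoopVectorize change above and assumes an LLVM tree that contains this patch.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"

using namespace llvm;

// Illustrative only: query the cost of calling an intrinsic at a given VF
// using the VF-free IntrinsicCostAttributes constructors.
static InstructionCost getWidenedIntrinsicCost(const TargetTransformInfo &TTI,
                                               CallInst *CI, Intrinsic::ID ID,
                                               ElementCount VF) {
  // Widen scalar int/FP/pointer element types to VF, as LoopVectorize does.
  auto MaybeVectorizeType = [](Type *Elt, ElementCount VF) -> Type * {
    if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
      return Elt;
    return VectorType::get(Elt, VF);
  };

  Type *RetTy = MaybeVectorizeType(CI->getType(), VF);

  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

  // Assumes a direct call; CI->getCalledFunction() must be non-null here.
  SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
  SmallVector<Type *> ParamTys;
  for (Type *Ty : CI->getCalledFunction()->getFunctionType()->params())
    ParamTys.push_back(MaybeVectorizeType(Ty, VF));

  // The vectorization factor is now implicit in the widened types.
  IntrinsicCostAttributes Attrs(ID, RetTy, Arguments, ParamTys, FMF,
                                dyn_cast<IntrinsicInst>(CI));
  return TTI.getIntrinsicInstrCost(Attrs,
                                   TargetTransformInfo::TCK_RecipThroughput);
}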