diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -125,6 +125,21 @@
   std::string ScalarName; /// Scalar Function Name.
   std::string VectorName; /// Vector Function Name associated to this VFInfo.
   VFISAKind ISA;          /// Instruction Set Architecture.
+
+  /// Returns the index of the first parameter with the kind 'GlobalPredicate',
+  /// if any exist.
+  std::optional<unsigned> getParamIndexForOptionalMask() const {
+    unsigned ParamCount = Shape.Parameters.size();
+    for (unsigned i = 0; i < ParamCount; ++i)
+      if (Shape.Parameters[i].ParamKind == VFParamKind::GlobalPredicate)
+        return i;
+
+    return std::nullopt;
+  }
+
+  /// Returns true if at least one of the operands to the vectorized function
+  /// has the kind 'GlobalPredicate'.
+  bool isMasked() const { return getParamIndexForOptionalMask().has_value(); }
 };
 
 namespace VFABI {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1593,7 +1593,8 @@
   /// scalarized -
   /// i.e. either vector version isn't available, or is too expensive.
   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
-                                    bool &NeedToScalarize) const;
+                                    Function **Variant,
+                                    bool *NeedsMask = nullptr) const;
 
   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
   /// that of B.
@@ -3441,9 +3442,8 @@
   }
 }
 
-InstructionCost
-LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
-                                              bool &NeedToScalarize) const {
+InstructionCost LoopVectorizationCostModel::getVectorCallCost(
+    CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const {
   Function *F = CI->getCalledFunction();
   Type *ScalarRetTy = CI->getType();
   SmallVector<Type *, 4> Tys, ScalarTys;
@@ -3475,18 +3475,35 @@
   // If we can't emit a vector call for this function, then the currently found
   // cost is the cost we need to return.
-  NeedToScalarize = true;
+  InstructionCost MaskCost = 0;
   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+  // If we want an unmasked vector function but can't find one matching the VF,
+  // maybe we can find a vector function that does use a mask and synthesize
+  // an all-true mask.
+  if (!VecFunc) {
+    Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true);
+    VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+    // If we found one, add in the cost of creating a mask.
+    if (VecFunc) {
+      if (NeedsMask)
+        *NeedsMask = true;
+      MaskCost = TTI.getShuffleCost(
+          TargetTransformInfo::SK_Broadcast,
+          VectorType::get(
+              IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()),
+              VF));
+    }
+  }
 
   if (!TLI || CI->isNoBuiltin() || !VecFunc)
     return Cost;
 
   // If the corresponding vector cost is cheaper, return its cost.
   InstructionCost VectorCallCost =
-      TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
+      TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
   if (VectorCallCost < Cost) {
-    NeedToScalarize = false;
+    *Variant = VecFunc;
     Cost = VectorCallCost;
   }
   return Cost;
@@ -7327,9 +7344,9 @@
     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
         return *RedCost;
-    bool NeedToScalarize;
+    Function *Variant;
     CallInst *CI = cast<CallInst>(I);
-    InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
+    InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant);
     if (getVectorIntrinsicIDForCall(CI, TLI)) {
       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
       return std::min(CallCost, IntrinsicCost);
@@ -8332,8 +8349,8 @@
 
 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                    ArrayRef<VPValue *> Operands,
-                                                   VFRange &Range) const {
-
+                                                   VFRange &Range,
+                                                   VPlanPtr &Plan) const {
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [this, CI](ElementCount VF) {
         return CM.isScalarWithPredication(CI, VF);
@@ -8350,17 +8367,17 @@
                ID == Intrinsic::experimental_noalias_scope_decl))
     return nullptr;
 
-  ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
+  SmallVector<VPValue *> Ops(Operands.take_front(CI->arg_size()));
 
   // Is it beneficial to perform intrinsic call compared to lib call?
   bool ShouldUseVectorIntrinsic =
       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
                 [&](ElementCount VF) -> bool {
-                  bool NeedToScalarize = false;
+                  Function *Variant;
                   // Is it beneficial to perform intrinsic call compared to lib
                   // call?
                   InstructionCost CallCost =
-                      CM.getVectorCallCost(CI, VF, NeedToScalarize);
+                      CM.getVectorCallCost(CI, VF, &Variant);
                   InstructionCost IntrinsicCost =
                       CM.getVectorIntrinsicCost(CI, VF);
                   return IntrinsicCost <= CallCost;
@@ -8369,6 +8386,9 @@
   if (ShouldUseVectorIntrinsic)
     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
 
+  Function *Variant = nullptr;
+  ElementCount VariantVF;
+  bool NeedsMask = false;
   // Is it better to call a vectorized version of the function than to scalarize
   // the call?
   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8376,14 +8396,48 @@
         // The following case may be scalarized depending on the VF.
         // The flag shows whether we can use a usual Call for vectorized
        // version of the instruction.
-        bool NeedToScalarize = false;
-        CM.getVectorCallCost(CI, VF, NeedToScalarize);
-        return !NeedToScalarize;
+
+        // If we've found a variant at a previous VF, then stop looking. A
+        // vectorized variant of a function expects input in a certain shape
+        // -- basically the number of input registers, the number of lanes
+        // per register, and whether there's a mask required.
+        // We store a pointer to the variant in the VPWidenCallRecipe, so
+        // once we have an appropriate variant it's only valid for that VF.
+        // This will force a different vplan to be generated for each VF that
+        // finds a valid variant.
+        if (Variant)
+          return false;
+        CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask);
+        // If we found a valid vector variant at this VF, then store the VF
+        // in case we need to generate a mask.
+        if (Variant)
+          VariantVF = VF;
+        return Variant != nullptr;
      },
      Range);
 
-  if (ShouldUseVectorCall)
+  if (ShouldUseVectorCall) {
+    if (NeedsMask) {
+      // If our vector variant requires a mask, then synthesize an all-true
+      // mask and insert it into the operands vector in the right place.
+      VPValue *Mask = Plan->getOrAddVPValue(ConstantInt::getTrue(
+          IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
+
+      VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true);
+      unsigned MaskPos = 0;
+
+      for (VFInfo Info : VFDatabase::getMappings(*CI))
+        if (Info.Shape == Shape) {
+          assert(Info.isMasked() && "Vector function info shape mismatch");
+          MaskPos = Info.getParamIndexForOptionalMask().value();
+          break;
+        }
+
+      Ops.insert(Ops.begin() + MaskPos, Mask);
+    }
+
     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
-                                 Intrinsic::not_intrinsic);
+                                 Intrinsic::not_intrinsic, Variant);
+  }
   return nullptr;
 }
@@ -8654,7 +8708,7 @@
     return nullptr;
 
   if (auto *CI = dyn_cast<CallInst>(Instr))
-    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
+    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
 
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -95,7 +95,7 @@
   /// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same
   /// decision from \p Range.Start to \p Range.End.
   VPWidenCallRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands,
-                                    VFRange &Range) const;
+                                    VFRange &Range, VPlanPtr &Plan) const;
 
   /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
   /// if it can. The function should only be called if the cost-model indicates
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -931,13 +931,19 @@
   /// ID of the vector intrinsic to call when widening the call. If set the
   /// Intrinsic::not_intrinsic, a library call will be used instead.
   Intrinsic::ID VectorIntrinsicID;
+  /// If this recipe represents a library call, Variant stores a pointer to
+  /// the chosen function. There is a 1:1 mapping between a given VF and the
+  /// chosen vectorized variant, so there will be a different vplan for each
+  /// VF with a valid variant.
+  Function *Variant;
 
 public:
   template <typename IterT>
   VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments,
-                    Intrinsic::ID VectorIntrinsicID)
+                    Intrinsic::ID VectorIntrinsicID,
+                    Function *Variant = nullptr)
       : VPRecipeBase(VPDef::VPWidenCallSC, CallArguments), VPValue(this, &I),
-        VectorIntrinsicID(VectorIntrinsicID) {}
+        VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) {}
 
   ~VPWidenCallRecipe() override = default;
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -454,11 +454,6 @@
          "DbgInfoIntrinsic should have been dropped during VPlan construction");
   State.setDebugLocFromInst(&CI);
 
-  SmallVector<Type *, 4> Tys;
-  for (Value *ArgOperand : CI.args())
-    Tys.push_back(
-        ToVectorTy(ArgOperand->getType(), State.VF.getKnownMinValue()));
-
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     SmallVector<Type *, 2> TysForDecl = {CI.getType()};
     SmallVector<Value *, 4> Args;
@@ -486,14 +481,12 @@
       VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
       assert(VectorF && "Can't retrieve vector intrinsic.");
     } else {
-      // Use vector version of the function call.
- const VFShape Shape = VFShape::get(CI, State.VF, false /*HasGlobalPred*/); #ifndef NDEBUG - assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr && - "Can't create vector function."); + assert(Variant != nullptr && "Can't create vector function."); #endif - VectorF = VFDatabase(CI).getVectorizedFunction(Shape); + VectorF = Variant; } + SmallVector OpBundles; CI.getOperandBundlesAsDefs(OpBundles); CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); @@ -525,8 +518,12 @@ if (VectorIntrinsicID) O << " (using vector intrinsic)"; - else - O << " (using library function)"; + else { + O << " (using library function"; + if (Variant->hasName()) + O << ": " << Variant->getName(); + O << ")"; + } } void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -10,39 +10,44 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-LABEL: @test_widen( ; TFNONE-NEXT: entry: -; TFNONE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TFNONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; TFNONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TFNONE: vector.ph: +; TFNONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]] ; TFNONE: vector.body: ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFNONE-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4 -; TFNONE-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 -; TFNONE-NEXT: [[TMP3:%.*]] = call i64 @foo(i64 [[TMP2]]) #[[ATTR2:[0-9]+]] -; TFNONE-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 -; TFNONE-NEXT: [[TMP5:%.*]] = call i64 @foo(i64 [[TMP4]]) #[[ATTR2]] -; TFNONE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0 -; TFNONE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[TMP5]], i32 1 -; TFNONE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP8]], align 4 -; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; TFNONE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; TFNONE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 +; TFNONE-NEXT: [[TMP5:%.*]] = call @foo_vector( [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 4 +; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], [[TMP8]] +; TFNONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TFNONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; TFNONE: middle.block: -; TFNONE-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; TFNONE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; TFNONE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; TFNONE: scalar.ph: -; TFNONE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; TFNONE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; TFNONE-NEXT: br label [[FOR_BODY:%.*]] ; TFNONE: for.body: ; TFNONE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFNONE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] ; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 -; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2]] +; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2:[0-9]+]] ; TFNONE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] ; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 ; TFNONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; TFNONE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 -; TFNONE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; TFNONE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; TFNONE: for.cond.cleanup: ; TFNONE-NEXT: ret void ; @@ -64,39 +69,44 @@ ; ; TFFALLBACK-LABEL: @test_widen( ; TFFALLBACK-NEXT: entry: -; TFFALLBACK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TFFALLBACK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TFFALLBACK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; TFFALLBACK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TFFALLBACK: vector.ph: +; TFFALLBACK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]] ; TFFALLBACK: vector.body: ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFFALLBACK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4 -; TFFALLBACK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 -; TFFALLBACK-NEXT: [[TMP3:%.*]] = call i64 @foo(i64 [[TMP2]]) #[[ATTR2:[0-9]+]] -; TFFALLBACK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 -; TFFALLBACK-NEXT: [[TMP5:%.*]] = call i64 @foo(i64 [[TMP4]]) #[[ATTR2]] -; TFFALLBACK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0 -; TFFALLBACK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[TMP5]], i32 1 -; TFFALLBACK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP8]], align 4 -; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; TFFALLBACK-NEXT: [[TMP10:%.*]] = icmp eq i64 
[[INDEX_NEXT]], 1024 -; TFFALLBACK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; TFFALLBACK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 +; TFFALLBACK-NEXT: [[TMP5:%.*]] = call @foo_vector( [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; TFFALLBACK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFFALLBACK-NEXT: store [[TMP5]], ptr [[TMP6]], align 4 +; TFFALLBACK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; TFFALLBACK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TFFALLBACK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; TFFALLBACK: middle.block: -; TFFALLBACK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; TFFALLBACK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; TFFALLBACK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; TFFALLBACK: scalar.ph: -; TFFALLBACK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; TFFALLBACK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; TFFALLBACK-NEXT: br label [[FOR_BODY:%.*]] ; TFFALLBACK: for.body: ; TFFALLBACK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFFALLBACK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] ; TFFALLBACK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 -; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2]] +; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2:[0-9]+]] ; TFFALLBACK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] ; TFFALLBACK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 ; TFFALLBACK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; TFFALLBACK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 -; TFFALLBACK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; TFFALLBACK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; TFFALLBACK: for.cond.cleanup: ; TFFALLBACK-NEXT: ret void ; @@ -342,14 +352,14 @@ ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] ; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 -; TFNONE-NEXT: [[TMP6:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) -; TFNONE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: store [[TMP6]], ptr [[TMP7]], align 4 -; TFNONE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 -; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; TFNONE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TFNONE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TFNONE-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) +; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], 
i64 [[INDEX]] +; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 4 +; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; TFNONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TFNONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TFNONE: middle.block: ; TFNONE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; TFNONE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -401,14 +411,14 @@ ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] ; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 -; TFFALLBACK-NEXT: [[TMP6:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) -; TFFALLBACK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: store [[TMP6]], ptr [[TMP7]], align 4 -; TFFALLBACK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; TFFALLBACK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 -; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; TFFALLBACK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TFFALLBACK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TFFALLBACK-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) +; TFFALLBACK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFFALLBACK-NEXT: store [[TMP5]], ptr [[TMP6]], align 4 +; TFFALLBACK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; TFFALLBACK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TFFALLBACK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TFFALLBACK: middle.block: ; TFFALLBACK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; TFFALLBACK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -466,14 +476,14 @@ ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] ; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 -; TFNONE-NEXT: [[TMP6:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) -; TFNONE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: store [[TMP6]], ptr [[TMP7]], align 4 -; TFNONE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 -; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; TFNONE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TFNONE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; TFNONE-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) +; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 4 +; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; TFNONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; 
TFNONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TFNONE: middle.block: ; TFNONE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; TFNONE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -525,14 +535,14 @@ ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] ; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 -; TFFALLBACK-NEXT: [[TMP6:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) -; TFFALLBACK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: store [[TMP6]], ptr [[TMP7]], align 4 -; TFFALLBACK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; TFFALLBACK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 -; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; TFFALLBACK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TFFALLBACK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; TFFALLBACK-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) +; TFFALLBACK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFFALLBACK-NEXT: store [[TMP5]], ptr [[TMP6]], align 4 +; TFFALLBACK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; TFFALLBACK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TFFALLBACK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TFFALLBACK: middle.block: ; TFFALLBACK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; TFFALLBACK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -572,7 +582,7 @@ declare i64 @foo(i64) -; vector variants of foo +;; scalable vector variants of foo declare @foo_uniform(i64, ) declare @foo_vector(, ) declare @foo_vector_nomask() diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll @@ -0,0 +1,352 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; REQUIRES: asserts + +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +;; Given the choice between a masked and unmasked variant for the same VF (4) +;; where no mask is required, make sure we choose the unmasked variant. 
+ +; CHECK-LABEL: LV: Checking a loop in 'test_v4_v4m' +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION +; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1> +; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<%3> +; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep> +; CHECK-NEXT: REPLICATE ir<%call> = call @foo(ir<%load>) +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<%3> +; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call> +; CHECK-NEXT: EMIT vp<%8> = VF * UF +(nuw) vp<%2> +; CHECK-NEXT: EMIT branch-on-count vp<%8> vp<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION +; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1> +; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<%3> +; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep> +; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed4_nomask) +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<%3> +; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call> +; CHECK-NEXT: EMIT vp<%8> = VF * UF +(nuw) vp<%2> +; CHECK-NEXT: EMIT branch-on-count vp<%8> vp<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +;; If we have a masked variant at one VF and an unmasked variant at a different +;; VF, ensure we create appropriate recipes (including a synthesized all-true +;; mask for the masked variant) + +; CHECK-LABEL: LV: Checking a loop in 'test_v2_v4m' +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION +; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1> +; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<%3> +; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep> +; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed2_nomask) +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<%3> +; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call> +; CHECK-NEXT: EMIT vp<%8> = VF * UF +(nuw) vp<%2> +; CHECK-NEXT: EMIT branch-on-count vp<%8> vp<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION +; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1> +; CHECK-NEXT: 
CLONE ir<%gep> = getelementptr ir<%b>, vp<%3> +; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep> +; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>, ir) (using library function: foo_vector_fixed4_mask) +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<%3> +; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call> +; CHECK-NEXT: EMIT vp<%8> = VF * UF +(nuw) vp<%2> +; CHECK-NEXT: EMIT branch-on-count vp<%8> vp<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +;; If we have two variants at different VFs, neither of which are masked, we +;; still expect to see a different vplan per VF. + +; CHECK-LABEL: LV: Checking a loop in 'test_v2_v4' +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION +; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1> +; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<%3> +; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep> +; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed2_nomask) +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<%3> +; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call> +; CHECK-NEXT: EMIT vp<%8> = VF * UF +(nuw) vp<%2> +; CHECK-NEXT: EMIT branch-on-count vp<%8> vp<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION +; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1> +; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<%3> +; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep> +; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed4_nomask) +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%a>, vp<%3> +; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call> +; CHECK-NEXT: EMIT vp<%8> = VF * UF +(nuw) vp<%2> +; CHECK-NEXT: EMIT branch-on-count vp<%8> vp<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +define void @test_v4_v4m(ptr noalias %a, ptr readonly %b) #3 { +; CHECK-LABEL: @test_v4_v4m( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_nomask(<4 x i64> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr 
[[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %gep = getelementptr i64, ptr %b, i64 %indvars.iv + %load = load i64, ptr %gep + %call = call i64 @foo(i64 %load) #0 + %arrayidx = getelementptr inbounds i64, ptr %a, i64 %indvars.iv + store i64 %call, ptr %arrayidx + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void + +} + +define void @test_v2_v4m(ptr noalias %a, ptr readonly %b) #3 { +; CHECK-LABEL: @test_v2_v4m( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_mask(<4 x i64> [[WIDE_LOAD]], <4 x i1> ) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, 
[[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %gep = getelementptr i64, ptr %b, i64 %indvars.iv + %load = load i64, ptr %gep + %call = call i64 @foo(i64 %load) #1 + %arrayidx = getelementptr inbounds i64, ptr %a, i64 %indvars.iv + store i64 %call, ptr %arrayidx + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void + +} + +define void @test_v2_v4(ptr noalias %a, ptr readonly %b) #3 { +; CHECK-LABEL: @test_v2_v4( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_nomask(<4 x i64> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = 
icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %gep = getelementptr i64, ptr %b, i64 %indvars.iv + %load = load i64, ptr %gep + %call = call i64 @foo(i64 %load) #2 + %arrayidx = getelementptr inbounds i64, ptr %a, i64 %indvars.iv + store i64 %call, ptr %arrayidx + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void + +} + +declare i64 @foo(i64) + +;; fixed vector variants of foo +declare <2 x i64> @foo_vector_fixed2_nomask(<2 x i64>) +declare <4 x i64> @foo_vector_fixed4_nomask(<4 x i64>) +declare <4 x i64> @foo_vector_fixed4_mask(<4 x i64>, <4 x i1>) + +attributes #0 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N4v_foo(foo_vector_fixed4_nomask),_ZGV_LLVM_M4v_foo(foo_vector_fixed4_mask)" } +attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_foo(foo_vector_fixed2_nomask),_ZGV_LLVM_M4v_foo(foo_vector_fixed4_mask)" } +attributes #2 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_foo(foo_vector_fixed2_nomask),_ZGV_LLVM_N4v_foo(foo_vector_fixed4_nomask)" } +attributes #3 = { "target-features"="+sve" vscale_range(2,16) "no-trapping-math"="false" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll @@ -19,7 +19,7 @@ ; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr ir<%src>, vp<%3> ; CHECK-NEXT: WIDEN ir<%l> = load ir<%gep.src> ; CHECK-NEXT: WIDEN ir<%conv> = fpext ir<%l> -; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using library function) +; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using library function: __simd_sin_v2f64) ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<%3> ; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst> ; CHECK-NEXT: EMIT vp<%10> = VF * UF +(nuw) vp<%2>