diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -409,6 +409,9 @@
   /// Widen a single instruction within the innermost loop.
   void widenInstruction(Instruction &I);
 
+  /// Widen a single call instruction within the innermost loop.
+  void widenCallInstruction(CallInst &I);
+
   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
   void fixVectorizedLoop();
 
@@ -4348,54 +4351,60 @@
     }
     break;
   }
+  default:
+    // This instruction is not vectorized by simple widening.
+    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+    llvm_unreachable("Unhandled instruction!");
+  } // end of switch.
+}
 
-  case Instruction::Call: {
-    // Ignore dbg intrinsics.
-    if (isa<DbgInfoIntrinsic>(I))
-      break;
-    setDebugLocFromInst(Builder, &I);
+void InnerLoopVectorizer::widenCallInstruction(CallInst &I) {
+  // Ignore dbg intrinsics.
+  if (isa<DbgInfoIntrinsic>(I))
+    return;
+  setDebugLocFromInst(Builder, &I);
 
-    Module *M = I.getParent()->getParent()->getParent();
-    auto *CI = cast<CallInst>(&I);
+  Module *M = I.getParent()->getParent()->getParent();
+  auto *CI = cast<CallInst>(&I);
 
-    SmallVector<Type *, 4> Tys;
-    for (Value *ArgOperand : CI->arg_operands())
-      Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+  SmallVector<Type *, 4> Tys;
+  for (Value *ArgOperand : CI->arg_operands())
+    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
 
-    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
 
-    // The flag shows whether we use Intrinsic or a usual Call for vectorized
-    // version of the instruction.
-    // Is it beneficial to perform intrinsic call compared to lib call?
-    bool NeedToScalarize = false;
-    unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
-    bool UseVectorIntrinsic =
-        ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
-    assert((UseVectorIntrinsic || !NeedToScalarize) &&
-           "Instruction should be scalarized elsewhere.");
+  // The flag shows whether we use Intrinsic or a usual Call for vectorized
+  // version of the instruction.
+  // Is it beneficial to perform intrinsic call compared to lib call?
+  bool NeedToScalarize = false;
+  unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
+  bool UseVectorIntrinsic =
+      ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
+  assert((UseVectorIntrinsic || !NeedToScalarize) &&
+         "Instruction should be scalarized elsewhere.");
 
-    for (unsigned Part = 0; Part < UF; ++Part) {
-      SmallVector<Value *, 4> Args;
-      for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
-        Value *Arg = CI->getArgOperand(i);
-        // Some intrinsics have a scalar argument - don't replace it with a
-        // vector.
-        if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
-          Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
-        Args.push_back(Arg);
-      }
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    SmallVector<Value *, 4> Args;
+    for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+      Value *Arg = CI->getArgOperand(i);
+      // Some intrinsics have a scalar argument - don't replace it with a
+      // vector.
+      if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
+        Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
+      Args.push_back(Arg);
+    }
 
-      Function *VectorF;
-      if (UseVectorIntrinsic) {
-        // Use vector version of the intrinsic.
-        Type *TysForDecl[] = {CI->getType()};
-        if (VF > 1)
-          TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
-        VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
-      } else {
-        // Use vector version of the function call.
-        const VFShape Shape =
-            VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
+    Function *VectorF;
+    if (UseVectorIntrinsic) {
+      // Use vector version of the intrinsic.
+      Type *TysForDecl[] = {CI->getType()};
+      if (VF > 1)
+        TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+      VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+    } else {
+      // Use vector version of the function call.
+      const VFShape Shape =
+          VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
 #ifndef NDEBUG
         const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI);
         assert(std::find_if(Infos.begin(), Infos.end(),
@@ -4418,15 +4427,6 @@
       VectorLoopValueMap.setVectorValue(&I, Part, V);
       addMetadata(V, &I);
     }
-
-    break;
-  }
-
-  default:
-    // This instruction is not vectorized by simple widening.
-    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
-    llvm_unreachable("Unhandled instruction!");
-  } // end of switch.
 }
 
 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
@@ -6884,8 +6884,42 @@
   return new VPBlendRecipe(Phi, Masks);
 }
 
-VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) {
+VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(Instruction *I,
+                                                   VFRange &Range) {
+
+  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
+      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
+
+  CallInst *CI = dyn_cast<CallInst>(I);
+  if (IsPredicated || !CI)
+    return nullptr;
+
+  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
+    return nullptr;
+
+  auto willWiden = [&](unsigned VF) -> bool {
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+    // The following case may be scalarized depending on the VF.
+    // The flag shows whether we use Intrinsic or a usual Call for vectorized
+    // version of the instruction.
+    // Is it beneficial to perform intrinsic call compared to lib call?
+    bool NeedToScalarize = false;
+    unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
+    bool UseVectorIntrinsic =
+        ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
+    return UseVectorIntrinsic || !NeedToScalarize;
+  };
+
+  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
+    return nullptr;
+
+  // Success: widen this call.
+  return new VPWidenCallRecipe(*CI);
+}
 
+VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) {
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
 
@@ -6899,7 +6933,6 @@
     case Instruction::AShr:
     case Instruction::BitCast:
     case Instruction::Br:
-    case Instruction::Call:
     case Instruction::FAdd:
     case Instruction::FCmp:
     case Instruction::FDiv:
@@ -6941,29 +6974,10 @@
   if (!IsVectorizableOpcode(I->getOpcode()))
     return nullptr;
 
-  if (CallInst *CI = dyn_cast<CallInst>(I)) {
-    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
-               ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
-      return nullptr;
-  }
-
   auto willWiden = [&](unsigned VF) -> bool {
     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
                              CM.isProfitableToScalarize(I, VF)))
       return false;
-    if (CallInst *CI = dyn_cast<CallInst>(I)) {
-      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-      // The following case may be scalarized depending on the VF.
-      // The flag shows whether we use Intrinsic or a usual Call for vectorized
-      // version of the instruction.
-      // Is it beneficial to perform intrinsic call compared to lib call?
-      bool NeedToScalarize;
-      unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
-      bool UseVectorIntrinsic =
-          ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
-      return UseVectorIntrinsic || !NeedToScalarize;
-    }
     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
       assert(CM.getWideningDecision(I, VF) ==
                  LoopVectorizationCostModel::CM_Scalarize &&
@@ -7079,9 +7093,10 @@
     return true;
   }
 
-  // Check if Instr is to be widened by a general VPWidenRecipe, after
-  // having first checked for specific widening recipes.
-  if ((Recipe = tryToWiden(Instr, Range))) {
+  // Check if Instr is to be widened by a VPWidenCallRecipe or a general
+  // VPWidenRecipe, after having first checked for specific widening recipes.
+  if ((Recipe = tryToWidenCall(Instr, Range)) ||
+      (Recipe = tryToWiden(Instr, Range))) {
     setRecipe(Instr, Recipe);
     VPBB->appendRecipe(Recipe);
     return true;
@@ -7371,6 +7386,10 @@
         << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
 }
 
+void VPWidenCallRecipe::execute(VPTransformState &State) {
+  State.ILV->widenCallInstruction(Ingredient);
+}
+
 void VPWidenRecipe::execute(VPTransformState &State) {
   State.ILV->widenInstruction(Ingredient);
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -107,6 +107,11 @@
   /// full if-conversion.
   VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
 
+  /// Check if \p I is a call that can be widened within the given VF \p Range.
+  /// If so, build a new VPWidenCallRecipe and return it; otherwise return
+  /// nullptr. \p Range may be clamped so the decision holds across it.
+  VPWidenCallRecipe *tryToWidenCall(Instruction *I, VFRange &Range);
+
   /// Check if \p I can be widened within the given VF \p Range. If \p I can be
   /// widened for \p Range.Start, build a new VPWidenRecipe and return it.
   /// Range.End may be decreased to ensure same decision from \p Range.Start to
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -614,6 +614,7 @@
     VPWidenMemoryInstructionSC,
     VPWidenPHISC,
     VPWidenSC,
+    VPWidenCallSC,
   };
 
   VPRecipeBase(const unsigned char SC) : SubclassID(SC) {}
@@ -783,6 +784,30 @@
              VPSlotTracker &SlotTracker) const override;
 };
 
+/// A recipe for widening a single call instruction.
+class VPWidenCallRecipe : public VPRecipeBase {
+private:
+  /// Hold the instruction to be widened.
+  CallInst &Ingredient;
+
+public:
+  VPWidenCallRecipe(CallInst &I) : VPRecipeBase(VPWidenCallSC), Ingredient(I) {}
+
+  ~VPWidenCallRecipe() override = default;
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPWidenCallSC;
+  }
+
+  /// Produce a widened version of the call instruction.
+  void execute(VPTransformState &State) override;
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+};
+
 /// A recipe for handling GEP instructions.
 class VPWidenGEPRecipe : public VPRecipeBase {
 private:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -712,6 +712,12 @@
   O << DOT::EscapeString(IngredientString);
 }
 
+void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
+                              VPSlotTracker &SlotTracker) const {
+  O << " +\n" << Indent << "\"WIDEN\\l\"";
+  O << " +\n" << Indent << "\" " << VPlanIngredient(&Ingredient) << "\\l\"";
+}
+
 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
   O << " +\n" << Indent << "\"WIDEN\\l\"";