diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -506,7 +506,8 @@ /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); + void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc, VPValue *Def, + VPValue *CastDef, VPTransformState &State); /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a /// vector or scalar value on-demand if one is not yet available. When @@ -535,6 +536,10 @@ VectorLoopValueMap.setVectorValue(Scalar, Part, Vector); } + void setScalarValue(Value *Scalar, const VPIteration &Instance, Value *V) { + VectorLoopValueMap.setScalarValue(Scalar, Instance, V); + } + /// Return a value in the new loop corresponding to \p V from the original /// loop at unroll and vector indices \p Instance. If the value has been /// vectorized but not scalarized, the necessary extractelement instruction @@ -544,6 +549,9 @@ /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); + void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, + VPTransformState &State); + /// Try to vectorize interleaved access group \p Group with the base address /// given in \p Addr, optionally masking the vector operations if \p /// BlockInMask is non-null. Use \p State to translate given VPValues to IR @@ -569,6 +577,13 @@ /// Fix the non-induction PHIs in the OrigPHIsToFix vector. void fixNonInductionPHIs(void); + /// Create a broadcast instruction. This method generates a broadcast + /// instruction (shuffle) for loop invariant values and for the induction + /// value. If this is the induction variable then we extend it to N, N+1, ... + /// this is needed because each iteration in the loop corresponds to a SIMD + /// element. + virtual Value *getBroadcastInstrs(Value *V); + protected: friend class LoopVectorizationPlanner; @@ -618,13 +633,6 @@ /// represented as. void truncateToMinimalBitwidths(); - /// Create a broadcast instruction. This method generates a broadcast - /// instruction (shuffle) for loop invariant values and for the induction - /// value. If this is the induction variable then we extend it to N, N+1, ... - /// this is needed because each iteration in the loop corresponds to a SIMD - /// element. - virtual Value *getBroadcastInstrs(Value *V); - /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) /// to each vector element of Val. The sequence starts at StartIndex. /// \p Opcode is relevant for FP induction variable. @@ -638,7 +646,8 @@ /// Note that \p EntryVal doesn't have to be an induction variable - it /// can also be a truncate instruction. void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, - const InductionDescriptor &ID); + const InductionDescriptor &ID, VPValue *Def, + VPValue *CastDef, VPTransformState &State); /// Create a vector induction phi node based on an existing scalar one. \p /// EntryVal is the value from the original loop that maps to the vector phi @@ -646,7 +655,9 @@ /// truncate instruction, instead of widening the original IV, we widen a /// version of the IV truncated to \p EntryVal's type. void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, - Value *Step, Instruction *EntryVal); + Value *Step, Instruction *EntryVal, + VPValue *Def, VPValue *CastDef, + VPTransformState &State); /// Returns true if an instruction \p I should be scalarized instead of /// vectorized for the chosen vectorization factor. @@ -673,11 +684,10 @@ /// latter case \p EntryVal is a TruncInst and we must not record anything for /// that IV, but it's error-prone to expect callers of this routine to care /// about that, hence this explicit parameter. - void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID, - const Instruction *EntryVal, - Value *VectorLoopValue, - unsigned Part, - unsigned Lane = UINT_MAX); + void recordVectorLoopValueForInductionCast( + const InductionDescriptor &ID, const Instruction *EntryVal, + Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, + unsigned Part, unsigned Lane = UINT_MAX); /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -1819,7 +1829,8 @@ } void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( - const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { + const InductionDescriptor &II, Value *Step, Instruction *EntryVal, + VPValue *Def, VPValue *CastDef, VPTransformState &State) { assert((isa(EntryVal) || isa(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); Value *Start = II.getStartValue(); @@ -1874,11 +1885,13 @@ VecInd->setDebugLoc(EntryVal->getDebugLoc()); Instruction *LastInduction = VecInd; for (unsigned Part = 0; Part < UF; ++Part) { - VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); + // VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); + State.set(Def, EntryVal, LastInduction, Part); if (isa(EntryVal)) addMetadata(LastInduction, EntryVal); - recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); + recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, + State, Part); LastInduction = cast(addFastMathFlag( Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); @@ -1914,7 +1927,8 @@ void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( const InductionDescriptor &ID, const Instruction *EntryVal, - Value *VectorLoopVal, unsigned Part, unsigned Lane) { + Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, + unsigned Part, unsigned Lane) { assert((isa(EntryVal) || isa(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); @@ -1934,13 +1948,16 @@ // The rest of the Casts (if exist) have no uses outside the // induction update chain itself. Instruction *CastInst = *Casts.begin(); + assert(CastDef); if (Lane < UINT_MAX) - VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); + State.set(CastDef, CastInst, VectorLoopVal, {Part, Lane}); else - VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); + State.set(CastDef, CastInst, VectorLoopVal, Part); } -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { +void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc, + VPValue *Def, VPValue *CastDef, + VPTransformState &State) { assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); @@ -2002,10 +2019,11 @@ Value *EntryPart = getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, ID.getInductionOpcode()); - VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); + State.set(Def, EntryVal, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); - recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); + recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, + State, Part); } }; @@ -2022,7 +2040,7 @@ // least one user in the loop that is not widened. auto NeedsScalarIV = needsScalarInduction(EntryVal); if (!NeedsScalarIV) { - createVectorIntOrFpInductionPHI(ID, Step, EntryVal); + createVectorIntOrFpInductionPHI(ID, Step, EntryVal, Def, CastDef, State); return; } @@ -2030,13 +2048,13 @@ // create the phi node, we will splat the scalar induction variable in each // loop iteration. if (!shouldScalarizeInstruction(EntryVal)) { - createVectorIntOrFpInductionPHI(ID, Step, EntryVal); + createVectorIntOrFpInductionPHI(ID, Step, EntryVal, Def, CastDef, State); Value *ScalarIV = CreateScalarIV(Step); // Create scalar steps that can be used by instructions we will later // scalarize. Note that the addition of the scalar steps will not increase // the number of instructions in the loop in the common case prior to // InstCombine. We will be trading one vector extract for each scalar step. - buildScalarSteps(ScalarIV, Step, EntryVal, ID); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); return; } @@ -2046,7 +2064,7 @@ Value *ScalarIV = CreateScalarIV(Step); if (!Cost->isScalarEpilogueAllowed()) CreateSplatIV(ScalarIV, Step); - buildScalarSteps(ScalarIV, Step, EntryVal, ID); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); } Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, @@ -2107,7 +2125,9 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, - const InductionDescriptor &ID) { + const InductionDescriptor &ID, + VPValue *Def, VPValue *CastDef, + VPTransformState &State) { // We shouldn't have to build scalar steps if we aren't vectorizing. assert(VF.isVector() && "VF should be greater than one"); assert(!VF.isScalable() && @@ -2143,8 +2163,9 @@ ScalarIVTy, VF.getKnownMinValue() * Part + Lane); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); - VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); - recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); + State.set(Def, Add, {Part, Lane}); + recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, + Part, Lane); } } } @@ -2273,6 +2294,16 @@ VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); } +void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, + const VPIteration &Instance, + VPTransformState &State) { + Value *ScalarInst = State.get(Def, Instance); + Value *VectorValue = State.get(Def, Instance.Part); + VectorValue = Builder.CreateInsertElement( + VectorValue, ScalarInst, State.Builder.getInt32(Instance.Lane)); + State.set(Def, VectorValue, Instance.Part); +} + Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); assert(!VF.isScalable() && "Cannot reverse scalable vectors"); @@ -7398,8 +7429,11 @@ // produces its scalar and vector values. InductionDescriptor II = Legal->getInductionVars().lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) - return new VPWidenIntOrFpInductionRecipe(Phi); + II.getKind() == InductionDescriptor::IK_FpInduction) { + const SmallVectorImpl &Casts = II.getCastInsts(); + return new VPWidenIntOrFpInductionRecipe( + Phi, Casts.empty() ? nullptr : Casts.front()); + } return nullptr; } @@ -7424,7 +7458,7 @@ if (LoopVectorizationPlanner::getDecisionAndClampRange( isOptimizableIVTruncate(I), Range)) return new VPWidenIntOrFpInductionRecipe(cast(I->getOperand(0)), - I); + nullptr, I); return nullptr; } @@ -8037,7 +8071,11 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction(IV, Trunc); + State.ILV->widenIntOrFpInduction( + IV, + IsTrunc ? cast(getVPValue(0)->getUnderlyingValue()) : nullptr, + getVPValue(0), getNumDefinedValues() == 2 ? getVPValue(1) : nullptr, + State); } void VPWidenPHIRecipe::execute(VPTransformState &State) { @@ -8261,12 +8299,70 @@ return CM_ScalarEpilogueAllowed; } +void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, + const VPIteration &Instance) { + set(Def, V, Instance); + ILV->setScalarValue(IRDef, Instance, V); +} + void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, unsigned Part) { set(Def, V, Part); ILV->setVectorValue(IRDef, Part, V); } +Value *VPTransformState::get(VPValue *Def, unsigned Part) { + // If Values have been set for this Def return the one relevant for \p Part. + if (hasVectorValue(Def, Part)) + return Data.PerPartOutput[Def][Part]; + + if (hasScalarValue(Def, {Part, 0})) { + Value *ScalarValue = get(Def, {Part, 0}); + // If we aren't vectorizing, we can just copy the scalar map values over + // to the vector map. + if (VF.isScalar()) { + set(Def, ScalarValue, Part); + return ScalarValue; + } + + auto *RepR = dyn_cast(Def); + bool IsUniform = RepR && RepR->isUniform(); + + unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; + auto *LastInst = cast(get(Def, {Part, LastLane})); + + // Set the insert point after the last scalarized instruction. This + // ensures the insertelement sequence will directly follow the scalar + // definitions. + auto OldIP = Builder.saveIP(); + auto NewIP = std::next(BasicBlock::iterator(LastInst)); + Builder.SetInsertPoint(&*NewIP); + + // However, if we are vectorizing, we need to construct the vector values. + // If the value is known to be uniform after vectorization, we can just + // broadcast the scalar value corresponding to lane zero for each unroll + // iteration. Otherwise, we construct the vector values using + // insertelement instructions. Since the resulting vectors are stored in + // VectorLoopValueMap, we will only generate the insertelements once. + Value *VectorValue = nullptr; + if (IsUniform) { + VectorValue = ILV->getBroadcastInstrs(ScalarValue); + set(Def, VectorValue, Part); + } else { + // Initialize packing with insertelements to start from undef. + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + Value *Undef = UndefValue::get(VectorType::get(LastInst->getType(), VF)); + set(Def, Undef, Part); + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) + ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); + VectorValue = get(Def, Part); + } + Builder.restoreIP(OldIP); + return VectorValue; + } + return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part); +} + // Process the loop in the VPlan-native vectorization path. This path builds // VPlan upfront in the vectorization pipeline, which allows to apply // VPlan-to-VPlan transformations from the very beginning without modifying the diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -280,13 +280,7 @@ /// method will delegate the call to ILV in such cases in order to provide /// callers a consistent API. /// \see set. - Value *get(VPValue *Def, unsigned Part) { - // If Values have been set for this Def return the one relevant for \p Part. - if (Data.PerPartOutput.count(Def)) - return Data.PerPartOutput[Def][Part]; - // Def is managed by ILV: bring the Values from ValueMap. - return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part); - } + Value *get(VPValue *Def, unsigned Part); /// Get the generated Value for a given VPValue and given Part and Lane. Value *get(VPValue *Def, VPIteration Instance); @@ -948,15 +942,21 @@ /// producing their vector and scalar values. class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { PHINode *IV; - TruncInst *Trunc; + bool IsTrunc; public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, TruncInst *Trunc = nullptr) - : VPRecipeBase(VPWidenIntOrFpInductionSC), IV(IV), Trunc(Trunc) { - if (Trunc) + VPWidenIntOrFpInductionRecipe(PHINode *IV, Instruction *Cast, + TruncInst *Trunc = nullptr) + : VPRecipeBase(VPWidenIntOrFpInductionSC), IV(IV), IsTrunc(false) { + + if (Trunc) { new VPValue(VPValue::VPValueSubSC, Trunc, this); - else + IsTrunc = true; + } else new VPValue(VPValue::VPValueSubSC, IV, this); + + if (Cast) + new VPValue(VPValue::VPValueSubSC, Cast, this); } ~VPWidenIntOrFpInductionRecipe() override = default; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -893,10 +893,11 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << "\"WIDEN-INDUCTION"; - if (Trunc) { + if (IsTrunc) { O << "\\l\""; O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; - O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc); + O << " +\n" << Indent << "\" "; + getVPValue(0)->printAsOperand(O, SlotTracker); } else O << " " << VPlanIngredient(IV); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -68,7 +68,7 @@ InductionDescriptor II = Inductions.lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction || II.getKind() == InductionDescriptor::IK_FpInduction) { - NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi); + NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, nullptr); } else NewRecipe = new VPWidenPHIRecipe(Phi); } else if (GetElementPtrInst *GEP = dyn_cast(Inst)) {