Index: ../lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- ../lib/Transforms/Vectorize/LoopVectorize.cpp +++ ../lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1608,12 +1608,6 @@ /// Returns true if the value V is uniform within the loop. bool isUniform(Value *V); - /// Returns true if \p I is known to be uniform after vectorization. - bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); } - - /// Returns true if \p I is known to be scalar after vectorization. - bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); } - /// Returns the information that we collected about runtime memory check. const RuntimePointerChecking *getRuntimePointerChecking() const { return LAI->getRuntimePointerChecking(); @@ -1690,15 +1684,9 @@ /// instructions that may divide by zero. bool isScalarWithPredication(Instruction *I); - /// Returns true if \p I is a memory instruction that has a consecutive or - /// consecutive-like pointer operand. Consecutive-like pointers are pointers - /// that are treated like consecutive pointers during vectorization. The - /// pointer operands of interleaved accesses are an example. - bool hasConsecutiveLikePtrOperand(Instruction *I); - - /// Returns true if \p I is a memory instruction that must be scalarized - /// during vectorization. - bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1); + /// Returns true if \p I is a memory instruction with consecutive memory + /// access that can be widened. + bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); private: /// Check if a single basic block loop is vectorizable. @@ -1716,24 +1704,6 @@ /// transformation. bool canVectorizeWithIfConvert(); - /// Collect the instructions that are uniform after vectorization. An - /// instruction is uniform if we represent it with a single scalar value in - /// the vectorized loop corresponding to each vector iteration. Examples of - /// uniform instructions include pointer operands of consecutive or - /// interleaved memory accesses. Note that although uniformity implies an - /// instruction will be scalar, the reverse is not true. In general, a - /// scalarized instruction will be represented by VF scalar values in the - /// vectorized loop, each corresponding to an iteration of the original - /// scalar loop. - void collectLoopUniforms(); - - /// Collect the instructions that are scalar after vectorization. An - /// instruction is scalar if it is known to be uniform or will be scalarized - /// during vectorization. Non-uniform scalarized instructions will be - /// represented by VF values in the vectorized loop, each corresponding to an - /// iteration of the original scalar loop. - void collectLoopScalars(); - /// Return true if all of the instructions in the block can be speculatively /// executed. \p SafePtrs is a list of addresses that are known to be legal /// and we know that we can read from them without segfault. @@ -1823,12 +1793,6 @@ /// vars which can be accessed from outside the loop. SmallPtrSet AllowedExit; - /// Holds the instructions known to be uniform after vectorization. - SmallPtrSet Uniforms; - - /// Holds the instructions known to be scalar after vectorization. - SmallPtrSet Scalars; - /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; @@ -1885,6 +1849,19 @@ unsigned selectInterleaveCount(bool OptForSize, unsigned VF, unsigned LoopCost); + /// Memory access instruction may be vectorized in more than one way. 
+  /// The form of the instruction after vectorization depends on its cost.
+  /// This function makes cost-based decisions for Load/Store instructions
+  /// and collects them in a map. This decision map is used for building
+  /// the lists of loop-uniform and loop-scalar instructions.
+  void setCostBasedWideningDecision(unsigned VF);
+
+  /// Returns true if \p I is a memory instruction that has a consecutive or
+  /// consecutive-like pointer operand. Consecutive-like pointers are pointers
+  /// that are treated like consecutive pointers during vectorization. The
+  /// pointer operands of interleaved accesses are an example.
+  bool hasConsecutiveLikePtrOperand(Instruction *I, unsigned VF);
+
   /// \brief A struct that represents some properties of the register usage
   /// of a loop.
   struct RegisterUsage {
@@ -1919,11 +1896,68 @@
     return Scalars->second.count(I);
   }

+  /// Returns true if \p I is known to be uniform after vectorization.
+  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
+    if (VF == 1)
+      return true;
+    assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
+    auto UniformsPerVF = Uniforms.find(VF);
+    return UniformsPerVF->second.count(I);
+  }
+
+  /// Returns true if \p I is known to be scalar after vectorization.
+  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
+    if (VF == 1)
+      return true;
+    assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
+    auto ScalarsPerVF = Scalars.find(VF);
+    return ScalarsPerVF->second.count(I);
+  }
+
   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
   /// for vectorization factor \p VF.
   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
     return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
-           !Legal->isScalarAfterVectorization(I);
+           !isScalarAfterVectorization(I, VF);
+  }
+
+  /// The decision that was taken by the cost model for a memory instruction.
+  enum InstWidening {
+    CM_DECISION_NONE,
+    CM_DECISION_WIDEN,
+    CM_DECISION_INTERLEAVE,
+    CM_DECISION_GATHER_SCATTER,
+    CM_DECISION_SCALARIZE
+  };
+
+  /// Save vectorization decision \p W taken by the cost model for
+  /// instruction \p I and vector width \p VF.
+  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W) {
+    assert(VF >= 2 && "Expected VF >=2");
+    WideningDecisions[std::make_pair(I, VF)] = W;
+  }
+
+  /// Save vectorization decision \p W taken by the cost model for
+  /// interleaving group \p Grp and vector width \p VF.
+  void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
+                           InstWidening W) {
+    assert(VF >= 2 && "Expected VF >=2");
+    // Broadcast this decision to all instructions inside the group.
+    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
+      if (auto *I = Grp->getMember(i))
+        WideningDecisions[std::make_pair(I, VF)] = W;
+    }
+  }
+
+  /// Return the cost model decision for the given instruction \p I and vector
+  /// width \p VF. Return CM_DECISION_NONE if this instruction did not pass
+  /// through the cost modeling.
+  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
+    assert(VF >= 2 && "Expected VF >=2");
+    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+    if (!WideningDecisions.count(InstOnVF))
+      return CM_DECISION_NONE;
+    return WideningDecisions[InstOnVF];
   }

 private:
@@ -1950,6 +1984,12 @@
   /// the vector type as an output parameter.
   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

+  /// The cost computation for a scalarized memory instruction.
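The per-VF API above replaces the loop-global Uniforms/Scalars sets that the patch removes from LoopVectorizationLegality. As a rough illustration of the data structures involved, the following self-contained C++ sketch (not part of the patch; it uses simplified stand-in types instead of LLVM's Instruction, DenseMap and SmallPtrSet) shows a widening-decision cache keyed on an (instruction, VF) pair, per-VF uniform/scalar sets, and the run-once guard that collectUniformsAndScalars() relies on.

#include <cstdio>
#include <map>
#include <set>
#include <utility>

// Hypothetical stand-ins for the LLVM types; illustration only.
using InstrId = int;
enum class InstWidening { None, Widen, Interleave, GatherScatter, Scalarize };

struct CostModelSketch {
  // One widening decision per (instruction, VF) pair, like WideningDecisions.
  std::map<std::pair<InstrId, unsigned>, InstWidening> WideningDecisions;
  // Uniform/scalar sets are keyed by VF instead of being loop-global.
  std::map<unsigned, std::set<InstrId>> Uniforms, Scalars;

  void setWideningDecision(InstrId I, unsigned VF, InstWidening W) {
    WideningDecisions[{I, VF}] = W;
  }

  InstWidening getWideningDecision(InstrId I, unsigned VF) const {
    auto It = WideningDecisions.find({I, VF});
    return It == WideningDecisions.end() ? InstWidening::None : It->second;
  }

  // Mirrors collectUniformsAndScalars(): run the per-VF analysis only once.
  void collectUniformsAndScalars(unsigned VF) {
    if (VF == 1 || Uniforms.count(VF))
      return;            // VF == 1 needs no analysis; other VFs are cached.
    Uniforms[VF];        // Create the entries even if they stay empty,
    Scalars[VF];         // so the next call for this VF returns early.
    // ... widening decisions and uniform/scalar collection would go here.
  }
};

int main() {
  CostModelSketch CM;
  CM.collectUniformsAndScalars(4);
  CM.setWideningDecision(/*I=*/1, /*VF=*/4, InstWidening::Widen);
  std::printf("%d\n", CM.getWideningDecision(1, 4) == InstWidening::Widen);
}

Creating the (possibly empty) per-VF entries up front is what lets Uniforms.count(VF) double as the "already analyzed" flag, matching the comment in collectLoopUniforms() below.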
+ unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF); + + /// The cost computation for interleaving group of memory instructions. + unsigned getInterleaveGroupCost(Instruction *I, unsigned VF); + /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. bool isConsecutiveLoadOrStore(Instruction *I); @@ -1979,6 +2019,14 @@ /// vectorization factor. The entries are VF-ScalarCostTy pairs. DenseMap InstsToScalarize; + /// Holds the instructions known to be uniform after vectorization. + /// The data is collected per VF. + DenseMap> Uniforms; + + /// Holds the instructions known to be scalar after vectorization. + /// The data is collected per VF. + DenseMap> Scalars; + /// Returns the expected difference in cost from scalarizing the expression /// feeding a predicated instruction \p PredInst. The instructions to /// scalarize and their scalar costs are collected in \p ScalarCosts. A @@ -1991,6 +2039,43 @@ /// the loop. void collectInstsToScalarize(unsigned VF); + /// Collect the instructions that are uniform after vectorization. An + /// instruction is uniform if we represent it with a single scalar value in + /// the vectorized loop corresponding to each vector iteration. Examples of + /// uniform instructions include pointer operands of consecutive or + /// interleaved memory accesses. Note that although uniformity implies an + /// instruction will be scalar, the reverse is not true. In general, a + /// scalarized instruction will be represented by VF scalar values in the + /// vectorized loop, each corresponding to an iteration of the original + /// scalar loop. + void collectLoopUniforms(unsigned VF); + + /// Collect the instructions that are scalar after vectorization. An + /// instruction is scalar if it is known to be uniform or will be scalarized + /// during vectorization. Non-uniform scalarized instructions will be + /// represented by VF values in the vectorized loop, each corresponding to an + /// iteration of the original scalar loop. + void collectLoopScalars(unsigned VF); + + /// Collect Uniform and Scalar values for the given \p VF. + /// The sets depend on CM decision for Load/Store instructions + /// that may be vectorized as interleave, gather-scatter or scalarized. + void collectUniformsAndScalars(unsigned VF) { + // Do the analysis once. + if (VF == 1 || Uniforms.count(VF)) + return; + setCostBasedWideningDecision(VF); + collectLoopUniforms(VF); + collectLoopScalars(VF); + } + + /// Keeps cost model decisions for instructions. + /// Right now it is used for memory instructions only. + typedef DenseMap, InstWidening> + DecisionList; + + DecisionList WideningDecisions; + public: /// The loop that we evaluate. Loop *TheLoop; @@ -2224,7 +2309,7 @@ } bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { - return Legal->isScalarAfterVectorization(I) || + return Cost->isScalarAfterVectorization(I, VF) || Cost->isProfitableToScalarize(I, VF); } @@ -2400,7 +2485,7 @@ // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Legal->isUniformAfterVectorization(cast(EntryVal)) ? 1 : VF; + Cost->isUniformAfterVectorization(cast(EntryVal), VF) ? 1 : VF; // Compute the scalar steps and save the results in VectorLoopValueMap. ScalarParts Entry(UF); @@ -2468,7 +2553,7 @@ // known to be uniform after vectorization, this corresponds to lane zero // of the last unroll iteration. 
Otherwise, the last instruction is the one // we created for the last vector lane of the last unroll iteration. - unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1; + unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; auto *LastInst = cast(getScalarValue(V, UF - 1, LastLane)); // Set the insert point after the last scalarized instruction. This ensures @@ -2485,7 +2570,7 @@ // VectorLoopValueMap, we will only generate the insertelements once. for (unsigned Part = 0; Part < UF; ++Part) { Value *VectorValue = nullptr; - if (Legal->isUniformAfterVectorization(I)) { + if (Cost->isUniformAfterVectorization(I, VF)) { VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0)); } else { VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); @@ -2514,8 +2599,9 @@ if (OrigLoop->isLoopInvariant(V)) return V; - assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast(V)) - : true && "Uniform values only have lane zero"); + assert(Lane > 0 ? + !Cost->isUniformAfterVectorization(cast(V), VF) + : true && "Uniform values only have lane zero"); // If the value from the original loop has not been vectorized, it is // represented by UF x VF scalar values in the new loop. Return the requested @@ -2815,8 +2901,11 @@ assert((LI || SI) && "Invalid Load/Store instruction"); - // Try to vectorize the interleave group if this access is interleaved. - if (Legal->isAccessInterleaved(Instr)) + LoopVectorizationCostModel::InstWidening Decision = + Cost->getWideningDecision(Instr, VF); + assert(Decision != LoopVectorizationCostModel::CM_DECISION_NONE && + "CM decision should be taken at this point"); + if (Decision == LoopVectorizationCostModel::CM_DECISION_INTERLEAVE) return vectorizeInterleaveGroup(Instr); Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); @@ -2831,17 +2920,15 @@ unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); // Scalarize the memory instruction if necessary. - if (Legal->memoryInstructionMustBeScalarized(Instr, VF)) + if (Decision == LoopVectorizationCostModel::CM_DECISION_SCALARIZE) return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr)); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; - - // Determine if either a gather or scatter operation is legal. bool CreateGatherScatter = - !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr); + (Decision == LoopVectorizationCostModel::CM_DECISION_GATHER_SCATTER); VectorParts VectorGep; @@ -3026,7 +3113,7 @@ // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF; + unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF; // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { @@ -4637,7 +4724,7 @@ // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF; + unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; // These are the scalar results. Notice that we don't generate vector GEPs // because scalar GEPs result in better code. 
ScalarParts Entry(UF); @@ -5141,12 +5228,6 @@ if (UseInterleaved) InterleaveInfo.analyzeInterleaving(*getSymbolicStrides()); - // Collect all instructions that are known to be uniform after vectorization. - collectLoopUniforms(); - - // Collect all instructions that are known to be scalar after vectorization. - collectLoopScalars(); - unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; @@ -5412,10 +5493,18 @@ return true; } -void LoopVectorizationLegality::collectLoopScalars() { +void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { + + // We should not collect Scalars more than once per VF. Right now, + // this function is called from collectUniformsAndScalars(), which + // already does this check. Collecting Scalars for VF=1 does not make any + // sense. + + assert(VF >= 2 && !Scalars.count(VF) && + "This function should not be visited twice for the same VF"); // If an instruction is uniform after vectorization, it will remain scalar. - Scalars.insert(Uniforms.begin(), Uniforms.end()); + Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); // Collect the getelementptr instructions that will not be vectorized. A // getelementptr instruction is only vectorized if it is used for a legal @@ -5423,21 +5512,21 @@ for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { if (auto *GEP = dyn_cast(&I)) { - Scalars.insert(GEP); + Scalars[VF].insert(GEP); continue; } auto *Ptr = getPointerOperand(&I); if (!Ptr) continue; auto *GEP = getGEPInstruction(Ptr); - if (GEP && isLegalGatherOrScatter(&I)) - Scalars.erase(GEP); + if (GEP && getWideningDecision(&I, VF) == CM_DECISION_GATHER_SCATTER) + Scalars[VF].erase(GEP); } // An induction variable will remain scalar if all users of the induction // variable and induction variable update remain scalar. auto *Latch = TheLoop->getLoopLatch(); - for (auto &Induction : *getInductionVars()) { + for (auto &Induction : *Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast(Ind->getIncomingValueForBlock(Latch)); @@ -5445,7 +5534,7 @@ // vectorization. auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool { auto *I = cast(U); - return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I); + return I == IndUpdate || !TheLoop->contains(I) || Scalars[VF].count(I); }); if (!ScalarInd) continue; @@ -5454,22 +5543,23 @@ // scalar after vectorization. auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { auto *I = cast(U); - return I == Ind || !TheLoop->contains(I) || Scalars.count(I); + return I == Ind || !TheLoop->contains(I) || Scalars[VF].count(I); }); if (!ScalarIndUpdate) continue; // The induction variable and its update instruction will remain scalar. 
- Scalars.insert(Ind); - Scalars.insert(IndUpdate); + Scalars[VF].insert(Ind); + Scalars[VF].insert(IndUpdate); } } -bool LoopVectorizationLegality::hasConsecutiveLikePtrOperand(Instruction *I) { - if (isAccessInterleaved(I)) +bool LoopVectorizationCostModel::hasConsecutiveLikePtrOperand(Instruction *I, + unsigned VF) { + if (VF > 1 && getWideningDecision(I, VF) == CM_DECISION_INTERLEAVE) return true; if (auto *Ptr = getPointerOperand(I)) - return isConsecutivePtr(Ptr); + return Legal->isConsecutivePtr(Ptr); return false; } @@ -5490,14 +5580,8 @@ return false; } -bool LoopVectorizationLegality::memoryInstructionMustBeScalarized( - Instruction *I, unsigned VF) { - - // If the memory instruction is in an interleaved group, it will be - // vectorized and its pointer will remain uniform. - if (isAccessInterleaved(I)) - return false; - +bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I, + unsigned VF) { // Get and ensure we have a valid memory instruction. LoadInst *LI = dyn_cast(I); StoreInst *SI = dyn_cast(I); @@ -5507,31 +5591,40 @@ // will be scalarized. auto *Ptr = getPointerOperand(I); if (LI && isUniform(Ptr)) - return true; + return false; - // If the pointer operand is non-consecutive and neither a gather nor a - // scatter operation is legal, the memory instruction will be scalarized. - if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I)) - return true; + if (!isConsecutivePtr(Ptr)) + return false; // If the instruction is a store located in a predicated block, it will be // scalarized. if (isScalarWithPredication(I)) - return true; + return false; // If the instruction's allocated size doesn't equal it's type size, it // requires padding and will be scalarized. auto &DL = I->getModule()->getDataLayout(); auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); if (hasIrregularType(ScalarTy, DL, VF)) - return true; + return false; - // Otherwise, the memory instruction should be vectorized if the rest of the - // loop is. - return false; + return true; } -void LoopVectorizationLegality::collectLoopUniforms() { +void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { + + // We should not collect Uniforms more than once per VF. Right now, + // this function is called from collectUniformsAndScalars(), which + // already does this check. Collecting Uniforms for VF=1 does not make any + // sense. + + assert(VF >= 2 && !Uniforms.count(VF) && + "This function should not be visited twice for the same VF"); + + // Visit the list of Uniforms. If we'll not find any uniform value, we'll + // not analyze again. Uniforms.count(VF) will return 1. + Uniforms[VF].clear(); + // We now know that the loop is vectorizable! // Collect instructions inside the loop that will remain uniform after // vectorization. @@ -5591,12 +5684,13 @@ // instruction other than a memory access, we're not going to check if // that other instruction may be scalarized here. Thus, conservatively // assume the pointer operand may be non-uniform. - if (!UsersAreMemAccesses || memoryInstructionMustBeScalarized(&I)) + if (!UsersAreMemAccesses || + getWideningDecision(&I, VF) == CM_DECISION_SCALARIZE) PossibleNonUniformPtrs.insert(Ptr); // If the memory instruction will be vectorized and its pointer operand // is consecutive-like, the pointer operand should remain uniform. 
- else if (hasConsecutiveLikePtrOperand(&I)) + else if (hasConsecutiveLikePtrOperand(&I, VF)) ConsecutiveLikePtrs.insert(Ptr); // Otherwise, if the memory instruction will be vectorized and its @@ -5639,7 +5733,8 @@ // Returns true if Ptr is the pointer operand of a memory access instruction // I, and I is known to not require scalarization. auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { - return getPointerOperand(I) == Ptr && !memoryInstructionMustBeScalarized(I); + return getPointerOperand(I) == Ptr && + getWideningDecision(I, VF) != CM_DECISION_SCALARIZE; }; // For an instruction to be added into Worklist above, all its users inside @@ -5648,7 +5743,7 @@ // nodes separately. An induction variable will remain uniform if all users // of the induction variable and induction variable update remain uniform. // The code below handles both pointer and non-pointer induction variables. - for (auto &Induction : Inductions) { + for (auto &Induction : *Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast(Ind->getIncomingValueForBlock(Latch)); @@ -5679,7 +5774,7 @@ DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n"); } - Uniforms.insert(Worklist.begin(), Worklist.end()); + Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationLegality::canVectorizeMemory() { @@ -6192,6 +6287,8 @@ DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); Factor.Width = UserVF; + + collectUniformsAndScalars(UserVF); collectInstsToScalarize(UserVF); return Factor; } @@ -6558,12 +6655,13 @@ MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); continue; } - + collectUniformsAndScalars(VFs[j]); // Count the number of live intervals. unsigned RegUsage = 0; for (auto Inst : OpenIntervals) { // Skip ignored values for VF > 1. - if (VecValuesToIgnore.count(Inst)) + if (VecValuesToIgnore.count(Inst) || + isScalarAfterVectorization(Inst, VFs[j])) continue; RegUsage += GetRegUsage(Inst->getType(), VFs[j]); } @@ -6632,7 +6730,7 @@ Instruction *PredInst, DenseMap &ScalarCosts, unsigned VF) { - assert(!Legal->isUniformAfterVectorization(PredInst) && + assert(!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"); // Initialize the discount to zero, meaning that the scalar version and the @@ -6653,7 +6751,7 @@ // already be scalar to avoid traversing chains that are unlikely to be // beneficial. if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || - Legal->isScalarAfterVectorization(I)) + isScalarAfterVectorization(I, VF)) return false; // If the instruction is scalar with predication, it will be analyzed @@ -6673,7 +6771,7 @@ // the lane zero values for uniforms rather than asserting. for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) - if (Legal->isUniformAfterVectorization(J)) + if (isUniformAfterVectorization(J, VF)) return false; // Otherwise, we can scalarize the instruction. @@ -6686,7 +6784,7 @@ // and their return values are inserted into vectors. Thus, an extract would // still be required. auto needsExtract = [&](Instruction *I) -> bool { - return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I); + return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF); }; // Compute the expected cost discount from scalarizing the entire expression @@ -6749,6 +6847,9 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy Cost; + // Collect Uniform and Scalar instructions after vectorization with VF. 
+ collectUniformsAndScalars(VF); + // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. collectInstsToScalarize(VF); @@ -6828,11 +6929,83 @@ Legal->hasStride(I->getOperand(1)); } +unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, + unsigned VF) { + StoreInst *SI = dyn_cast(I); + LoadInst *LI = dyn_cast(I); + Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType()); + auto SE = PSE.getSE(); + + unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); + unsigned AS = + SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace(); + Value *Ptr = getPointerOperand(I); + Type *PtrTy = ToVectorTy(Ptr->getType(), VF); + + // Figure out whether the access is strided and get the stride value + // if it's known in compile time + const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop); + + // Get the cost of the scalar memory instruction and address computation. + unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + + Cost += VF * + TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, + AS); + + // Get the overhead of the extractelement and insertelement instructions + // we might create due to scalarization. + Cost += getScalarizationOverhead(I, VF, TTI); + + // If we have a predicated store, it may not be executed for each vector + // lane. Scale the cost by the probability of executing the predicated + // block. + if (Legal->isScalarWithPredication(I)) + Cost /= getReciprocalPredBlockProb(); + + return Cost; +} + +unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, + unsigned VF) { + StoreInst *SI = dyn_cast(I); + LoadInst *LI = dyn_cast(I); + Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType()); + Type *VectorTy = ToVectorTy(ValTy, VF); + unsigned AS = + SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace(); + + auto Group = Legal->getInterleavedAccessGroup(I); + assert(Group && "Fail to get an interleaved access group."); + + unsigned InterleaveFactor = Group->getFactor(); + Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); + + // Holds the indices of existing members in an interleaved load group. + // An interleaved store group doesn't need this as it doesn't allow gaps. + SmallVector Indices; + if (LI) { + for (unsigned i = 0; i < InterleaveFactor; i++) + if (Group->getMember(i)) + Indices.push_back(i); + } + + // Calculate the cost of the whole interleaved group. + unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy, + Group->getFactor(), Indices, + Group->getAlignment(), AS); + + if (Group->isReverse()) + Cost += Group->getNumMembers() * + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; +} + LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // If we know that this instruction will remain uniform, check the cost of // the scalar version. - if (Legal->isUniformAfterVectorization(I)) + if (isUniformAfterVectorization(I, VF)) VF = 1; if (VF > 1 && isProfitableToScalarize(I, VF)) @@ -6846,6 +7019,92 @@ return VectorizationCostTy(C, TypeNotScalarized); } +void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { + if (VF == 1) + return; + for (BasicBlock *BB : TheLoop->blocks()) { + VectorizationCostTy BlockCost; + + // For each instruction in the old loop. 
+ for (Instruction &I : *BB) { + if (I.getOpcode() != Instruction::Store && + I.getOpcode() != Instruction::Load) + continue; + + StoreInst *SI = dyn_cast(&I); + LoadInst *LI = dyn_cast(&I); + + Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType()); + Type *VectorTy = ToVectorTy(ValTy, VF); + + unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); + Value *Ptr = getPointerOperand(&I); + + if (LI && Legal->isUniform(Ptr)) { + // Scalar load + broadcast + setWideningDecision(&I, VF, CM_DECISION_SCALARIZE); + continue; + } + + // We assume that widening is the best solution when possible. + if (Legal->memoryInstructionCanBeWidened(&I, VF)) { + setWideningDecision(&I, VF, CM_DECISION_WIDEN); + continue; + } + + // Choose between Interleaving, Gather/Scatter or Scalarization. + unsigned InterleaveGrpCost = 0; + unsigned InterleaveInstCost = (unsigned)(-1); + unsigned GatherScatterCost = (unsigned)(-1); + if (Legal->isAccessInterleaved(&I)) { + auto Group = Legal->getInterleavedAccessGroup(&I); + assert(Group && "Fail to get an interleaved access group."); + + // Make one decision for the whole group. + if (Group->getInsertPos() != &I) + continue; + + InterleaveGrpCost = getInterleaveGroupCost(&I, VF); + + if (InterleaveGrpCost / Group->getNumMembers() <= 1) { + // The interleaving cost is good enough. + setWideningDecision(Group, VF, CM_DECISION_INTERLEAVE); + continue; + } + // We are going to check all other options. + InterleaveInstCost = InterleaveGrpCost / Group->getNumMembers(); + } + + if (Legal->isLegalGatherOrScatter(&I)) { + GatherScatterCost = + TTI.getAddressComputationCost(VectorTy) + + TTI.getGatherScatterOpCost(I.getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(&I), Alignment); + } + unsigned ScalarizationCost = getMemInstScalarizationCost(&I, VF); + + // Choose better solution for the current VF, + // write down this decision and use it during vectorization. + InstWidening Decision; + if (InterleaveInstCost <= GatherScatterCost && + InterleaveInstCost <= ScalarizationCost) + Decision = CM_DECISION_INTERLEAVE; + else if (GatherScatterCost < InterleaveInstCost && + GatherScatterCost < ScalarizationCost) + Decision = CM_DECISION_GATHER_SCATTER; + else + Decision = CM_DECISION_SCALARIZE; + + // If the instructions belongs to an interleave group, the whole group + // receives the same decision. + if (auto Group = Legal->getInterleavedAccessGroup(&I)) + setWideningDecision(Group, VF, Decision); + else + setWideningDecision(&I, VF, Decision); + } + } +} + unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy) { @@ -7000,12 +7259,15 @@ Cost += TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, AS); return Cost + - TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, ValTy); + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); } // For an interleaved access, calculate the total cost of the whole // interleave group. 
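The loop above picks, for each memory access, the cheapest of interleaving, gather/scatter, and scalarization, with ties broken in favor of interleaving. The following standalone C++ sketch (not part of the patch; the costs are plain inputs rather than TTI queries) restates that selection logic.

#include <cstdio>

// Hypothetical, simplified mirror of the selection in
// setCostBasedWideningDecision(): choose how to vectorize one memory access.
enum class InstWidening { Interleave, GatherScatter, Scalarize };

InstWidening chooseWidening(unsigned InterleaveInstCost,
                            unsigned GatherScatterCost,
                            unsigned ScalarizationCost) {
  // Ties favor interleaving, matching the <= / < comparisons in the patch.
  if (InterleaveInstCost <= GatherScatterCost &&
      InterleaveInstCost <= ScalarizationCost)
    return InstWidening::Interleave;
  if (GatherScatterCost < InterleaveInstCost &&
      GatherScatterCost < ScalarizationCost)
    return InstWidening::GatherScatter;
  return InstWidening::Scalarize;
}

int main() {
  // A gather that is strictly cheaper than both alternatives is selected.
  bool PicksGather = chooseWidening(/*Interleave=*/10, /*GatherScatter=*/4,
                                    /*Scalarize=*/8) ==
                     InstWidening::GatherScatter;
  std::printf("gather chosen: %d\n", PicksGather);
}

Note that with these comparisons gather/scatter must be strictly cheaper than both alternatives; an exact tie falls back to interleaving or scalarization.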
- if (Legal->isAccessInterleaved(I)) { + InstWidening Decision = getWideningDecision(I, VF); + assert(Decision != CM_DECISION_NONE && + "Widening decision should be taken at this moment"); + if (Decision == CM_DECISION_INTERLEAVE) { auto Group = Legal->getInterleavedAccessGroup(I); assert(Group && "Fail to get an interleaved access group."); @@ -7037,14 +7299,11 @@ Group->getNumMembers() * TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); - // FIXME: The interleaved load group with a huge gap could be even more - // expensive than scalar operations. Then we could ignore such group and - // use scalar operations instead. return Cost; } // Check if the memory instruction will be scalarized. - if (Legal->memoryInstructionMustBeScalarized(I, VF)) { + if (Decision == CM_DECISION_SCALARIZE) { unsigned Cost = 0; Type *PtrTy = ToVectorTy(Ptr->getType(), VF); @@ -7074,14 +7333,9 @@ // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); - bool Reverse = ConsecutiveStride < 0; - - // Determine if either a gather or scatter operation is legal. - bool UseGatherOrScatter = - !ConsecutiveStride && Legal->isLegalGatherOrScatter(I); unsigned Cost = TTI.getAddressComputationCost(VectorTy); - if (UseGatherOrScatter) { + if (Decision == CM_DECISION_GATHER_SCATTER) { assert(ConsecutiveStride == 0 && "Gather/Scatter are not used for consecutive stride"); return Cost + @@ -7089,12 +7343,15 @@ Legal->isMaskRequired(I), Alignment); } // Wide load/stores. + assert(Decision == CM_DECISION_WIDEN && ConsecutiveStride != 0 && + "Unexpected CM decision to handle"); if (Legal->isMaskRequired(I)) Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); else Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + bool Reverse = ConsecutiveStride < 0; if (Reverse) Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); return Cost; @@ -7200,19 +7457,6 @@ SmallPtrSetImpl &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } - - // Insert values known to be scalar into VecValuesToIgnore. This is a - // conservative estimation of the values that will later be scalarized. - // - // FIXME: Even though an instruction is not scalar-after-vectoriztion, it may - // still be scalarized. For example, we may find an instruction to be - // more profitable for a given vectorization factor if it were to be - // scalarized. But at this point, we haven't yet computed the - // vectorization factor. - for (auto *BB : TheLoop->getBlocks()) - for (auto &I : *BB) - if (Legal->isScalarAfterVectorization(&I)) - VecValuesToIgnore.insert(&I); } void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, Index: ../test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll =================================================================== --- ../test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll +++ ../test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll @@ -0,0 +1,37 @@ +; REQUIRES: asserts +; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s + +; This test shows extremely high interleaving cost that, probably, should be fixed. +; Due to the high cost, interleaving is not beneficial and the cost model chooses to scalarize +; the load instructions. 
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%pair = type { i8, i8 }
+
+; CHECK-LABEL: test
+; CHECK: Found an estimated cost of 10 for VF 2 For instruction: {{.*}} load i8
+; CHECK: vector.body
+; CHECK: load i8
+; CHECK: load i8
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @test(%pair* %p, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, %pair* %p, i64 %i, i32 0
+  %tmp1 = load i8, i8* %tmp0, align 1
+  %tmp2 = getelementptr %pair, %pair* %p, i64 %i, i32 1
+  %tmp3 = load i8, i8* %tmp2, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
Index: ../test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
===================================================================
--- ../test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ ../test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -18,8 +18,9 @@
 ; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
 ; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
 ; CHECK: vector.body:
+; CHECK: %index = phi i64
 ; CHECK: %vec.ind = phi <16 x i64>
-; CHECK: %[[T0:.+]] = extractelement <16 x i64> %vec.ind, i32 0
+; CHECK: %[[T0:.+]] = mul i64 %index, 5
 ; CHECK: %[[T1:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %[[T0]]
 ; CHECK: %[[T2:.+]] = bitcast float* %[[T1]] to <80 x float>*
 ; CHECK: load <80 x float>, <80 x float>* %[[T2]], align 4
Index: ../test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
===================================================================
--- ../test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
+++ ../test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
@@ -0,0 +1,41 @@
+; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test checks that the "gather" operation is chosen since its cost is
+; better than that of the interleaving pattern.
+;
+;unsigned long A[SIZE];
+;unsigned long B[SIZE];
+;
+;void foo() {
+;  for (int i=0; i:1: ; preds = %0, %1
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %indvars.iv
+  %3 = load i64, i64* %2, align 16
+  %4 = add i64 %3, 5
+  %5 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+  store i64 %4, i64* %5, align 16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
+  %6 = icmp slt i64 %indvars.iv.next, 1024
+  br i1 %6, label %1, label %7
+
+;