diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -29,6 +29,7 @@ class PredicatedScalarEvolution; class ScalarEvolution; class SCEV; +class StoreInst; /// These are the kinds of recurrences that we support. enum class RecurKind { @@ -69,14 +70,14 @@ public: RecurrenceDescriptor() = default; - RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurKind K, - FastMathFlags FMF, Instruction *ExactFP, Type *RT, - bool Signed, bool Ordered, + RecurrenceDescriptor(Value *Start, Instruction *Exit, StoreInst *Store, + RecurKind K, FastMathFlags FMF, Instruction *ExactFP, + Type *RT, bool Signed, bool Ordered, SmallPtrSetImpl<Instruction *> &CI, unsigned MinWidthCastToRecurTy) - : StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF), - ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed), - IsOrdered(Ordered), + : IntermediateStore(Store), StartValue(Start), LoopExitInstr(Exit), + Kind(K), FMF(FMF), ExactFPMathInst(ExactFP), RecurrenceType(RT), + IsSigned(Signed), IsOrdered(Ordered), MinWidthCastToRecurrenceType(MinWidthCastToRecurTy) { CastInsts.insert(CI.begin(), CI.end()); } @@ -163,22 +164,21 @@ /// RecurrenceDescriptor. If either \p DB is non-null or \p AC and \p DT are /// non-null, the minimal bit width needed to compute the reduction will be /// computed. - static bool AddReductionVar(PHINode *Phi, RecurKind Kind, Loop *TheLoop, - FastMathFlags FuncFMF, - RecurrenceDescriptor &RedDes, - DemandedBits *DB = nullptr, - AssumptionCache *AC = nullptr, - DominatorTree *DT = nullptr); + static bool + AddReductionVar(PHINode *Phi, RecurKind Kind, Loop *TheLoop, + FastMathFlags FuncFMF, RecurrenceDescriptor &RedDes, + DemandedBits *DB = nullptr, AssumptionCache *AC = nullptr, + DominatorTree *DT = nullptr, ScalarEvolution *SE = nullptr); /// Returns true if Phi is a reduction in TheLoop. The RecurrenceDescriptor /// is returned in RedDes. If either \p DB is non-null or \p AC and \p DT are /// non-null, the minimal bit width needed to compute the reduction will be - /// computed. - static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, - RecurrenceDescriptor &RedDes, - DemandedBits *DB = nullptr, - AssumptionCache *AC = nullptr, - DominatorTree *DT = nullptr); + /// computed. If \p SE is non-null, store instructions to loop invariant + /// addresses are processed. + static bool + isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, + DemandedBits *DB = nullptr, AssumptionCache *AC = nullptr, + DominatorTree *DT = nullptr, ScalarEvolution *SE = nullptr); /// Returns true if Phi is a first-order recurrence. A first-order recurrence /// is a non-reduction recurrence relation in which the value of the @@ -270,6 +270,11 @@ cast<IntrinsicInst>(I)->getIntrinsicID() == Intrinsic::fmuladd; } + /// Reductions may store a temporary or final result to an invariant address. + /// If there is such a store in the loop then, after a successful run of the + /// AddReductionVar method, this field will be assigned the last store + /// encountered. + StoreInst *IntermediateStore = nullptr; + private: // The starting value of the recurrence. // It does not have to be zero!
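For context, a minimal C-style sketch of the pattern the new IntermediateStore field describes. It mirrors the @reduc_store test added later in this patch; the function and variable names are illustrative only, not part of the patch:

  /* 'sum' is an integer add reduction; every iteration also stores the
     running value to the loop-invariant address &dst[42]. Only the value
     stored on the last iteration is observable after the loop, so the
     vectorizer may sink a single final store past the loop. */
  void reduc_store(int *dst, const int *src) {
    int sum = 0;
    for (int i = 0; i < 1000; i++) {
      sum += src[i];
      dst[42] = sum; /* intermediate store to an invariant address */
    }
  }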
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -575,6 +575,11 @@ return HasDependenceInvolvingLoopInvariantAddress; } + /// Return the list of stores to invariant addresses. + const ArrayRef<StoreInst *> getStoresToInvariantAddresses() const { + return StoresToInvariantAddresses; + } + /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts /// them to a more usable form. All SCEV expressions during the analysis /// should be re-written (and therefore simplified) according to PSE. @@ -634,6 +639,9 @@ /// Indicator that there are non vectorizable stores to a uniform address. bool HasDependenceInvolvingLoopInvariantAddress = false; + /// List of stores to invariant addresses. + SmallVector<StoreInst *> StoresToInvariantAddresses; + /// The diagnostics report generated for the analysis. E.g. why we /// couldn't analyze the loop. std::unique_ptr Report; diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -308,6 +308,14 @@ /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } + /// Returns true if the given store is the final invariant store of one of + /// the reductions found in the loop. + bool isInvariantStoreOfReduction(StoreInst *SI); + + /// Returns true if the given address is invariant and is used to store a + /// recurrent expression. + bool isInvariantAddressOfReduction(Value *V); + /// Returns True if V is a Phi node of an induction variable in this loop. bool isInductionPhi(const Value *V) const; diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -227,12 +227,10 @@ return true; } -bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, - Loop *TheLoop, FastMathFlags FuncFMF, - RecurrenceDescriptor &RedDes, - DemandedBits *DB, - AssumptionCache *AC, - DominatorTree *DT) { +bool RecurrenceDescriptor::AddReductionVar( + PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF, + RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC, + DominatorTree *DT, ScalarEvolution *SE) { if (Phi->getNumIncomingValues() != 2) return false; @@ -249,6 +247,12 @@ // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). Instruction *ExitInstruction = nullptr; + + // Variable to keep track of the last visited store instruction. By the end + // of the algorithm this variable will either be null or hold the store of an + // intermediate reduction value to an invariant address. + StoreInst *IntermediateStore = nullptr; + // Indicates that we found a reduction operation in our scan. bool FoundReduxOp = false; @@ -314,6 +318,10 @@ // - By instructions outside of the loop (safe). // * One value may have several outside users, but all outside // uses must be of the same value. + // - By store instructions with a loop invariant address (safe with + // the following restrictions): + // * If there are several stores, all must have the same address. + // * The final value should be stored in that loop invariant address. // - By an instruction that is not part of the reduction (not safe).
// This is either: // * An instruction type other than PHI or the reduction operation. @@ -321,6 +329,43 @@ while (!Worklist.empty()) { Instruction *Cur = Worklist.pop_back_val(); + // Store instructions are allowed only if they store the reduction + // value to the same loop invariant memory location. + if (auto *SI = dyn_cast<StoreInst>(Cur)) { + if (!SE) { + LLVM_DEBUG(dbgs() << "Store instructions are not processed without " + << "Scalar Evolution Analysis\n"); + return false; + } + + const SCEV *PtrScev = SE->getSCEV(SI->getPointerOperand()); + // Check that it is the same address as in previous stores. + if (IntermediateStore) { + const SCEV *OtherScev = + SE->getSCEV(IntermediateStore->getPointerOperand()); + + if (OtherScev != PtrScev) { + LLVM_DEBUG(dbgs() << "Storing reduction value to different addresses " + << "inside the loop: " << *SI->getPointerOperand() + << " and " + << *IntermediateStore->getPointerOperand() << '\n'); + return false; + } + } + + // Check that the pointer is loop invariant. + if (!SE->isLoopInvariant(PtrScev, TheLoop)) { + LLVM_DEBUG(dbgs() << "Storing reduction value to non-uniform address " + << "inside the loop: " << *SI->getPointerOperand() + << '\n'); + return false; + } + + // IntermediateStore is always the last store in the loop. + IntermediateStore = SI; + continue; + } + // No Users. // If the instruction has no users then this is a broken chain and can't be // a reduction variable. @@ -443,10 +488,17 @@ // reductions which are represented as a cmp followed by a select. InstDesc IgnoredVal(false, nullptr); if (VisitedInsts.insert(UI).second) { - if (isa<PHINode>(UI)) + if (isa<PHINode>(UI)) { PHIs.push_back(UI); - else + } else { + StoreInst *SI = dyn_cast<StoreInst>(UI); + if (SI && SI->getPointerOperand() == Cur) { + // The reduction variable chain can only be stored somewhere, but it + // can't be used as an address. + return false; + } NonPHIs.push_back(UI); + } } else if (!isa(UI) && ((!isa(UI) && !isa(UI) && !isa(UI)) || @@ -474,6 +526,32 @@ if (isSelectCmpRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1) return false; + if (IntermediateStore) { + // Check that the stored value goes to the phi node again. This way we make + // sure that the value stored in IntermediateStore is indeed the final + // reduction value. + if (!is_contained(Phi->operands(), IntermediateStore->getValueOperand())) { + LLVM_DEBUG(dbgs() << "Not a final reduction value stored: " + << *IntermediateStore << '\n'); + return false; + } + + // If there is an exit instruction, its value should be stored in + // IntermediateStore. + if (ExitInstruction && + IntermediateStore->getValueOperand() != ExitInstruction) { + LLVM_DEBUG(dbgs() << "Last store Instruction of reduction value does not " + "store last calculated value of the reduction: " + << *IntermediateStore << '\n'); + return false; + } + + // If all uses are inside the loop (intermediate stores), then the + // reduction value after the loop will be the one used in the last store. + if (!ExitInstruction) + ExitInstruction = cast<Instruction>(IntermediateStore->getValueOperand()); + } + if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; @@ -535,9 +613,9 @@ // is saved as part of the RecurrenceDescriptor. // Save the description of this reduction variable.
- RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF, ExactFPMathInst, - RecurrenceType, IsSigned, IsOrdered, CastInsts, - MinWidthCastToRecurrenceType); + RecurrenceDescriptor RD(RdxStart, ExitInstruction, IntermediateStore, Kind, + FMF, ExactFPMathInst, RecurrenceType, IsSigned, + IsOrdered, CastInsts, MinWidthCastToRecurrenceType); RedDes = RD; return true; @@ -761,7 +839,8 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC, - DominatorTree *DT) { + DominatorTree *DT, + ScalarEvolution *SE) { BasicBlock *Header = TheLoop->getHeader(); Function &F = *Header->getParent(); FastMathFlags FMF; @@ -770,72 +849,85 @@ FMF.setNoSignedZeros( F.getFnAttribute("no-signed-zeros-fp-math").getValueAsBool()); - if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a SMAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a SMIN reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a UMAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a UMIN reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, RedDes, DB, AC, - DT)) { + DT, SE)) { LLVM_DEBUG(dbgs() << "Found an integer conditional select reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." 
<< *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a float MAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a float MIN reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, RedDes, DB, AC, - DT)) { + DT, SE)) { LLVM_DEBUG(dbgs() << "Found a float conditional select reduction PHI." << " PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, - DT)) { + if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n"); return true; } diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1993,9 +1993,12 @@ for (StoreInst *ST : Stores) { Value *Ptr = ST->getPointerOperand(); - if (isUniform(Ptr)) + if (isUniform(Ptr)) { + // Record store instructions to loop invariant addresses + StoresToInvariantAddresses.push_back(ST); HasDependenceInvolvingLoopInvariantAddress |= !UniformStores.insert(Ptr).second; + } // If we did *not* see this pointer before, insert it to the read-write // list. At this phase it is only a 'write' list. 
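To make the new checks in AddReductionVar above concrete, here is a hedged C-style sketch of one loop that satisfies them and one that fails the final-reduction-value check; both correspond to the @reduc_store and @reduc_store_not_final_value tests in reduction-with-invariant-store.ll further down, and the names are illustrative only:

  /* Accepted: the stored value is the reduction value that feeds the PHI
     again, so a single store of the final value can be sunk out of the loop. */
  void accepted(int *dst, const int *src) {
    int sum = 0;
    for (int i = 0; i < 1000; i++) {
      sum += src[i];
      dst[42] = sum;
    }
  }

  /* Rejected: 'sum + 1' is not an incoming value of the reduction PHI, so the
     is_contained(Phi->operands(), ...) check above fails and no reduction with
     an intermediate store is recognized. */
  void rejected(int *dst, const int *src) {
    int sum = 0;
    for (int i = 0; i < 1000; i++) {
      sum += src[i];
      dst[42] = sum + 1;
    }
  }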
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -441,6 +441,26 @@ return false; } +/// Returns true if A and B have the same pointer operand or the same address +/// SCEV. +static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A, + StoreInst *B) { + // Compare store instructions. + if (A == B) + return true; + + // Otherwise compare pointer operands. + Value *APtr = A->getPointerOperand(); + Value *BPtr = B->getPointerOperand(); + if (APtr == BPtr) + return true; + + // Otherwise compare address SCEVs. + if (SE->getSCEV(APtr) == SE->getSCEV(BPtr)) + return true; + + return false; +} + int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, Value *Ptr) const { const ValueToValueMap &Strides = @@ -678,7 +698,7 @@ RecurrenceDescriptor RedDes; if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, - DT)) { + DT, PSE.getSE())) { Requirements->addExactFPMathInst(RedDes.getExactFPMathInst()); AllowedExit.insert(RedDes.getLoopExitInstr()); Reductions[Phi] = RedDes; @@ -913,11 +933,66 @@ if (!LAI->canVectorizeMemory()) return false; - if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { - reportVectorizationFailure("Stores to a uniform address", - "write to a loop invariant address could not be vectorized", - "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); - return false; + // We can vectorize stores to an invariant address when the final reduction + // value is guaranteed to be stored at the end of the loop. Also, if the + // decision to vectorize the loop is made, runtime checks are added to make + // sure that the invariant address won't alias with any other objects. + if (!LAI->getStoresToInvariantAddresses().empty()) { + // For each invariant address, check that its last stored value is + // unconditional. + for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) { + if (isInvariantStoreOfReduction(SI) && + blockNeedsPredication(SI->getParent())) { + reportVectorizationFailure( + "We don't allow storing to uniform addresses", + "write of conditional recurring variant value to a loop " + "invariant address could not be vectorized", + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); + return false; + } + } + + if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { + // For each invariant address, check that its last stored value is the + // result of one of our reductions. + // + // We do not check whether dependences with loads exist because they are + // currently rejected earlier in LoopAccessInfo::analyzeLoop. In case this + // behaviour changes we have to modify this code. + ScalarEvolution *SE = PSE.getSE(); + SmallVector<StoreInst *> UnhandledStores; + for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) { + if (isInvariantStoreOfReduction(SI)) { + // Earlier stores to this address are effectively dead code. + // With opaque pointers it is possible for one pointer to be used with + // different sizes of stored values: + // store i32 0, ptr %x + // store i8 0, ptr %x + // The latest store doesn't completely overwrite the first one in the + // example. That is why we have to make sure that the types of the + // stored values are the same. + // TODO: Check that the bitwidth of the unhandled store is smaller + // than the one that overwrites it, and add a test.
+ erase_if(UnhandledStores, [SE, SI](StoreInst *I) { + return storeToSameAddress(SE, SI, I) && + I->getValueOperand()->getType() == + SI->getValueOperand()->getType(); + }); + continue; + } + UnhandledStores.push_back(SI); + } + + bool IsOK = UnhandledStores.empty(); + // TODO: we should also validate against InvariantMemSets. + if (!IsOK) { + reportVectorizationFailure( + "We don't allow storing to uniform addresses", + "write to a loop invariant address could not " + "be vectorized", + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); + return false; + } + } } Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); @@ -944,13 +1019,34 @@ // We can now only vectorize if all reductions with Exact FP math also // have the isOrdered flag set, which indicates that we can move the - // reduction operations in-loop. + // reduction operations in-loop, and do not have an intermediate store. return (all_of(getReductionVars(), [&](auto &Reduction) -> bool { const RecurrenceDescriptor &RdxDesc = Reduction.second; - return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered(); + return !RdxDesc.hasExactFPMath() || + (RdxDesc.isOrdered() && !RdxDesc.IntermediateStore); })); } +bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) { + return any_of(getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + return RdxDesc.IntermediateStore == SI; + }); +} + +bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) { + return any_of(getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + if (!RdxDesc.IntermediateStore) + return false; + + ScalarEvolution *SE = PSE.getSE(); + Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand(); + return V == InvariantAddress || + SE->getSCEV(V) == SE->getSCEV(InvariantAddress); + }); +} + bool LoopVectorizationLegality::isInductionPhi(const Value *V) const { Value *In0 = const_cast<Value *>(V); PHINode *PN = dyn_cast_or_null<PHINode>(In0); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3998,6 +3998,17 @@ // Set the resume value for this reduction ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); + // If there were stores of the reduction value to a uniform memory address + // inside the loop, create the final store here. + if (StoreInst *SI = RdxDesc.IntermediateStore) { + StoreInst *NewSI = + Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); + propagateMetadata(NewSI, SI); + + // If the reduction value is used in other places, + // then let the code below create PHIs for that. + } + // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. @@ -7340,6 +7351,16 @@ // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); + // Find all stores to invariant addresses. Since they are going to sink + // outside the loop, we do not need to calculate their cost. + for (BasicBlock *BB : TheLoop->blocks()) + for (Instruction &I : *BB) { + StoreInst *SI; + if ((SI = dyn_cast<StoreInst>(&I)) && + Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) + ValuesToIgnore.insert(&I); + } + // Ignore type-promoting instructions we identified during reduction // detection.
for (auto &Reduction : Legal->getReductionVars()) { @@ -8845,6 +8866,13 @@ continue; } + // Invariant stores inside loop will be deleted and a single store + // with the final reduction value will be added to the exit block + StoreInst *SI; + if ((SI = dyn_cast(&I)) && + Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) + continue; + // Otherwise, if all widening options failed, Instruction is to be // replicated. This may create a successor for VPBB. VPBasicBlock *NextVPBB = diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1420,6 +1420,9 @@ getCondOp()->printAsOperand(O, SlotTracker); } O << ")"; + if (RdxDesc->IntermediateStore) + O << " (with final reduction value stored in invariant address sank " + "outside of loop)"; } void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll @@ -320,10 +320,20 @@ ; ADD (with reduction stored in invariant address) -; CHECK-REMARK: loop not vectorized: value that could not be identified as reduction is used outside the loop +; CHECK-REMARK: vectorized loop (vectorization width: vscale x 4, interleaved count: 2) define void @invariant_store(i32* %dst, i32* readonly %src) { ; CHECK-LABEL: @invariant_store -; CHECK-NOT: vector.body +; CHECK: vector.body: +; CHECK: %[[LOAD1:.*]] = load +; CHECK: %[[LOAD2:.*]] = load +; CHECK: %[[ADD1:.*]] = add %{{.*}}, %[[LOAD1]] +; CHECK: %[[ADD2:.*]] = add %{{.*}}, %[[LOAD2]] +; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %[[ADD1]] +; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %[[ADD2]] +; CHECK: middle.block: +; CHECK: %[[ADD:.*]] = add %[[ADD2]], %[[ADD1]] +; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( %[[ADD]]) +; CHECK-NEXT: store i32 %[[SUM]], i32* %gep.dst, align 4 entry: %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 store i32 0, i32* %gep.dst, align 4 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll @@ -11,7 +11,23 @@ ; dst[42] = sum; ; } ; CHECK-LABEL: @reduc_store -; CHECK-NOT: vector.body +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !0 +; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: store i32 [[TMP6]], i32* [[GEP_DST:%.*]], align 4 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]] define void @reduc_store(i32* %dst, i32* readonly %src) { entry: %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 @@ -41,7 +57,14 @@ ; dst[42] = sum; ; } ; CHECK-LABEL: @reduc_store_fadd_fast -; CHECK-NOT: vector.body +; CHECK: vector.body: +; CHECK: phi <4 x float> +; CHECK: load <4 x float> +; CHECK: fadd fast <4 x float> +; CHECK-NOT: store float %{{[0-9]+}}, float* %gep.dst +; CHECK: middle.block: +; CHECK-NEXT: [[TMP:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32 +; CHECK-NEXT: store float %{{[0-9]+}}, float* %gep.dst define void @reduc_store_fadd_fast(float* %dst, float* readonly %src) { entry: %gep.dst = getelementptr inbounds float, float* %dst, i64 42 @@ -152,7 +175,55 @@ ; dst[42] = sum; ; } ; CHECK-LABEL: @reduc_store_inside_unrolled -; CHECK-NOT: vector.body +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !alias.scope !11 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !alias.scope !11 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !alias.scope !11 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP7]], align 4, !alias.scope !11 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP11]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* 
[[SRC]], i64 [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP19]], align 4, !alias.scope !11 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP21]], align 4, !alias.scope !11 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP23]], align 4, !alias.scope !11 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP25]], align 4, !alias.scope !11 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP27]], i32 1 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP28]], i32 2 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP29]], i32 3 +; CHECK-NEXT: [[TMP34]] = add <4 x i32> [[TMP33]], [[TMP16]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 +; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP34]]) +; CHECK-NEXT: store i32 [[TMP36]], i32* [[GEP_DST:%.*]], align 4 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 500, 500 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]] define void @reduc_store_inside_unrolled(i32* %dst, i32* readonly %src) { entry: %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 @@ -160,7 +231,7 @@ for.body: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %sum = phi i32 [ 0, %entry ], [ %sum.1, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %sum.2, %for.body ] %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv %0 = load i32, i32* %gep.src, align 4 %sum.1 = add nsw i32 %0, %sum @@ -178,6 +249,38 @@ ret void } +; Check that we cannot vectorize code if stored value is not the final reduction +; value +; +; int sum = 0; +; for(int i=0; i < 1000; i++) { +; sum += src[i]; +; dst[42] = sum + 1; +; } +; CHECK-LABEL: @reduc_store_not_final_value +; CHECK-NOT: vector.body: +define void @reduc_store_not_final_value(i32* %dst, i32* readonly %src) { +entry: + %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 + store i32 0, i32* %gep.dst, align 4 + br label %for.body + +for.body: + %sum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv + %0 = load i32, i32* %gep.src, align 4 + %add = add nsw i32 %sum, %0 + %sum_plus_one = add i32 %add, 1 + store i32 %sum_plus_one, i32* %gep.dst, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %exit, label %for.body + +exit: + ret void +} + ; We cannot vectorize if two (or more) invariant stores exist in a loop. 
; ; int sum = 0; @@ -197,7 +300,7 @@ for.body: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %sum = phi i32 [ 0, %entry ], [ %sum.1, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %sum.2, %for.body ] %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv %0 = load i32, i32* %arrayidx, align 4 %sum.1 = add nsw i32 %0, %sum @@ -224,7 +327,12 @@ ; dst[42] = sum; ; } ; CHECK-LABEL: @reduc_store_middle_store_predicated -; CHECK-NOT: vector.body +; CHECK: vector.body: +; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %gep.dst +; CHECK: middle.block: +; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.vector.reduce.add.v4i32 +; CHECK-NEXT: store i32 [[TMP]], i32* %gep.dst +; CHECK: ret void define void @reduc_store_middle_store_predicated(i32* %dst, i32* readonly %src) { entry: %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 @@ -299,6 +407,36 @@ ret void } +; Final reduction value is overwritten inside loop +; +; for(int i=0; i < 1000; i++) { +; sum += src[i]; +; dst[42] = sum; +; dst[42] = 0; +; } +; CHECK-LABEL: @reduc_store_final_store_overwritten +; CHECK-NOT: vector.body: +define void @reduc_store_final_store_overwritten(i32* %dst, i32* readonly %src) { +entry: + %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 + br label %for.body + +for.body: + %sum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv + %0 = load i32, i32* %gep.src, align 4 + %add = add nsw i32 %sum, %0 + store i32 %add, i32* %gep.dst, align 4 + store i32 0, i32* %gep.dst, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %exit, label %for.body + +exit: + ret void +} + ; Final value used outside of loop does not prevent vectorization ; ; int sum = 0; @@ -308,7 +446,16 @@ ; } ; dst[43] = sum; ; CHECK-LABEL: @reduc_store_inoutside -; CHECK-NOT: vector.body +; CHECK: vector.body: +; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %gep.src +; CHECK: middle.block: +; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.vector.reduce.add.v4i32 +; CHECK-NEXT: store i32 [[TMP]], i32* %gep.dst +; CHECK: exit: +; CHECK: [[PHI:%.*]] = phi i32 [ [[TMP1:%.*]], %for.body ], [ [[TMP2:%.*]], %middle.block ] +; CHECK: [[ADDR:%.*]] = getelementptr inbounds i32, i32* %dst, i64 43 +; CHECK: store i32 [[PHI]], i32* [[ADDR]] +; CHECK: ret void define void @reduc_store_inoutside(i32* %dst, i32* readonly %src) { entry: %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -146,6 +146,50 @@ ret float %red.next } +define void @print_reduction_with_invariant_store(i64 %n, float* noalias %y, float* noalias %dst) { +; CHECK-LABEL: Checking a loop in 'print_reduction_with_invariant_store' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next> +; CHECK-NEXT: vp<[[IV:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<0>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%y>, vp<[[IV]]> +; CHECK-NEXT: WIDEN 
ir<%lv> = load ir<%arrayidx> +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + fast reduce.fadd (ir<%lv>) (with final reduction value stored in invariant address sank outside of loop) +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF +(nuw) vp<[[CAN_IV]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]> vp<[[VEC_TC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] + %red = phi float [ %red.next, %for.body ], [ 0.0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %iv + %lv = load float, float* %arrayidx, align 4 + %red.next = fadd fast float %lv, %red + store float %red.next, float* %dst, align 4 + %iv.next = add i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + define void @print_replicate_predicated_phi(i64 %n, i64* %x) { ; CHECK-LABEL: Checking a loop in 'print_replicate_predicated_phi' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
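As a closing illustration of the new legality rule in LoopVectorizationLegality.cpp (the "write of conditional recurring variant value" rejection), here is a hedged C-style sketch of a loop that is still not vectorized because the reduction value is stored under a condition; the example is illustrative only and assumes the conditional store survives earlier simplification:

  /* Rejected: the store of the running reduction value is conditional, so its
     block needs predication and the legality check bails out before the
     invariant-store handling can apply. */
  void reduc_store_conditional(int *dst, const int *src) {
    int sum = 0;
    for (int i = 0; i < 1000; i++) {
      sum += src[i];
      if (src[i] > 0)
        dst[42] = sum;
    }
  }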