diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -33,6 +33,7 @@ class PredicatedScalarEvolution; class ScalarEvolution; class SCEV; +class StoreInst; class DominatorTree; /// These are the kinds of recurrences that we support. @@ -74,14 +75,14 @@ public: RecurrenceDescriptor() = default; - RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurKind K, - FastMathFlags FMF, Instruction *ExactFP, Type *RT, - bool Signed, bool Ordered, + RecurrenceDescriptor(Value *Start, Instruction *Exit, StoreInst *Store, + RecurKind K, FastMathFlags FMF, Instruction *ExactFP, + Type *RT, bool Signed, bool Ordered, SmallPtrSetImpl &CI, unsigned MinWidthCastToRecurTy) - : StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF), - ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed), - IsOrdered(Ordered), + : IntermediateStore(Store), StartValue(Start), LoopExitInstr(Exit), + Kind(K), FMF(FMF), ExactFPMathInst(ExactFP), RecurrenceType(RT), + IsSigned(Signed), IsOrdered(Ordered), MinWidthCastToRecurrenceType(MinWidthCastToRecurTy) { CastInsts.insert(CI.begin(), CI.end()); } @@ -168,22 +169,21 @@ /// RecurrenceDescriptor. If either \p DB is non-null or \p AC and \p DT are /// non-null, the minimal bit width needed to compute the reduction will be /// computed. - static bool AddReductionVar(PHINode *Phi, RecurKind Kind, Loop *TheLoop, - FastMathFlags FuncFMF, - RecurrenceDescriptor &RedDes, - DemandedBits *DB = nullptr, - AssumptionCache *AC = nullptr, - DominatorTree *DT = nullptr); + static bool + AddReductionVar(PHINode *Phi, RecurKind Kind, Loop *TheLoop, + FastMathFlags FuncFMF, RecurrenceDescriptor &RedDes, + DemandedBits *DB = nullptr, AssumptionCache *AC = nullptr, + DominatorTree *DT = nullptr, ScalarEvolution *SE = nullptr); /// Returns true if Phi is a reduction in TheLoop. The RecurrenceDescriptor /// is returned in RedDes. If either \p DB is non-null or \p AC and \p DT are /// non-null, the minimal bit width needed to compute the reduction will be - /// computed. - static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, - RecurrenceDescriptor &RedDes, - DemandedBits *DB = nullptr, - AssumptionCache *AC = nullptr, - DominatorTree *DT = nullptr); + /// computed. If \p SE is non-null, store instructions to loop invariant + /// address are processed. + static bool + isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, + DemandedBits *DB = nullptr, AssumptionCache *AC = nullptr, + DominatorTree *DT = nullptr, ScalarEvolution *SE = nullptr); /// Returns true if Phi is a first-order recurrence. A first-order recurrence /// is a non-reduction recurrence relation in which the value of the @@ -275,6 +275,9 @@ cast(I)->getIntrinsicID() == Intrinsic::fmuladd; } + /// Intermediate store of the reduction + StoreInst *IntermediateStore = nullptr; + private: // The starting value of the recurrence. // It does not have to be zero! diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -575,6 +575,11 @@ return HasDependenceInvolvingLoopInvariantAddress; } + /// Return the list of stores to invariant addresses. + const ArrayRef getStoresToInvariantAddresses() const { + return StoresToInvariantAddresses; + } + /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts /// them to a more usable form. All SCEV expressions during the analysis /// should be re-written (and therefore simplified) according to PSE. @@ -629,6 +634,9 @@ /// Indicator that there are non vectorizable stores to a uniform address. bool HasDependenceInvolvingLoopInvariantAddress = false; + /// List of stores to invariant addresses. + SmallVector StoresToInvariantAddresses; + /// The diagnostics report generated for the analysis. E.g. why we /// couldn't analyze the loop. std::unique_ptr Report; diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -294,6 +294,13 @@ /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } + /// Returns True if given invariant store uses recurrent expression + bool isRecurringInvariantStore(StoreInst *SI); + + /// Returns True if given address is invariant and is used to store recurrent + /// expression + bool isRecurringInvariantAddress(Value *V); + /// Returns True if V is a Phi node of an induction variable in this loop. bool isInductionPhi(const Value *V) const; diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -237,12 +237,10 @@ return true; } -bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, - Loop *TheLoop, FastMathFlags FuncFMF, - RecurrenceDescriptor &RedDes, - DemandedBits *DB, - AssumptionCache *AC, - DominatorTree *DT) { +bool RecurrenceDescriptor::AddReductionVar( + PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF, + RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC, + DominatorTree *DT, ScalarEvolution *SE) { if (Phi->getNumIncomingValues() != 2) return false; @@ -259,6 +257,12 @@ // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). Instruction *ExitInstruction = nullptr; + + // Variable to keep last visited store instruction. By the end of the + // algorithm this variable will be either empty or having intermediate + // reduction value stored in invariant address. + StoreInst *IntermediateStore = nullptr; + // Indicates that we found a reduction operation in our scan. bool FoundReduxOp = false; @@ -324,6 +328,10 @@ // - By instructions outside of the loop (safe). // * One value may have several outside users, but all outside // uses must be of the same value. + // - By store instructions with a loop invariant address (safe with + // the following restrictions): + // * If there are several stores, all must have the same address. + // * Final value should be stored in that loop invariant address. // - By an instruction that is not part of the reduction (not safe). // This is either: // * An instruction type other than PHI or the reduction operation. @@ -331,6 +339,43 @@ while (!Worklist.empty()) { Instruction *Cur = Worklist.pop_back_val(); + // Store instructions are allowed iff it is the store of the reduction + // value to the same loop invariant memory location. + if (auto *SI = dyn_cast(Cur)) { + if (!SE) { + LLVM_DEBUG(dbgs() << "Store instructions are not processed without " + << "Scalar Evolution Analysis\n"); + return false; + } + + const SCEV *PtrScev = SE->getSCEV(SI->getPointerOperand()); + // Check it is the same address as previous stores + if (IntermediateStore) { + const SCEV *OtherScev = + SE->getSCEV(IntermediateStore->getPointerOperand()); + + if (OtherScev != PtrScev) { + LLVM_DEBUG(dbgs() << "Storing reduction value to different addresses " + << "inside the loop: " << *SI->getPointerOperand() + << " and " + << *IntermediateStore->getPointerOperand() << '\n'); + return false; + } + } + + // Check the pointer is loop invariant + if (!SE->isLoopInvariant(PtrScev, TheLoop)) { + LLVM_DEBUG(dbgs() << "Storing reduction value to non-uniform address " + << "inside the loop: " << *SI->getPointerOperand() + << '\n'); + return false; + } + + // IntermediateStore is always the last store in the loop. + IntermediateStore = SI; + continue; + } + // No Users. // If the instruction has no users then this is a broken chain and can't be // a reduction variable. @@ -453,10 +498,17 @@ // reductions which are represented as a cmp followed by a select. InstDesc IgnoredVal(false, nullptr); if (VisitedInsts.insert(UI).second) { - if (isa(UI)) + if (isa(UI)) { PHIs.push_back(UI); - else + } else { + StoreInst *SI = dyn_cast(UI); + if (SI && SI->getPointerOperand() == Cur) { + // Reduction variable chain can only be stored somewhere but it + // can't be used as an address. + return false; + } NonPHIs.push_back(UI); + } } else if (!isa(UI) && ((!isa(UI) && !isa(UI) && !isa(UI)) || @@ -484,6 +536,21 @@ if (isSelectCmpRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1) return false; + // If there is an intermediate store, it must store the last reduction value. + if (ExitInstruction && IntermediateStore) { + if (IntermediateStore->getValueOperand() != ExitInstruction) { + LLVM_DEBUG( + dbgs() << "LU: Last store Instruction of reduction value " + << "does not store last calculated value of the reduction: " + << *IntermediateStore << '\n'); + return false; + } + } + // If all uses are inside the loop (intermediate stores), then the + // reduction value after the loop will be the one used in the last store. + if (!ExitInstruction && IntermediateStore) + ExitInstruction = cast(IntermediateStore->getValueOperand()); + if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; @@ -545,9 +612,9 @@ // is saved as part of the RecurrenceDescriptor. // Save the description of this reduction variable. - RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF, ExactFPMathInst, - RecurrenceType, IsSigned, IsOrdered, CastInsts, - MinWidthCastToRecurrenceType); + RecurrenceDescriptor RD(RdxStart, ExitInstruction, IntermediateStore, Kind, + FMF, ExactFPMathInst, RecurrenceType, IsSigned, + IsOrdered, CastInsts, MinWidthCastToRecurrenceType); RedDes = RD; return true; @@ -771,7 +838,8 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC, - DominatorTree *DT) { + DominatorTree *DT, + ScalarEvolution *SE) { BasicBlock *Header = TheLoop->getHeader(); Function &F = *Header->getParent(); FastMathFlags FMF; @@ -780,72 +848,85 @@ FMF.setNoSignedZeros( F.getFnAttribute("no-signed-zeros-fp-math").getValueAsBool()); - if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a SMAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a SMIN reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a UMAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a UMIN reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, RedDes, DB, AC, - DT)) { + DT, SE)) { LLVM_DEBUG(dbgs() << "Found an integer conditional select reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a float MAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a float MIN reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, RedDes, DB, AC, - DT)) { + DT, SE)) { LLVM_DEBUG(dbgs() << "Found a float conditional select reduction PHI." << " PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, - DT)) { + if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n"); return true; } diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1984,9 +1984,12 @@ for (StoreInst *ST : Stores) { Value *Ptr = ST->getPointerOperand(); - if (isUniform(Ptr)) + if (isUniform(Ptr)) { + // Record store instructions to loop invariant addresses + StoresToInvariantAddresses.push_back(ST); HasDependenceInvolvingLoopInvariantAddress |= !UniformStores.insert(Ptr).second; + } // If we did *not* see this pointer before, insert it to the read-write // list. At this phase it is only a 'write' list. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -439,6 +439,26 @@ return false; } +/// Returns true if A and B have same pointer operands or same SCEVs addresses +static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A, + StoreInst *B) { + // Compare store + if (A == B) + return true; + + // Otherwise Compare pointers + Value *APtr = A->getPointerOperand(); + Value *BPtr = B->getPointerOperand(); + if (APtr == BPtr) + return true; + + // Otherwise compare address SCEVs + if (SE->getSCEV(APtr) == SE->getSCEV(BPtr)) + return true; + + return false; +} + int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, Value *Ptr) const { const ValueToValueMap &Strides = @@ -676,7 +696,7 @@ RecurrenceDescriptor RedDes; if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, - DT)) { + DT, PSE.getSE())) { Requirements->addExactFPMathInst(RedDes.getExactFPMathInst()); AllowedExit.insert(RedDes.getLoopExitInstr()); Reductions[Phi] = RedDes; @@ -911,11 +931,54 @@ if (!LAI->canVectorizeMemory()) return false; - if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { - reportVectorizationFailure("Stores to a uniform address", - "write to a loop invariant address could not be vectorized", - "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); - return false; + // We can vectorize stores to invariant address when final reduction value is + // guaranteed to be stored at the end of the loop. Also, if decision to + // vectorize loop is made, runtime checks are added so as to make sure that + // invariant address won't alias with any other objects. + if (!LAI->getStoresToInvariantAddresses().empty()) { + // For each invariant address, check its last stored value is unconditional. + for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) { + if (isRecurringInvariantStore(SI) && + blockNeedsPredication(SI->getParent())) { + reportVectorizationFailure( + "We don't allow storing to uniform addresses", + "write of conditional recurring variant value to a loop " + "invariant address could not be vectorized", + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); + return false; + } + } + + if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { + // For each invariant address, check its last stored value is the result + // of one of our reductions. + // + // We do not check if dependence with loads exists because they are + // currently rejected earlier in LoopAccessInfo::analyzeLoop. In case this + // behaviour changes we have to modify this code. + ScalarEvolution *SE = PSE.getSE(); + SmallVector UnhandledStores; + for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) { + if (isRecurringInvariantStore(SI)) { + // Earlier stores to this address are effectively deadcode. + erase_if(UnhandledStores, [SE, SI](StoreInst *I) { + return storeToSameAddress(SE, SI, I); + }); + } else if (!isUniform(SI->getValueOperand())) + UnhandledStores.push_back(SI); + } + + bool IsOK = UnhandledStores.empty(); + // TODO: we should also validate against InvariantMemSets. + if (!IsOK) { + reportVectorizationFailure( + "We don't allow storing to uniform addresses", + "write to a loop invariant address could not " + "be vectorized", + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); + return false; + } + } } Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); @@ -942,13 +1005,38 @@ // We can now only vectorize if all reductions with Exact FP math also // have the isOrdered flag set, which indicates that we can move the - // reduction operations in-loop. + // reduction operations in-loop, and do not have intermediate store. return (all_of(getReductionVars(), [&](auto &Reduction) -> bool { const RecurrenceDescriptor &RdxDesc = Reduction.second; - return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered(); + return !RdxDesc.hasExactFPMath() || + (RdxDesc.isOrdered() && !RdxDesc.IntermediateStore); })); } +bool LoopVectorizationLegality::isRecurringInvariantStore(StoreInst *SI) { + for (auto &Reduction : Reductions) { + RecurrenceDescriptor DS = Reduction.second; + StoreInst *DSI = DS.IntermediateStore; + if (DSI && DSI == SI) + return true; + } + return false; +} + +bool LoopVectorizationLegality::isRecurringInvariantAddress(Value *V) { + ScalarEvolution *SE = PSE.getSE(); + for (auto &Reduction : Reductions) { + RecurrenceDescriptor DS = Reduction.second; + if (!DS.IntermediateStore) + continue; + Value *InvariantAddress = DS.IntermediateStore->getPointerOperand(); + if (V == InvariantAddress || + SE->getSCEV(V) == SE->getSCEV(InvariantAddress)) + return true; + } + return false; +} + bool LoopVectorizationLegality::isInductionPhi(const Value *V) const { Value *In0 = const_cast(V); PHINode *PN = dyn_cast_or_null(In0); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4328,6 +4328,17 @@ // Set the resume value for this reduction ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); + // If there were stores of the reduction value to a uniform memory address + // inside the loop, create the final store here. + if (StoreInst *SI = RdxDesc.IntermediateStore) { + StoreInst *NewSI = + Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); + propagateMetadata(NewSI, SI); + + // If the reduction value is used in other places, + // then let the code below create PHI's for that. + } + // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. @@ -7725,6 +7736,16 @@ // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); + // Find all stores to invariant variables. Since they are going to sink + // outside the loop we do not need calculate cost for them. + for (BasicBlock *BB : TheLoop->blocks()) + for (Instruction &I : *BB) { + StoreInst *SI; + if ((SI = dyn_cast(&I)) && + Legal->isRecurringInvariantAddress(SI->getPointerOperand())) + ValuesToIgnore.insert(&I); + } + // Ignore type-promoting instructions we identified during reduction // detection. for (auto &Reduction : Legal->getReductionVars()) { @@ -9151,6 +9172,13 @@ continue; } + // Invariant stores inside loop will be deleted and a single store + // with the final reduction value will be added to the exit block + StoreInst *SI; + if ((SI = dyn_cast(&I)) && + Legal->isRecurringInvariantAddress(SI->getPointerOperand())) + continue; + // Otherwise, if all widening options failed, Instruction is to be // replicated. This may create a successor for VPBB. VPBasicBlock *NextVPBB = diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1340,6 +1340,9 @@ getCondOp()->printAsOperand(O, SlotTracker); } O << ")"; + if (RdxDesc->IntermediateStore) + O << " (with final reduction value stored in invariant address sank " + "outside of loop)"; } void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll @@ -320,10 +320,20 @@ ; ADD (with reduction stored in invariant address) -; CHECK-REMARK: loop not vectorized: value that could not be identified as reduction is used outside the loop +; CHECK-REMARK: vectorized loop (vectorization width: vscale x 4, interleaved count: 2) define void @invariant_store(i32* %dst, i32* readonly %src) { ; CHECK-LABEL: @invariant_store -; CHECK-NOT: vector.body +; CHECK: vector.body: +; CHECK: %[[LOAD1:.*]] = load +; CHECK: %[[LOAD2:.*]] = load +; CHECK: %[[ADD1:.*]] = add %{{.*}}, %[[LOAD1]] +; CHECK: %[[ADD2:.*]] = add %{{.*}}, %[[LOAD2]] +; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %[[ADD1]] +; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %[[ADD2]] +; CHECK: middle.block: +; CHECK: %[[ADD:.*]] = add %[[ADD2]], %[[ADD1]] +; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( %[[ADD]]) +; CHECK-NEXT: store i32 %[[SUM]], i32* %gep.dst, align 4 entry: %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 store i32 0, i32* %gep.dst, align 4 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll @@ -11,7 +11,14 @@ ; dst[42] = sum; ; } ; CHECK-LABEL: @reduc_store -; CHECK-NOT: vector.body +; CHECK: vector.body: +; CHECK: phi <4 x i32> +; CHECK: load <4 x i32> +; CHECK: add <4 x i32> +; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %gep.dst +; CHECK: middle.block: +; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.vector.reduce.add.v4i32 +; CHECK-NEXT: store i32 [[TMP]], i32* %gep.dst define void @reduc_store(i32* %dst, i32* readonly %src) { entry: %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 @@ -41,7 +48,14 @@ ; dst[42] = sum; ; } ; CHECK-LABEL: @reduc_store_fadd_fast -; CHECK-NOT: vector.body +; CHECK: vector.body: +; CHECK: phi <4 x float> +; CHECK: load <4 x float> +; CHECK: fadd fast <4 x float> +; CHECK-NOT: store float %{{[0-9]+}}, float* %gep.dst +; CHECK: middle.block: +; CHECK-NEXT: [[TMP:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32 +; CHECK-NEXT: store float %{{[0-9]+}}, float* %gep.dst define void @reduc_store_fadd_fast(float* %dst, float* readonly %src) { entry: %gep.dst = getelementptr inbounds float, float* %dst, i64 42 @@ -152,7 +166,12 @@ ; dst[42] = sum; ; } ; CHECK-LABEL: @reduc_store_inside_unrolled -; CHECK-NOT: vector.body +; CHECK: vector.body: +; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %gep.dst +; CHECK: middle.block: +; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.vector.reduce.add.v4i32 +; CHECK-NEXT: store i32 [[TMP]], i32* %gep.dst +; CHECK: ret void define void @reduc_store_inside_unrolled(i32* %dst, i32* readonly %src) { entry: %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 @@ -224,7 +243,12 @@ ; dst[42] = sum; ; } ; CHECK-LABEL: @reduc_store_middle_store_predicated -; CHECK-NOT: vector.body +; CHECK: vector.body: +; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %gep.dst +; CHECK: middle.block: +; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.vector.reduce.add.v4i32 +; CHECK-NEXT: store i32 [[TMP]], i32* %gep.dst +; CHECK: ret void define void @reduc_store_middle_store_predicated(i32* %dst, i32* readonly %src) { entry: %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 @@ -308,7 +332,16 @@ ; } ; dst[43] = sum; ; CHECK-LABEL: @reduc_store_inoutside -; CHECK-NOT: vector.body +; CHECK: vector.body: +; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %gep.src +; CHECK: middle.block: +; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.vector.reduce.add.v4i32 +; CHECK-NEXT: store i32 [[TMP]], i32* %gep.dst +; CHECK: exit: +; CHECK: %[[PHI:[a-zA-Z.0-9]+]] = phi i32 {{.*}} +; CHECK: %[[ADDR:[a-zA-Z.0-9]+]] = getelementptr inbounds i32, i32* %dst, i64 43 +; CHECK: store i32 %[[PHI]], i32* %[[ADDR]] +; CHECK: ret void define void @reduc_store_inoutside(i32* %dst, i32* readonly %src) { entry: %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 @@ -331,3 +364,6 @@ store i32 %sum.lcssa, i32* %gep.dst.1, align 4 ret void } + +; Make sure any check-not directives are not triggered by function declarations. +; CHECK: declare diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -127,6 +127,44 @@ ret float %red.next } +define void @print_reduction_with_invariant_store(i64 %n, float* noalias %y, float* noalias %dst) { +; CHECK-LABEL: Checking a loop in "print_reduction_with_invariant_store" +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: for.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0 +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%y>, ir<%iv> +; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx> +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + fast reduce.fadd (ir<%lv>) (with final reduction value stored in invariant address sank outside of loop) +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF +(nuw) vp<[[CAN_IV]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]> vp<[[VEC_TC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] + %red = phi float [ %red.next, %for.body ], [ 0.0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %iv + %lv = load float, float* %arrayidx, align 4 + %red.next = fadd fast float %lv, %red + store float %red.next, float* %dst, align 4 + %iv.next = add i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + define void @print_replicate_predicated_phi(i64 %n, i64* %x) { ; CHECK-LABEL: Checking a loop in "print_replicate_predicated_phi" ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {