diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -33,6 +33,7 @@ class PredicatedScalarEvolution; class ScalarEvolution; class SCEV; +class StoreInst; class DominatorTree; /// These are the kinds of recurrences that we support. @@ -74,13 +75,13 @@ public: RecurrenceDescriptor() = default; - RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurKind K, - FastMathFlags FMF, Instruction *ExactFP, Type *RT, - bool Signed, bool Ordered, + RecurrenceDescriptor(Value *Start, Instruction *Exit, StoreInst *Store, + RecurKind K, FastMathFlags FMF, Instruction *ExactFP, + Type *RT, bool Signed, bool Ordered, SmallPtrSetImpl &CI) - : StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF), - ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed), - IsOrdered(Ordered) { + : IntermediateStore(Store), StartValue(Start), LoopExitInstr(Exit), + Kind(K), FMF(FMF), ExactFPMathInst(ExactFP), RecurrenceType(RT), + IsSigned(Signed), IsOrdered(Ordered) { CastInsts.insert(CI.begin(), CI.end()); } @@ -167,7 +168,7 @@ /// non-null, the minimal bit width needed to compute the reduction will be /// computed. static bool AddReductionVar(PHINode *Phi, RecurKind Kind, Loop *TheLoop, - FastMathFlags FuncFMF, + FastMathFlags FuncFMF, ScalarEvolution *SE, RecurrenceDescriptor &RedDes, DemandedBits *DB = nullptr, AssumptionCache *AC = nullptr, @@ -177,7 +178,7 @@ /// is returned in RedDes. If either \p DB is non-null or \p AC and \p DT are /// non-null, the minimal bit width needed to compute the reduction will be /// computed. - static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, + static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, ScalarEvolution *SE, RecurrenceDescriptor &RedDes, DemandedBits *DB = nullptr, AssumptionCache *AC = nullptr, @@ -268,6 +269,9 @@ cast(I)->getIntrinsicID() == Intrinsic::fmuladd; } + /// Intermediate store of the reduction + StoreInst *IntermediateStore = nullptr; + private: // The starting value of the recurrence. // It does not have to be zero! diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -579,6 +579,11 @@ return HasDependenceInvolvingLoopInvariantAddress; } + /// Return the list of stores to invariant addresses. + const ArrayRef getStoresToInvariantAddresses() const { + return StoresToInvariantAddresses; + } + /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts /// them to a more usable form. All SCEV expressions during the analysis /// should be re-written (and therefore simplified) according to PSE. @@ -633,6 +638,9 @@ /// Indicator that there are non vectorizable stores to a uniform address. bool HasDependenceInvolvingLoopInvariantAddress; + /// List of stores to invariant addresses. + SmallVector StoresToInvariantAddresses; + /// The diagnostics report generated for the analysis. E.g. why we /// couldn't analyze the loop. std::unique_ptr Report; diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -307,6 +307,13 @@ /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } + /// Returns True if given invariant store uses recurrent expression + bool isRecurringInvariantStore(StoreInst *SI); + + /// Returns True if given address is invariant and is used to store recurrent + /// expression + bool isRecurringInvariantAddress(Value *V); + /// Returns True if V is a Phi node of an induction variable in this loop. bool isInductionPhi(const Value *V); diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -225,12 +225,10 @@ return true; } -bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, - Loop *TheLoop, FastMathFlags FuncFMF, - RecurrenceDescriptor &RedDes, - DemandedBits *DB, - AssumptionCache *AC, - DominatorTree *DT) { +bool RecurrenceDescriptor::AddReductionVar( + PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF, + ScalarEvolution *SE, RecurrenceDescriptor &RedDes, DemandedBits *DB, + AssumptionCache *AC, DominatorTree *DT) { if (Phi->getNumIncomingValues() != 2) return false; @@ -247,6 +245,12 @@ // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). Instruction *ExitInstruction = nullptr; + + // Variable to keep last visited store instruction. By the end of the + // algorithm this variable will be either empty or having intermediate + // reduction value stored in invariant address. + StoreInst *IntermediateStore = nullptr; + // Indicates that we found a reduction operation in our scan. bool FoundReduxOp = false; @@ -307,6 +311,10 @@ // - By instructions outside of the loop (safe). // * One value may have several outside users, but all outside // uses must be of the same value. + // - By store instructions with a loop invariant address (safe with + // the following restrictions): + // * If there are several stores, all must have the same address. + // * Final value should be stored in that loop invariant address. // - By an instruction that is not part of the reduction (not safe). // This is either: // * An instruction type other than PHI or the reduction operation. @@ -314,6 +322,37 @@ while (!Worklist.empty()) { Instruction *Cur = Worklist.pop_back_val(); + // Store instructions are allowed iff it is the store of the reduction + // value to the same loop invariant memory location. + if (auto *SI = dyn_cast(Cur)) { + const SCEV *PtrScev = SE->getSCEV(SI->getPointerOperand()); + // Check it is the same address as previous stores + if (IntermediateStore) { + const SCEV *OtherScev = + SE->getSCEV(IntermediateStore->getPointerOperand()); + + if (OtherScev != PtrScev) { + LLVM_DEBUG(dbgs() << "Storing reduction value to different addresses " + << "inside the loop: " << *SI->getPointerOperand() + << " and " + << *IntermediateStore->getPointerOperand() << '\n'); + return false; + } + } + + // Check the pointer is loop invariant + if (!SE->isLoopInvariant(PtrScev, TheLoop)) { + LLVM_DEBUG(dbgs() << "Storing reduction value to non-uniform address " + << "inside the loop: " << *SI->getPointerOperand() + << '\n'); + return false; + } + + // IntermediateStore is always the last store in the loop. + IntermediateStore = SI; + continue; + } + // No Users. // If the instruction has no users then this is a broken chain and can't be // a reduction variable. @@ -433,10 +472,17 @@ // reductions which are represented as a cmp followed by a select. InstDesc IgnoredVal(false, nullptr); if (VisitedInsts.insert(UI).second) { - if (isa(UI)) + if (isa(UI)) { PHIs.push_back(UI); - else + } else { + StoreInst *SI = dyn_cast(UI); + if (SI && SI->getPointerOperand() == Cur) { + // Reduction variable chain can only be stored somewhere but it + // can't be used as an address. + return false; + } NonPHIs.push_back(UI); + } } else if (!isa(UI) && ((!isa(UI) && !isa(UI) && !isa(UI)) || @@ -464,6 +510,21 @@ if (isSelectCmpRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1) return false; + // If there is an intermediate store, it must store the last reduction value. + if (ExitInstruction && IntermediateStore) { + if (IntermediateStore->getValueOperand() != ExitInstruction) { + LLVM_DEBUG( + dbgs() << "LU: Last store Instruction of reduction value " + << "does not store last calculated value of the reduction: " + << *IntermediateStore << '\n'); + return false; + } + } + // If all uses are inside the loop (intermediate stores), then the + // reduction value after the loop will be the one used in the last store. + if (!ExitInstruction && IntermediateStore) + ExitInstruction = cast(IntermediateStore->getValueOperand()); + if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; @@ -522,8 +583,8 @@ // is saved as part of the RecurrenceDescriptor. // Save the description of this reduction variable. - RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF, - ReduxDesc.getExactFPMathInst(), RecurrenceType, + RecurrenceDescriptor RD(RdxStart, ExitInstruction, IntermediateStore, Kind, + FMF, ReduxDesc.getExactFPMathInst(), RecurrenceType, IsSigned, IsOrdered, CastInsts); RedDes = RD; @@ -746,6 +807,7 @@ } bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, + ScalarEvolution *SE, RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC, DominatorTree *DT) { @@ -757,71 +819,84 @@ FMF.setNoSignedZeros( F.getFnAttribute("no-signed-zeros-fp-math").getValueAsBool()); - if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, SE, RedDes, DB, AC, + DT)) { LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, SE, RedDes, DB, AC, + DT)) { LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, SE, RedDes, DB, AC, + DT)) { LLVM_DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, SE, RedDes, DB, AC, + DT)) { LLVM_DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, SE, RedDes, DB, AC, + DT)) { LLVM_DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, SE, RedDes, DB, AC, + DT)) { LLVM_DEBUG(dbgs() << "Found a SMAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, SE, RedDes, DB, AC, + DT)) { LLVM_DEBUG(dbgs() << "Found a SMIN reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, SE, RedDes, DB, AC, + DT)) { LLVM_DEBUG(dbgs() << "Found a UMAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, SE, RedDes, DB, AC, + DT)) { LLVM_DEBUG(dbgs() << "Found a UMIN reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, RedDes, DB, AC, - DT)) { + if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, SE, RedDes, DB, + AC, DT)) { LLVM_DEBUG(dbgs() << "Found an integer conditional select reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, SE, RedDes, DB, AC, + DT)) { LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, SE, RedDes, DB, AC, + DT)) { LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, SE, RedDes, DB, AC, + DT)) { LLVM_DEBUG(dbgs() << "Found a float MAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, SE, RedDes, DB, AC, + DT)) { LLVM_DEBUG(dbgs() << "Found a float MIN reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, RedDes, DB, AC, - DT)) { + if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, SE, RedDes, DB, + AC, DT)) { LLVM_DEBUG(dbgs() << "Found a float conditional select reduction PHI." << " PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, + if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, SE, RedDes, DB, AC, DT)) { LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n"); return true; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1987,9 +1987,12 @@ for (StoreInst *ST : Stores) { Value *Ptr = ST->getPointerOperand(); - if (isUniform(Ptr)) + if (isUniform(Ptr)) { + // Record store instructions to loop invariant addresses + StoresToInvariantAddresses.push_back(ST); HasDependenceInvolvingLoopInvariantAddress |= !UniformStores.insert(Ptr).second; + } // If we did *not* see this pointer before, insert it to the read-write // list. At this phase it is only a 'write' list. diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -731,7 +731,7 @@ } // Check V's users to see if it is involved in a reduction in L. -static PHINode *findInnerReductionPhi(Loop *L, Value *V) { +static PHINode *findInnerReductionPhi(Loop *L, Value *V, ScalarEvolution *SE) { // Reduction variables cannot be constants. if (isa(V)) return nullptr; @@ -741,7 +741,9 @@ if (PHI->getNumIncomingValues() == 1) continue; RecurrenceDescriptor RD; - if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) + // Recurrence should not have intermediate store. + if (RecurrenceDescriptor::isReductionPHI(PHI, L, SE, RD) && + !RD.IntermediateStore) return PHI; return nullptr; } @@ -774,7 +776,7 @@ // Check if we have a PHI node in the outer loop that has a reduction // result from the inner loop as an incoming value. Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch())); - PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V); + PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V, SE); if (!InnerRedPhi || !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) { LLVM_DEBUG( diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -419,6 +419,26 @@ return false; } +/// Returns true if A and B have same pointer operands or same SCEVs addresses +static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A, + StoreInst *B) { + // Compare store + if (A == B) + return true; + + // Otherwise Compare pointers + Value *APtr = A->getPointerOperand(); + Value *BPtr = B->getPointerOperand(); + if (APtr == BPtr) + return true; + + // Otherwise compare address SCEVs + if (SE->getSCEV(APtr) == SE->getSCEV(BPtr)) + return true; + + return false; +} + int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, Value *Ptr) const { const ValueToValueMap &Strides = @@ -655,8 +675,8 @@ } RecurrenceDescriptor RedDes; - if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, - DT)) { + if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, PSE.getSE(), + RedDes, DB, AC, DT)) { Requirements->addExactFPMathInst(RedDes.getExactFPMathInst()); AllowedExit.insert(RedDes.getLoopExitInstr()); Reductions[Phi] = RedDes; @@ -891,11 +911,50 @@ if (!LAI->canVectorizeMemory()) return false; + if (!LAI->getStoresToInvariantAddresses().empty()) { + // For each invariant address, check its last stored value is unconditional. + for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) { + if (isRecurringInvariantStore(SI) && + blockNeedsPredication(SI->getParent())) { + reportVectorizationFailure( + "We don't allow storing to uniform addresses", + "write of conditional recurring variant value to a loop " + "invariant address could not be vectorized", + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); + return false; + } + } + } + if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { - reportVectorizationFailure("Stores to a uniform address", - "write to a loop invariant address could not be vectorized", - "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); - return false; + // For each invariant address, check its last stored value is the result + // of one of our reductions. + // + // We do not check if dependence with loads exists because they are + // currently rejected earlier in LoopAccessInfo::analyzeLoop. In case this + // behaviour changes we have to modify this code. + ScalarEvolution *SE = PSE.getSE(); + SmallVector UnhandledStores; + for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) { + if (isRecurringInvariantStore(SI)) { + // Earlier stores to this address are effectively deadcode. + erase_if(UnhandledStores, [SE, SI](StoreInst *I) { + return storeToSameAddress(SE, SI, I); + }); + } else if (!isUniform(SI->getValueOperand())) + UnhandledStores.push_back(SI); + } + + bool IsOK = UnhandledStores.empty(); + // TODO: we should also validate against InvariantMemSets. + if (!IsOK) { + reportVectorizationFailure("We don't allow storing to uniform addresses", + "write to a loop invariant address could not " + "be vectorized", + "CantVectorizeStoreToLoopInvariantAddress", + ORE, TheLoop); + return false; + } } Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); @@ -922,13 +981,38 @@ // We can now only vectorize if all reductions with Exact FP math also // have the isOrdered flag set, which indicates that we can move the - // reduction operations in-loop. + // reduction operations in-loop, and do not have intermediate store. return (all_of(getReductionVars(), [&](auto &Reduction) -> bool { const RecurrenceDescriptor &RdxDesc = Reduction.second; - return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered(); + return !RdxDesc.hasExactFPMath() || + (RdxDesc.isOrdered() && !RdxDesc.IntermediateStore); })); } +bool LoopVectorizationLegality::isRecurringInvariantStore(StoreInst *SI) { + for (auto &Reduction : Reductions) { + RecurrenceDescriptor DS = Reduction.second; + StoreInst *DSI = DS.IntermediateStore; + if (DSI && DSI == SI) + return true; + } + return false; +} + +bool LoopVectorizationLegality::isRecurringInvariantAddress(Value *V) { + ScalarEvolution *SE = PSE.getSE(); + for (auto &Reduction : Reductions) { + RecurrenceDescriptor DS = Reduction.second; + if (!DS.IntermediateStore) + continue; + Value *InvariantAddress = DS.IntermediateStore->getPointerOperand(); + if (V == InvariantAddress || + SE->getSCEV(V) == SE->getSCEV(InvariantAddress)) + return true; + } + return false; +} + bool LoopVectorizationLegality::isInductionPhi(const Value *V) { Value *In0 = const_cast(V); PHINode *PN = dyn_cast_or_null(In0); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4539,6 +4539,17 @@ BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); + // If there were stores of the reduction value to a uniform memory address + // inside the loop, create the final store here. + if (StoreInst *SI = RdxDesc.IntermediateStore) { + StoreInst *NewSI = + Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); + propagateMetadata(NewSI, SI); + + // If the reduction value is used in other places, + // then let the code below create PHI's for that. + } + // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. @@ -7919,6 +7930,16 @@ // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); + // Find all stores to invariant variables. Since they are going to sink + // outside the loop we do not need calculate cost for them. + for (BasicBlock *BB : TheLoop->blocks()) + for (Instruction &I : *BB) { + StoreInst *SI; + if ((SI = dyn_cast(&I)) && + Legal->isRecurringInvariantAddress(SI->getPointerOperand())) + ValuesToIgnore.insert(&I); + } + // Ignore type-promoting instructions we identified during reduction // detection. for (auto &Reduction : Legal->getReductionVars()) { @@ -9309,6 +9330,13 @@ continue; } + // Invariant stores will be either deleted or go outside of loop so there + // is no need to create a recipe for them. + StoreInst *SI; + if ((SI = dyn_cast(&I)) && + Legal->isRecurringInvariantAddress(SI->getPointerOperand())) + continue; + // Otherwise, if all widening options failed, Instruction is to be // replicated. This may create a successor for VPBB. VPBasicBlock *NextVPBB = diff --git a/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll b/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll --- a/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll +++ b/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll @@ -151,6 +151,44 @@ ret i64 %sum.inc.lcssa2 } +; Check that we do not interchange if reduction is stored in an invariant address inside inner loop +; REMARKS: --- !Missed +; REMARKS-NEXT: Pass: loop-interchange +; REMARKS-NEXT: Name: UnsupportedPHIOuter +; REMARKS-NEXT: Function: test4 + +define i64 @test4([100 x [100 x i64]]* %Arr, i64* %dst) { +entry: + %gep.dst = getelementptr inbounds i64, i64* %dst, i64 42 + br label %for1.header + +for1.header: ; preds = %for1.inc, %entry + %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc ] + %sum.outer = phi i64 [ 0, %entry ], [ %sum.inc.lcssa, %for1.inc ] + br label %for2 + +for2: ; preds = %for2, %for1.header + %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next.3, %for2 ] + %sum.inner = phi i64 [ %sum.outer, %for1.header ], [ %sum.inc, %for2 ] + %arrayidx = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv23 + %lv = load i64, i64* %arrayidx, align 4 + %sum.inc = add i64 %sum.inner, %lv + store i64 %sum.inc, i64* %gep.dst, align 4 + %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1 + %exit1 = icmp eq i64 %indvars.iv.next.3, 100 + br i1 %exit1, label %for1.inc, label %for2 + +for1.inc: ; preds = %for2 + %sum.inc.lcssa = phi i64 [ %sum.inc, %for2 ] + %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 + %exit2 = icmp eq i64 %indvars.iv.next24, 100 + br i1 %exit2, label %for1.loopexit, label %for1.header + +for1.loopexit: ; preds = %for1.inc + %sum.inc.lcssa2 = phi i64 [ %sum.inc.lcssa, %for1.inc ] + ret i64 %sum.inc.lcssa2 +} + ; Check that we do not interchange or crash if the PHI in the outer loop gets a ; constant from the inner loop. ; REMARKS: --- !Missed diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll @@ -318,6 +318,41 @@ ret float %.sroa.speculated } +; ADD (with reduction stored in invariant address) + +; CHECK-REMARK: vectorized loop (vectorization width: vscale x 4, interleaved count: 2) +define void @invariant_store(i32* %dst, i32* readonly %src) { +; CHECK-LABEL: @invariant_store +; CHECK: vector.body: +; CHECK: %[[LOAD1:.*]] = load +; CHECK: %[[LOAD2:.*]] = load +; CHECK: %[[ADD1:.*]] = add %{{.*}}, %[[LOAD1]] +; CHECK: %[[ADD2:.*]] = add %{{.*}}, %[[LOAD2]] +; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %[[ADD1]] +; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %[[ADD2]] +; CHECK: middle.block: +; CHECK: %[[ADD:.*]] = add %[[ADD2]], %[[ADD1]] +; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( %[[ADD]]) +; CHECK-NEXT: store i32 %[[SUM]], i32* %gep.dst, align 4 +entry: + %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 + store i32 0, i32* %gep.dst, align 4 + br label %for.body +for.body: + %sum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %gep.src = getelementptr inbounds i32, i32* %src, i64 %indvars.iv + %0 = load i32, i32* %gep.src, align 4 + %add = add nsw i32 %sum, %0 + store i32 %add, i32* %gep.dst, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + ; Reduction cannot be vectorized ; MUL diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll @@ -1309,6 +1309,37 @@ declare float @llvm.fmuladd.f32(float, float, float) +; Test case with invariant store where fadd is strict. +define void @reduction_store_to_invariant_address(float* %dst, float* readonly %src) { +; CHECK-ORDERED-LABEL: @reduction_store_to_invariant_address( +; CHECK-ORDERED-NOT: vector.body + +; CHECK-UNORDERED-LABEL: @reduction_store_to_invariant_address( +; CHECK-UNORDERED-NOT: vector.body + +; CHECK-NOT-VECTORIZED-LABEL: @reduction_store_to_invariant_address( +; CHECK-NOT-VECTORIZED-NOT: vector.body + +entry: + %arrayidx = getelementptr inbounds float, float* %dst, i64 42 + store float 0.000000e+00, float* %arrayidx, align 4 + br label %for.body + +for.body: + %0 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx1 = getelementptr inbounds float, float* %src, i64 %indvars.iv + %1 = load float, float* %arrayidx1, align 4 + %add = fadd float %0, %1 + store float %add, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + !0 = distinct !{!0, !5, !9, !11} !1 = distinct !{!1, !5, !10, !11} !2 = distinct !{!2, !6, !9, !11} diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll --- a/llvm/test/Transforms/LoopVectorize/reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction.ll @@ -462,6 +462,362 @@ ret i32 %inc.2 } +; This test checks that we can vectorize loop with reduction variable +; stored in an invariant address. +; +; int sum = 0; +; for(i=0..N) { +; sum += src[i]; +; dst[42] = sum; +; } +; CHECK-LABEL: @reduc_store + +; CHECK: vector.body: +; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %gep.dst +; CHECK: middle.block: +; CHECK: store i32 %{{[0-9]+}}, i32* %gep.dst +define void @reduc_store(i32* %dst, i32* readonly %src) { +entry: + %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 + store i32 0, i32* %gep.dst, align 4 + br label %for.body +for.body: + %sum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %gep.src = getelementptr inbounds i32, i32* %src, i64 %indvars.iv + %0 = load i32, i32* %gep.src, align 4 + %add = add nsw i32 %sum, %0 + store i32 %add, i32* %gep.dst, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +; Same as above but with floating point numbers instead. +; +; float sum = 0; +; for(i=0..N) { +; sum += src[i]; +; dst[42] = sum; +; } +; CHECK-LABEL: @reduc_store_fadd_fast +; CHECK: vector.body: +; CHECK-NOT: store float %{{[0-9]+}}, float* %gep.dst +; CHECK: middle.block: +; CHECK: store float %{{[0-9]+}}, float* %gep.dst +define void @reduc_store_fadd_fast(float* %dst, float* readonly %src) { +entry: + %gep.dst = getelementptr inbounds float, float* %dst, i64 42 + store float 0.000000e+00, float* %gep.dst, align 4 + br label %for.body + +for.body: + %sum = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %gep.src = getelementptr inbounds float, float* %src, i64 %indvars.iv + %0 = load float, float* %gep.src, align 4 + %add = fadd fast float %sum, %0 + store float %add, float* %gep.dst, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +; Check that if we have a read from an invariant address, we do not vectorize. +; +; int sum = 0; +; for(i=0..N) { +; sum += src[i]; +; dst.2[i] = dst[42]; +; dst[42] = sum; +; } +; CHECK-LABEL: @reduc_store_load +; CHECK-NOT: vector.body: +define void @reduc_store_load(i32* %dst, i32* readonly %src, i32* noalias %dst.2) { +entry: + %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 + store i32 0, i32* %gep.dst, align 4 + br label %for.body +for.body: + %sum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %gep.src = getelementptr inbounds i32, i32* %src, i64 %indvars.iv + %0 = load i32, i32* %gep.src, align 4 + %add = add nsw i32 %sum, %0 + %lv = load i32, i32* %gep.dst + %gep.dst.2 = getelementptr inbounds i32, i32* %dst.2, i64 %indvars.iv + store i32 %lv, i32* %gep.dst.2, align 4 + store i32 %add, i32* %gep.dst, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +; Final value is not guaranteed to be stored in an invariant address. +; We don't vectorize in that case. +; +; int sum = 0; +; for(i=0..N) { +; int diff = y[i] - x[i]; +; if (diff > 0) { +; sum = += diff; +; *t = sum; +; } +; } +; CHECK-LABEL: @reduc_cond_store +; CHECK-NOT: vector.body +define void @reduc_cond_store(i32* %t, i32* readonly %x, i32* readonly %y, i32 %n) { +entry: + store i32 0, i32* %t, align 4 + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %if.end, %for.body.preheader + %sum = phi i32 [ 0, %for.body.preheader ], [ %sum.2, %if.end ] + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %if.end ] + %gep.y = getelementptr inbounds i32, i32* %y, i64 %indvars.iv + %0 = load i32, i32* %gep.y, align 4 + %gep.x = getelementptr inbounds i32, i32* %x, i64 %indvars.iv + %1 = load i32, i32* %gep.x, align 4 + %diff = sub nsw i32 %0, %1 + %cmp2 = icmp sgt i32 %diff, 0 + br i1 %cmp2, label %if.then, label %if.end + +if.then: ; preds = %for.body + %sum.1 = add nsw i32 %diff, %sum + store i32 %sum.1, i32* %t, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + %sum.2 = phi i32 [ %sum.1, %if.then ], [ %0, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %if.end, %entry + ret void +} + +; Check that we can vectorize code with several stores to an invariant address +; with condition that final reduction value is stored too. +; +; int sum = 0; +; for(int i=0; i < 1000; i+=2) { +; sum += src[i]; +; dst[42] = sum; +; sum += src[i+1]; +; dst[42] = sum; +; } +; CHECK-LABEL: @reduc_store_inside_unrolled +; CHECK: vector.body: +; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %gep.dst +; CHECK: middle.block: +; CHECK: store i32 %{{[0-9]+}}, i32* %gep.dst +; CHECK: ret void +define void @reduc_store_inside_unrolled(i32* %dst, i32* readonly %src) { +entry: + %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %sum.1, %for.body ] + %gep.src = getelementptr inbounds i32, i32* %src, i64 %indvars.iv + %0 = load i32, i32* %gep.src, align 4 + %sum.1 = add nsw i32 %0, %sum + store i32 %sum.1, i32* %gep.dst, align 4 + %1 = or i64 %indvars.iv, 1 + %gep.src.1 = getelementptr inbounds i32, i32* %src, i64 %1 + %2 = load i32, i32* %gep.src.1, align 4 + %sum.2 = add nsw i32 %2, %sum.1 + store i32 %sum.2, i32* %gep.dst, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1000 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; We cannot vectorize if two (or more) invariant stores exist in a loop. +; +; int sum = 0; +; for(int i=0; i < 1000; i+=2) { +; sum += src[i]; +; dst[42] = sum; +; sum += src[i+1]; +; other_dst[42] = sum; +; } +; CHECK-LABEL: @reduc_double_invariant_store +; CHECK-NOT: vector.body: +define void @reduc_double_invariant_store(i32* %dst, i32* %other_dst, i32* readonly %src) { +entry: + %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 + %gep.other_dst = getelementptr inbounds i32, i32* %other_dst, i64 42 + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %sum.1, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %sum.1 = add nsw i32 %0, %sum + store i32 %sum.1, i32* %gep.dst, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx4 = getelementptr inbounds i32, i32* %src, i64 %1 + %2 = load i32, i32* %arrayidx4, align 4 + %sum.2 = add nsw i32 %2, %sum.1 + store i32 %sum.2, i32* %gep.other_dst, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1000 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; int sum = 0; +; for(int i=0; i < 1000; i+=2) { +; sum += src[i]; +; if (src[i+1] > 0) +; dst[42] = sum; +; sum += src[i+1]; +; dst[42] = sum; +; } +; CHECK-LABEL: @reduc_store_middle_store_predicated +; CHECK: vector.body: +; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %gep.dst +; CHECK: middle.block: +; CHECK: store i32 %{{[0-9]+}}, i32* %gep.dst +; CHECK: ret void +define void @reduc_store_middle_store_predicated(i32* %dst, i32* readonly %src) { +entry: + %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 + br label %for.body + +for.cond.cleanup: ; preds = %latch + ret void + +for.body: ; preds = %latch, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %latch ] + %sum = phi i32 [ 0, %entry ], [ %sum.2, %latch ] + %gep.src = getelementptr inbounds i32, i32* %src, i64 %indvars.iv + %0 = load i32, i32* %gep.src, align 4 + %sum.1 = add nsw i32 %0, %sum + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %predicated, label %latch + +predicated: ; preds = %for.body + store i32 %sum.1, i32* %gep.dst, align 4 + br label %latch + +latch: ; preds = %predicated, %for.body + %1 = or i64 %indvars.iv, 1 + %gep.src.1 = getelementptr inbounds i32, i32* %src, i64 %1 + %2 = load i32, i32* %gep.src.1, align 4 + %sum.2 = add nsw i32 %2, %sum.1 + store i32 %sum.2, i32* %gep.dst, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp.1 = icmp slt i64 %indvars.iv.next, 1000 + br i1 %cmp.1, label %for.body, label %for.cond.cleanup +} + +; int sum = 0; +; for(int i=0; i < 1000; i+=2) { +; sum += src[i]; +; dst[42] = sum; +; sum += src[i+1]; +; if (src[i+1] > 0) +; dst[42] = sum; +; } +; CHECK-LABEL: @reduc_store_final_store_predicated +; CHECK-NOT: vector.body: +define void @reduc_store_final_store_predicated(i32* %dst, i32* readonly %src) { +entry: + %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42 + br label %for.body + +for.cond.cleanup: ; preds = %latch + ret void + +for.body: ; preds = %latch, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %latch ] + %sum = phi i32 [ 0, %entry ], [ %sum.1, %latch ] + %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %sum.1 = add nsw i32 %0, %sum + store i32 %sum.1, i32* %gep.dst, align 4 + %1 = or i64 %indvars.iv, 1 + %gep.src.1 = getelementptr inbounds i32, i32* %src, i64 %1 + %2 = load i32, i32* %gep.src.1, align 4 + %sum.2 = add nsw i32 %2, %sum.1 + %cmp1 = icmp sgt i32 %2, 0 + br i1 %cmp1, label %predicated, label %latch + +predicated: ; preds = %for.body + store i32 %sum.2, i32* %gep.dst, align 4 + br label %latch + +latch: ; preds = %predicated, %for.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1000 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Final value used outside of loop does not prevent vectorization +; +; int sum = 0; +; for(int i=0; i < 1000; i++) { +; sum += src[i]; +; dst[42] = sum; +; } +; dst[43] = sum; +; CHECK-LABEL: @reduc_store_inoutside +; CHECK: vector.body: +; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %gep.src +; CHECK: middle.block: +; CHECK: store i32 %[[RDX:[0-9]+]], i32* %gep.src +; CHECK: for.cond.cleanup: +; CHECK: %[[PHI:[a-zA-Z.0-9]+]] = phi i32 {{.*}} %[[RDX]] +; CHECK: %[[ADDR:[a-zA-Z.0-9]+]] = getelementptr inbounds i32, i32* %dst, i64 43 +; CHECK: store i32 %[[PHI]], i32* %[[ADDR]] +; CHECK: ret void +define void @reduc_store_inoutside(i32* %dst, i32* readonly %src) { +entry: + %gep.src = getelementptr inbounds i32, i32* %dst, i64 42 + br label %for.body + +for.cond.cleanup: + %sum.lcssa = phi i32 [ %sum.1, %for.body ] + %gep.src.1 = getelementptr inbounds i32, i32* %dst, i64 43 + store i32 %sum.lcssa, i32* %gep.src.1, align 4 + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %sum.1, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %sum.1 = add nsw i32 %0, %sum + store i32 %sum.1, i32* %gep.src, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + ;CHECK-LABEL: @reduction_sum_multiuse( ;CHECK: phi <4 x i32> ;CHECK: load <4 x i32>