Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -841,8 +841,9 @@
 class InterleavedAccessInfo {
 public:
   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
-                        DominatorTree *DT)
-      : PSE(PSE), TheLoop(L), DT(DT), RequiresScalarEpilogue(false) {}
+                        DominatorTree *DT, LoopInfo *LI)
+      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr),
+        RequiresScalarEpilogue(false) {}
 
   ~InterleavedAccessInfo() {
     SmallSet<InterleaveGroup *, 4> DelSet;
@@ -883,6 +884,9 @@
   /// out-of-bounds requires a scalar epilogue iteration for correctness.
   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
 
+  /// \brief Initialize the LoopAccessInfo used for dependence checking.
+  void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
@@ -891,6 +895,8 @@
   PredicatedScalarEvolution &PSE;
   Loop *TheLoop;
   DominatorTree *DT;
+  LoopInfo *LI;
+  const LoopAccessInfo *LAI;
 
   /// True if the loop may contain non-reversed interleaved groups with
   /// out-of-bounds accesses. We ensure we don't speculatively access memory
@@ -900,6 +906,10 @@
   /// Holds the relationships between the members and the interleave group.
   DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
 
+  /// Holds dependences among the memory accesses in the loop. It maps a source
+  /// access to a set of dependent sink accesses.
+  DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
+
   /// \brief The descriptor for a strided memory access.
   struct StrideDescriptor {
     StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size,
@@ -914,6 +924,9 @@
     unsigned Align; // The alignment of this access.
   };
 
+  /// \brief A type for holding instructions and their stride descriptors.
+  typedef std::pair<Instruction *, StrideDescriptor> StrideEntry;
+
   /// \brief Create a new interleave group with the given instruction \p Instr,
   /// stride \p Stride and alignment \p Align.
   ///
@@ -939,6 +952,76 @@
   void collectConstStridedAccesses(
       MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
       const ValueToValueMap &Strides);
+
+  /// \brief Returns true if \p Stride is allowed in an interleaved group.
+  static bool isStrided(int Stride) {
+    unsigned Factor = std::abs(Stride);
+    return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
+  }
+
+  /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
+  bool areDependencesValid() const {
+    return LAI && LAI->getDepChecker().getDependences();
+  }
+
+  /// \brief Returns true if memory accesses \p A and \p B can be reordered.
+  ///
+  /// At least one of the accesses should be strided, and \p A must precede
+  /// \p B in program order.
+  bool canReorderMemAccesses(StrideEntry *A, StrideEntry *B) const {
+
+    // Code motion for interleaved accesses can potentially hoist strided loads
+    // and sink strided stores. The code below checks the legality of the
+    // following two conditions:
+    //
+    // 1. Potentially moving a strided load (B) before any store (A) that
+    //    precedes B, or
+    //
+    // 2. Potentially moving a strided store (A) after any load or store (B)
+    //    that A precedes.
+    //
+    // It's legal to reorder A and B if we know there isn't a dependence from A
+    // to B. Note that this determination is conservative since some
+    // dependences could potentially be reordered safely.
+
+    // A is potentially the source of a dependence.
+    auto *Src = A->first;
+    auto SrcDes = A->second;
+
+    // B is potentially the sink of a dependence.
+    auto *Sink = B->first;
+    auto SinkDes = B->second;
+
+    // Code motion for interleaved accesses can't violate WAR dependences.
+    // Thus, reordering is legal if the source isn't a write.
+    if (!Src->mayWriteToMemory())
+      return true;
+
+    // At least one of the accesses must be strided.
+    if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
+      return true;
+
+    // If dependence information is not available from LoopAccessInfo,
+    // conservatively assume the instructions can't be reordered.
+    if (!areDependencesValid())
+      return false;
+
+    // If we know there is a dependence from source to sink, assume the
+    // instructions can't be reordered. Otherwise, reordering is legal.
+    return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
+  }
+
+  /// \brief Collect the dependences from LoopAccessInfo.
+  ///
+  /// We process the dependences once during the interleaved access analysis to
+  /// enable constant-time dependence queries.
+  void collectDependences() {
+    if (!areDependencesValid())
+      return;
+    auto *Deps = LAI->getDepChecker().getDependences();
+    for (auto Dep : *Deps)
+      Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
+  }
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
@@ -1269,13 +1352,14 @@
                               DominatorTree *DT, TargetLibraryInfo *TLI,
                               AliasAnalysis *AA, Function *F,
                               const TargetTransformInfo *TTI,
-                              LoopAccessAnalysis *LAA,
+                              LoopAccessAnalysis *LAA, LoopInfo *LI,
                               LoopVectorizationRequirements *R,
                               LoopVectorizeHints *H)
       : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
-        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
-        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
-        Requirements(R), Hints(H) {}
+        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr),
+        InterleaveInfo(PSE, L, DT, LI), Induction(nullptr),
+        WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R),
+        Hints(H) {}
 
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
@@ -1887,7 +1971,7 @@
 
     // Check if it is legal to vectorize the loop.
    LoopVectorizationRequirements Requirements;
-    LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
+    LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, LI,
                                   &Requirements, &Hints);
     if (!LVL.canVectorize()) {
      DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
@@ -4932,6 +5016,7 @@
 
 bool LoopVectorizationLegality::canVectorizeMemory() {
   LAI = &LAA->getInfo(TheLoop, Strides);
+  InterleaveInfo.setLAI(LAI);
   auto &OptionalReport = LAI->getReport();
   if (OptionalReport)
     emitAnalysis(VectorizationReport(*OptionalReport));
@@ -5046,7 +5131,17 @@
   // Holds load/store instructions in program order.
   SmallVector<Instruction *, 16> AccessList;
 
-  for (auto *BB : TheLoop->getBlocks()) {
+  // Since the interleaved access analysis requires the load/store
+  // instructions to be maintained in program order, we have to visit the
+  // blocks in the loop in reverse postorder (i.e., in a topological order).
+  // Such an ordering ensures that any load/store that may be executed
+  // before a second load/store will precede the second load/store in
+  // AccessList.
+  LoopBlocksDFS DFS(TheLoop);
+  DFS.perform(LI);
+  for (LoopBlocksDFS::RPOIterator I = DFS.beginRPO(), E = DFS.endRPO(); I != E;
+       ++I) {
+    BasicBlock *BB = *I;
     bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 
     for (auto &I : *BB) {
@@ -5071,13 +5166,6 @@
     Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
     int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);
 
-    // The factor of the corresponding interleave group.
-    unsigned Factor = std::abs(Stride);
-
-    // Ignore the access if the factor is too small or too large.
-    if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
-      continue;
-
     const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
     PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
     unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());
@@ -5091,26 +5179,29 @@
   }
 }
 
-// Analyze interleaved accesses and collect them into interleave groups.
+// Analyze interleaved accesses and collect them into interleaved load and
+// store groups.
+//
+// When generating code for an interleaved load group, we effectively hoist all
+// loads in the group to the location of the first load in program order. When
+// generating code for an interleaved store group, we sink all stores to the
+// location of the last store. This code motion can change the order of load
+// and store instructions and may break dependences.
 //
-// Notice that the vectorization on interleaved groups will change instruction
-// orders and may break dependences. But the memory dependence check guarantees
-// that there is no overlap between two pointers of different strides, element
-// sizes or underlying bases.
+// The code generation strategy mentioned above ensures that we won't violate
+// any write-after-read (WAR) dependences.
 //
-// For pointers sharing the same stride, element size and underlying base, no
-// need to worry about Read-After-Write dependences and Write-After-Read
-// dependences.
+// E.g., for the WAR dependence:  a = A[i];  // (1)
+//                                A[i] = b;  // (2)
 //
-// E.g. The RAW dependence:  A[i] = a;
-//                           b = A[i];
-// This won't exist as it is a store-load forwarding conflict, which has
-// already been checked and forbidden in the dependence check.
+// The store group of (2) is always inserted at or below (2), and the load
+// group of (1) is always inserted at or above (1). Thus, the instructions will
+// never be reordered. All other dependences are checked to ensure the
+// correctness of the instruction reordering.
 //
-// E.g. The WAR dependence:  a = A[i];  // (1)
-//                           A[i] = b;  // (2)
-// The store group of (2) is always inserted at or below (2), and the load group
-// of (1) is always inserted at or above (1). The dependence is safe.
+// The algorithm visits all memory accesses in the loop in bottom-up program
+// order. Program order is established by traversing the blocks in the loop in
+// reverse postorder when collecting the accesses.
 void InterleavedAccessInfo::analyzeInterleaving(
     const ValueToValueMap &Strides) {
   DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
@@ -5122,6 +5213,9 @@
   if (StrideAccesses.empty())
     return;
 
+  // Collect the dependences in the loop.
+  collectDependences();
+
   // Holds all interleaved store groups temporarily.
   SmallSetVector<InterleaveGroup *, 4> StoreGroups;
   // Holds all interleaved load groups temporarily.
   SmallSetVector<InterleaveGroup *, 4> LoadGroups;
@@ -5132,34 +5226,48 @@
   //   1. A and B have the same stride.
   //   2. A and B have the same memory object size.
   //   3. B belongs to the group according to the distance.
- // - // The bottom-up order can avoid breaking the Write-After-Write dependences - // between two pointers of the same base. - // E.g. A[i] = a; (1) - // A[i] = b; (2) - // A[i+1] = c (3) - // We form the group (2)+(3) in front, so (1) has to form groups with accesses - // above (1), which guarantees that (1) is always above (2). for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E; ++I) { Instruction *A = I->first; StrideDescriptor DesA = I->second; - InterleaveGroup *Group = getInterleaveGroup(A); - if (!Group) { - DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n'); - Group = createInterleaveGroup(A, DesA.Stride, DesA.Align); + // Initialize a group for A if it has an allowable stride. Even if we don't + // create a group for A, we continue with the bottom-up algorithm to ensure + // we don't break any of A's dependences. + InterleaveGroup *Group = nullptr; + if (isStrided(DesA.Stride)) { + Group = getInterleaveGroup(A); + if (!Group) { + DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n'); + Group = createInterleaveGroup(A, DesA.Stride, DesA.Align); + } + if (A->mayWriteToMemory()) + StoreGroups.insert(Group); + else + LoadGroups.insert(Group); } - if (A->mayWriteToMemory()) - StoreGroups.insert(Group); - else - LoadGroups.insert(Group); - for (auto II = std::next(I); II != E; ++II) { Instruction *B = II->first; StrideDescriptor DesB = II->second; + // If A and B cannot be reordered, we can't add additional instructions + // to A's group since this may cause illegal code motion. If B is already + // in a group release it. + if (!canReorderMemAccesses(&*II, &*I)) { + if (isInterleaved(B)) { + InterleaveGroup *StoreGroup = getInterleaveGroup(B); + StoreGroups.remove(StoreGroup); + releaseGroup(StoreGroup); + } + break; + } + + // At this point, we've checked for illegal code motion. If either A or B + // isn't strided, there's nothing left to do. + if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride)) + continue; + // Ignore if B is already in a group or B is a different memory operation. if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory()) continue; Index: test/Transforms/LoopVectorize/interleaved-accesses.ll =================================================================== --- test/Transforms/LoopVectorize/interleaved-accesses.ll +++ test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -555,4 +555,215 @@ br i1 %exitcond, label %for.cond.cleanup, label %for.body } +; Check vectorization of interleaved access groups in the presence of +; dependences (PR27626). The following tests check that we don't reorder +; dependent loads and stores when generating code for interleaved access +; groups. Stores should be scalarized because the required code motion would +; break dependences, and the remaining interleaved load groups should have +; gaps. + +; PR27626_0: Ensure a strided store is not moved after a dependent (zero +; distance) strided load. 
+
+; void PR27626_0(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i].x = z;
+;     p[i].y = p[i].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_0(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+
+%pair.i32 = type { i32, i32 }
+define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  store i32 %z, i32* %p_i.x, align 4
+  %0 = load i32, i32* %p_i.x, align 4
+  store i32 %0, i32 *%p_i.y, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; PR27626_1: Ensure a strided load is not moved before a dependent (zero
+; distance) strided store.
+
+; void PR27626_1(struct pair *p, int n) {
+;   int s = 0;
+;   for (int i = 0; i < n; i++) {
+;     p[i].y = p[i].x;
+;     s += p[i].y;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_1(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 0
+; CHECK: store i32 %[[X1:.+]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 2
+; CHECK: store i32 %[[X2:.+]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 4
+; CHECK: store i32 %[[X3:.+]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 6
+; CHECK: store i32 %[[X4:.+]], {{.*}}
+; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
+
+define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  %0 = load i32, i32* %p_i.x, align 4
+  store i32 %0, i32* %p_i.y, align 4
+  %1 = load i32, i32* %p_i.y, align 4
+  %2 = add nsw i32 %1, %s
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %3 = phi i32 [ %2, %for.body ]
+  ret i32 %3
+}
+
+; PR27626_2: Ensure a strided store is not moved after a dependent (negative
+; distance) strided load.
+
+; void PR27626_2(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i].x = z;
+;     p[i].y = p[i - 1].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_2(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+
+define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %i_minus_1 = add nuw nsw i64 %i, -1
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  store i32 %z, i32* %p_i.x, align 4
+  %0 = load i32, i32* %p_i_minus_1.x, align 4
+  store i32 %0, i32 *%p_i.y, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; PR27626_3: Ensure a strided load is not moved before a dependent (negative
+; distance) strided store.
+
+; void PR27626_3(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i + 1].y = p[i].x;
+;     s += p[i].y;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_3(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 0
+; CHECK: store i32 %[[X1:.+]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 2
+; CHECK: store i32 %[[X2:.+]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 4
+; CHECK: store i32 %[[X3:.+]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 6
+; CHECK: store i32 %[[X4:.+]], {{.*}}
+; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
+
+define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
+  %i_plus_1 = add nuw nsw i64 %i, 1
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
+  %0 = load i32, i32* %p_i.x, align 4
+  store i32 %0, i32* %p_i_plus_1.y, align 4
+  %1 = load i32, i32* %p_i.y, align 4
+  %2 = add nsw i32 %1, %s
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %3 = phi i32 [ %2, %for.body ]
+  ret i32 %3
+}
+
 attributes #0 = { "unsafe-fp-math"="true" }
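
The canReorderMemAccesses/collectDependences pair in the patch reduces each legality query to a single hash-map lookup. The standalone sketch below is not part of the patch; it models the same logic over a simplified access representation, and the names ToyAccess, DependenceMap, and canReorder are invented for the illustration rather than LLVM APIs.

  // Minimal sketch of the constant-time reordering query over a toy access
  // model (not LLVM's). Accesses are identified by their program-order index.
  #include <cstdlib>
  #include <iostream>
  #include <map>
  #include <set>

  static const int MaxInterleaveGroupFactor = 8;

  struct ToyAccess {
    int Id;        // Position in program order.
    bool IsWrite;  // True for stores.
    int Stride;    // Stride in elements; 0 or 1 means not interleaved.
  };

  // Mirrors isStrided: only strides that could form an interleaved group.
  static bool isStrided(int Stride) {
    int Factor = std::abs(Stride);
    return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
  }

  // Maps a dependence source to its sinks, like the Dependences map that
  // collectDependences() fills from LoopAccessInfo.
  typedef std::map<int, std::set<int> > DependenceMap;

  // A precedes B in program order. "Reordering" is the code motion performed
  // for interleaved groups: hoisting strided loads and sinking strided stores.
  static bool canReorder(const ToyAccess &A, const ToyAccess &B,
                         const DependenceMap &Deps) {
    // WAR dependences can't be violated by this code motion, so a non-writing
    // source is always safe.
    if (!A.IsWrite)
      return true;
    // If neither access is strided, no interleaving-related motion happens.
    if (!isStrided(A.Stride) && !isStrided(B.Stride))
      return true;
    // Otherwise, any recorded dependence from A to B forbids the reordering.
    DependenceMap::const_iterator It = Deps.find(A.Id);
    return It == Deps.end() || It->second.count(B.Id) == 0;
  }

  int main() {
    // Models PR27626_0: p[i].x = z (strided store), then p[i].x is reloaded
    // (strided load) -- a zero-distance store-to-load dependence.
    ToyAccess Store = {0, true, 2};
    ToyAccess Load = {1, false, 2};
    DependenceMap Deps;
    Deps[Store.Id].insert(Load.Id);
    std::cout << "can reorder: " << canReorder(Store, Load, Deps) << "\n"; // 0
    return 0;
  }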
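
The bottom-up loop in analyzeInterleaving both grows groups and dissolves one when it meets a pair that cannot be reordered. The sketch below models that control flow on the PR27626_0 access pattern, again with toy types rather than LLVM's; the membership rule here is deliberately simplified (same access kind and both strided), whereas the real pass also checks size and distance.

  // Toy model of the bottom-up grouping walk: for each access A (from the
  // bottom of the loop upward), scan the accesses above it; stop growing A's
  // group at the first access that may not be reordered with A, and dissolve
  // the group that access already joined.
  #include <cstdlib>
  #include <iostream>
  #include <map>
  #include <set>
  #include <vector>

  struct ToyAccess { int Id; bool IsWrite; int Stride; };
  typedef std::map<int, std::set<int> > DependenceMap;

  static bool isStrided(int Stride) {
    int Factor = std::abs(Stride);
    return Factor >= 2 && Factor <= 8;  // Stand-in for MaxInterleaveGroupFactor.
  }

  // First argument precedes the second in program order (see previous sketch).
  static bool canReorder(const ToyAccess &Early, const ToyAccess &Late,
                         const DependenceMap &Deps) {
    if (!Early.IsWrite) return true;
    if (!isStrided(Early.Stride) && !isStrided(Late.Stride)) return true;
    DependenceMap::const_iterator It = Deps.find(Early.Id);
    return It == Deps.end() || It->second.count(Late.Id) == 0;
  }

  int main() {
    // The PR27626_0 pattern in program order:
    //   0: store p[i].x   (strided store)
    //   1: load  p[i].x   (strided load, depends on 0)
    //   2: store p[i].y   (strided store)
    ToyAccess A0 = {0, true, 2}, A1 = {1, false, 2}, A2 = {2, true, 2};
    std::vector<ToyAccess> Accesses;
    Accesses.push_back(A0);
    Accesses.push_back(A1);
    Accesses.push_back(A2);
    DependenceMap Deps;
    Deps[0].insert(1);

    std::map<int, int> GroupOf;  // Access id -> group id.
    int NextGroup = 0;

    for (int I = (int)Accesses.size() - 1; I >= 0; --I) {
      const ToyAccess &A = Accesses[I];
      if (isStrided(A.Stride) && !GroupOf.count(A.Id))
        GroupOf[A.Id] = NextGroup++;
      for (int J = I - 1; J >= 0; --J) {
        const ToyAccess &B = Accesses[J];
        if (!canReorder(B, A, Deps)) {
          // Conflict: dissolve the group B already joined, then stop trying
          // to grow A's group past B.
          if (GroupOf.count(B.Id)) {
            int Released = GroupOf[B.Id];
            for (std::map<int, int>::iterator It = GroupOf.begin();
                 It != GroupOf.end();) {
              if (It->second == Released)
                GroupOf.erase(It++);
              else
                ++It;
            }
          }
          break;
        }
        // Toy membership rule: same kind of access and both strided. The real
        // pass also requires matching size and a distance that fits the group.
        if (!GroupOf.count(B.Id) && GroupOf.count(A.Id) &&
            A.IsWrite == B.IsWrite && isStrided(A.Stride) && isStrided(B.Stride))
          GroupOf[B.Id] = GroupOf[A.Id];
      }
    }

    // With the inputs above: the load keeps a group of its own, while the two
    // stores no longer share one.
    for (size_t K = 0; K < Accesses.size(); ++K) {
      int Id = Accesses[K].Id;
      if (GroupOf.count(Id))
        std::cout << "access " << Id << " -> group " << GroupOf[Id] << "\n";
      else
        std::cout << "access " << Id << " -> ungrouped\n";
    }
    return 0;
  }

Running this leaves the two stores without a common group; the real pass additionally discards store groups with gaps, which is consistent with the PR27626_0 CHECK lines expecting a wide interleaved load followed by scalarized stores.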
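
collectConstStridedAccesses now walks the loop blocks in reverse postorder so that AccessList reflects program order even when the loop body branches. The following small, self-contained illustration of why reverse postorder has that property uses a hand-built diamond-shaped graph instead of LLVM's LoopBlocksDFS; it is only a sketch of the idea.

  // Compute a DFS postorder of a diamond-shaped body and print its reverse:
  // each block appears before its successors, i.e., a topological order.
  #include <iostream>
  #include <map>
  #include <string>
  #include <vector>

  typedef std::map<std::string, std::vector<std::string> > Graph;

  static void postOrder(const Graph &G, const std::string &N,
                        std::map<std::string, bool> &Seen,
                        std::vector<std::string> &Out) {
    Seen[N] = true;
    Graph::const_iterator It = G.find(N);
    if (It != G.end())
      for (size_t I = 0; I < It->second.size(); ++I)
        if (!Seen[It->second[I]])
          postOrder(G, It->second[I], Seen, Out);
    Out.push_back(N);
  }

  int main() {
    // A loop body shaped like: header -> {then, else} -> latch.
    Graph G;
    G["header"].push_back("then");
    G["header"].push_back("else");
    G["then"].push_back("latch");
    G["else"].push_back("latch");

    std::map<std::string, bool> Seen;
    std::vector<std::string> PO;
    postOrder(G, "header", Seen, PO);

    // Reverse postorder prints the header first and the latch last, so loads
    // and stores collected in this order respect program order.
    for (std::vector<std::string>::reverse_iterator It = PO.rbegin();
         It != PO.rend(); ++It)
      std::cout << *It << "\n";
    return 0;
  }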