Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -827,8 +827,8 @@
 class InterleavedAccessInfo {
 public:
   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
-                        DominatorTree *DT)
-      : PSE(PSE), TheLoop(L), DT(DT), RequiresScalarEpilogue(false) {}
+                        DominatorTree *DT, LoopInfo *LI)
+      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), RequiresScalarEpilogue(false) {}
 
   ~InterleavedAccessInfo() {
     SmallSet<InterleaveGroup *, 4> DelSet;
@@ -869,6 +869,7 @@
   PredicatedScalarEvolution &PSE;
   Loop *TheLoop;
   DominatorTree *DT;
+  LoopInfo *LI;
 
   /// True if the loop may contain non-reversed interleaved groups with
   /// out-of-bounds accesses. We ensure we don't speculatively access memory
@@ -917,6 +918,21 @@
   void collectConstStridedAccesses(
       MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
       const ValueToValueMap &Strides);
+
+  /// \brief Returns true if there is a loop-independent read-after-write
+  /// dependence between instructions \p R and \p W.
+  ///
+  /// Such a dependence exists if the dependence distance between the
+  /// instructions is zero and the write may be executed before the read. It
+  /// should be enough to consider just zero-distance dependences (and not
+  /// worry about the effects of vectorization) since store groups with gaps
+  /// will be removed.
+  bool isLoopIndependentRAW(Instruction *R, Instruction *W, int Distance) {
+    if (!Distance && R->mayReadFromMemory() && W->mayWriteToMemory() &&
+        !DT->dominates(R, W))
+      return true;
+    return false;
+  }
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
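
For straight-line code, the dominance query in isLoopIndependentRAW reduces to
a program-order comparison: within a single basic block, W dominates R exactly
when W appears before R. The sketch below is a self-contained model of the
predicate under that simplification; the Access struct and the free function
are illustrative stand-ins, not LLVM APIs.

    #include <cstdio>

    // Hypothetical stand-in for an instruction, carrying only the fields the
    // predicate needs. Pos is the position within a single basic block, where
    // "W dominates R" degenerates to "W appears before R".
    struct Access {
      bool Reads, Writes;
      int Pos;
    };

    // Mirrors the predicate above for straight-line code: a zero-distance
    // dependence is a loop-independent RAW only if R may read, W may write,
    // and W is not already known to execute before R.
    bool isLoopIndependentRAW(const Access &R, const Access &W, int Distance) {
      return Distance == 0 && R.Reads && W.Writes && !(R.Pos < W.Pos);
    }

    int main() {
      Access Store = {false, true, 0}; // A[i] = a;
      Access Load = {true, false, 1};  // b = A[i];
      // The store precedes the load, so sinking the store below the load (to
      // a store group's insert position) would break the dependence.
      printf("%d\n", isLoopIndependentRAW(Load, Store, 0)); // prints 1
    }

Compiled and run, this flags the store/load pair from the RAW example in the
comments further down in this patch.
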
@@ -1247,13 +1263,14 @@
                              DominatorTree *DT, TargetLibraryInfo *TLI,
                              AliasAnalysis *AA, Function *F,
                              const TargetTransformInfo *TTI,
-                             LoopAccessAnalysis *LAA,
+                             LoopAccessAnalysis *LAA, LoopInfo *LI,
                              LoopVectorizationRequirements *R,
                              LoopVectorizeHints *H)
       : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
-        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
-        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
-        Requirements(R), Hints(H) {}
+        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr),
+        InterleaveInfo(PSE, L, DT, LI), Induction(nullptr),
+        WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R),
+        Hints(H) {}
 
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
@@ -1854,7 +1871,7 @@
 
     // Check if it is legal to vectorize the loop.
     LoopVectorizationRequirements Requirements;
-    LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
+    LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, LI,
                                   &Requirements, &Hints);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
@@ -4974,7 +4991,17 @@
   // Holds load/store instructions in program order.
   SmallVector<Instruction *, 16> AccessList;
 
-  for (auto *BB : TheLoop->getBlocks()) {
+  // Since it's desired that the load/store instructions be maintained in
+  // "program order" for the interleaved access analysis, we have to visit the
+  // blocks in the loop in reverse postorder (i.e., in a topological order).
+  // Such an ordering will ensure that any load/store that may be executed
+  // before a second load/store will precede the second load/store in the
+  // AccessList.
+  LoopBlocksDFS DFS(TheLoop);
+  DFS.perform(LI);
+  for (LoopBlocksDFS::RPOIterator I = DFS.beginRPO(), E = DFS.endRPO(); I != E;
+       ++I) {
+    BasicBlock *BB = *I;
     bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 
     for (auto &I : *BB) {
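
The claim that reverse postorder gives the needed "program order" rests on RPO
being a topological order of the loop body once the back edge is ignored: a
DFS appends a block to the postorder only after all of its successors, so the
reversed sequence lists every block before any of its successors. A minimal
stand-alone demonstration (plain C++ on a toy CFG, not LLVM's LoopBlocksDFS)
for a diamond-shaped body:

    #include <cstdio>
    #include <vector>

    // DFS postorder: a node is appended only after all of its unvisited
    // successors, so the reverse lists every node before its successors.
    void postorder(int N, const std::vector<std::vector<int>> &Succ,
                   std::vector<bool> &Seen, std::vector<int> &PO) {
      Seen[N] = true;
      for (int S : Succ[N])
        if (!Seen[S])
          postorder(S, Succ, Seen, PO);
      PO.push_back(N);
    }

    int main() {
      // Diamond A -> {B, C} -> D, as in an if-then-else loop body.
      std::vector<std::vector<int>> Succ = {{1, 2}, {3}, {3}, {}};
      std::vector<bool> Seen(4, false);
      std::vector<int> PO;
      postorder(0, Succ, Seen, PO);
      printf("RPO:");
      for (auto I = PO.rbegin(), E = PO.rend(); I != E; ++I)
        printf(" %c", 'A' + *I);
      printf("\n"); // prints "RPO: A C B D"; "A B C D" is equally valid
    }

Either valid RPO places A first and D last, so an access in D can never land
in AccessList ahead of an access in A, B, or C that may execute before it.
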
@@ -5027,18 +5054,19 @@
 // sizes or underlying bases.
 //
 // For pointers sharing the same stride, element size and underlying base, no
-// need to worry about Read-After-Write dependences and Write-After-Read
-// dependences.
-//
-// E.g. The RAW dependence: A[i] = a;
-//                          b = A[i];
-// This won't exist as it is a store-load forwarding conflict, which has
-// already been checked and forbidden in the dependence check.
+// need to worry about Write-After-Read dependences. However, Read-After-Write
+// dependences must be considered.
 //
 // E.g. The WAR dependence:  a = A[i];  // (1)
 //                           A[i] = b;  // (2)
 // The store group of (2) is always inserted at or below (2), and the load
 // group of (1) is always inserted at or above (1). The dependence is safe.
+//
+// E.g. The RAW dependence:  A[i] = a;  // (1)
+//                           b = A[i];  // (2)
+// We have to ensure that the insert position of the store group of (1) is
+// not below the insert position of the load group of (2). If this condition
+// does not hold, (1) and (2) would be reordered, breaking the dependence.
 void InterleavedAccessInfo::analyzeInterleaving(
     const ValueToValueMap &Strides) {
   DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
@@ -5054,6 +5082,10 @@
   SmallSetVector<InterleaveGroup *, 4> StoreGroups;
   // Holds all interleaved load groups temporarily.
   SmallSetVector<InterleaveGroup *, 4> LoadGroups;
+  // Holds store instructions whose interleaved groups must be removed from
+  // consideration due to the possibility of breaking read-after-write
+  // dependences.
+  SmallPtrSet<Instruction *, 4> StoresToRemove;
 
   // Search the load-load/write-write pair B-A in bottom-up order and try to
   // insert B into the interleave group of A according to 3 rules:
@@ -5084,14 +5116,14 @@
     else
       LoadGroups.insert(Group);
 
+    // Holds store instructions that may be involved in a broken
+    // read-after-write dependence.
+    SmallVector<Instruction *, 4> LoopIndependentRAWStores;
+
     for (auto II = std::next(I); II != E; ++II) {
       Instruction *B = II->first;
       StrideDescriptor DesB = II->second;
 
-      // Ignore if B is already in a group or B is a different memory operation.
-      if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
-        continue;
-
       // Check the rule 1 and 2.
       if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size)
         continue;
@@ -5104,11 +5136,55 @@
 
       int DistanceToA = DistToA->getAPInt().getSExtValue();
 
+      // Check for a loop-independent read-after-write dependence from B (a
+      // possible store) to A (a possible load).
+      //
+      // If a dependence may exist, and the store is already in an interleaved
+      // group, the store will be "moved" to the insert position of the group.
+      // If the insert position may follow the load (it should, given the
+      // bottom-up order in which we are visiting the instructions), the
+      // read-after-write dependence would be broken. Thus, mark the store for
+      // removal, and prevent further loads from being added to the group. If
+      // additional loads were added to the group, the current load would
+      // "move" to a new location, breaking the dependence.
+      //
+      // If a dependence may exist, and the store is not already in an
+      // interleaved group, this is not necessarily a problem. We don't yet
+      // know if the load or store will be "moved" to a new insert position.
+      // Save the store for later consideration. If we try to add another load
+      // to the group, the dependence would then be broken.
+      if (isLoopIndependentRAW(A, B, DistanceToA)) {
+        if (isInterleaved(B))
+          if (!DT->dominates(getInterleaveGroup(B)->getInsertPos(), A)) {
+            StoresToRemove.insert(B);
+            break;
+          }
+        LoopIndependentRAWStores.push_back(B);
+      }
+
+      // Ignore if B is already in a group or B is a different memory operation.
+      if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
+        continue;
+
       // Skip if the distance is not multiple of size as they are not in the
       // same group.
      if (DistanceToA % static_cast<int>(DesA.Size))
        continue;
 
+      // Ensure that "moving" A (a load) to the insert position of B (a load)
+      // will not violate a loop-independent read-after-write dependence.
+      //
+      // The store instructions involved in any read-after-write dependences
+      // have already been recorded. If a dependence would be violated, mark
+      // the store for removal, and prevent further loads from being added to
+      // the group by breaking.
+      if (A->mayReadFromMemory() && !LoopIndependentRAWStores.empty()) {
+        for (Instruction *Store : LoopIndependentRAWStores)
+          if (!DT->dominates(Store, B))
+            StoresToRemove.insert(Store);
+        break;
+      }
+
       // The index of B is the index of A plus the related index to A.
       int IndexB =
           Group->getIndex(A) + DistanceToA / static_cast<int>(DesA.Size);
@@ -5126,6 +5202,17 @@
     }   // Iteration on instruction B
   }     // Iteration on instruction A
 
+  // Remove interleaved store groups having members involved in a potentially
+  // broken read-after-write dependence. We simply remove the entire group
+  // instead of the offending members since store groups with gaps will be
+  // removed anyway.
+  for (Instruction *Store : StoresToRemove)
+    if (isInterleaved(Store)) {
+      InterleaveGroup *StoreGroup = getInterleaveGroup(Store);
+      StoreGroups.remove(StoreGroup);
+      releaseGroup(StoreGroup);
+    }
+
   // Remove interleaved store groups with gaps.
   for (InterleaveGroup *Group : StoreGroups)
     if (Group->getNumMembers() != Group->getFactor())
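
The regression test added below exercises exactly this hazard: in each
iteration, p[i].x = p[i].y creates a zero-distance RAW on p[i].x with the load
that follows it. If both loads were placed in one group (executing at the
first load's position) and both stores in another (executing at the last
store's position), the second load would read a stale p[i].x. The following
stand-alone sketch (hand-written plain C++ mirroring the test's loop, not
vectorizer output) contrasts the scalar semantics with what such a broken
interleaving would compute:

    #include <cstdio>

    struct Pair { long x, y; };

    int main() {
      // Scalar semantics of the loop in the test below: after each
      // iteration, both fields equal the old value of y.
      Pair Scalar[2] = {{1, 2}, {3, 4}};
      for (auto &P : Scalar) {
        P.x = P.y;
        P.y = P.x;
      }

      // A broken interleaving hoists BOTH loads to the first load's insert
      // position, above BOTH stores, so the second load reads the stale x
      // instead of the value just stored to it.
      Pair Broken[2] = {{1, 2}, {3, 4}};
      for (auto &P : Broken) {
        long LoadY = P.y; // first load
        long LoadX = P.x; // second load, wrongly hoisted above the store
        P.x = LoadY;      // first store
        P.y = LoadX;      // second store: stale x instead of LoadY
      }

      printf("scalar: {%ld,%ld}  broken: {%ld,%ld}\n", Scalar[0].x,
             Scalar[0].y, Broken[0].x, Broken[0].y);
      // Prints: scalar: {2,2}  broken: {2,1}
    }
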
Index: test/Transforms/LoopVectorize/interleaved-accesses.ll
===================================================================
--- test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -555,4 +555,76 @@
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
+; Check vectorization of interleaved access groups in the presence of a
+; loop-independent read-after-write dependence. When generating code,
+; interleaved loads are inserted at the location of the first load, and
+; interleaved stores are inserted at the location of the last store. This
+; load/store reordering can break RAW dependences.
+;
+; In this test, we create two separate load groups with gaps to avoid breaking
+; the dependence. We do the same for the stores. Because the interleaved store
+; groups have gaps, they are scalarized. Because the interleaved load groups
+; have gaps, we must execute at least one iteration of the scalar epilogue
+; loop.
+;
+; struct pair {
+;   int x;
+;   int y;
+; };
+;
+; void PR27626(struct pair *p, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i].x = p[i].y;
+;     p[i].y = p[i].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i64>, <8 x i64>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i64> %[[L1]], i32 0
+; CHECK: store i64 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i64> %[[L1]], i32 2
+; CHECK: store i64 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i64> %[[L1]], i32 4
+; CHECK: store i64 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i64> %[[L1]], i32 6
+; CHECK: store i64 %[[X4]], {{.*}}
+; CHECK: %[[L2:.+]] = load <8 x i64>, <8 x i64>* {{.*}}
+; CHECK: %[[X5:.+]] = extractelement <8 x i64> %[[L2]], i32 0
+; CHECK: store i64 %[[X5]], {{.*}}
+; CHECK: %[[X6:.+]] = extractelement <8 x i64> %[[L2]], i32 2
+; CHECK: store i64 %[[X6]], {{.*}}
+; CHECK: %[[X7:.+]] = extractelement <8 x i64> %[[L2]], i32 4
+; CHECK: store i64 %[[X7]], {{.*}}
+; CHECK: %[[X8:.+]] = extractelement <8 x i64> %[[L2]], i32 6
+; CHECK: store i64 %[[X8]], {{.*}}
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.end, label %scalar.ph
+
+define void @PR27626(%pair* %p, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p.0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+  %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %0 = load i64, i64* %p.1, align 8
+  store i64 %0, i64* %p.0, align 8
+  %1 = load i64, i64* %p.0, align 8
+  store i64 %1, i64* %p.1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
 attributes #0 = { "unsafe-fp-math"="true" }