Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -833,8 +833,9 @@
 class InterleavedAccessInfo {
 public:
   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
-                        DominatorTree *DT)
-      : PSE(PSE), TheLoop(L), DT(DT), RequiresScalarEpilogue(false) {}
+                        DominatorTree *DT, LoopInfo *LI)
+      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr),
+        RequiresScalarEpilogue(false) {}
 
   ~InterleavedAccessInfo() {
     SmallSet<InterleaveGroup *, 4> DelSet;
@@ -875,6 +876,9 @@
   /// out-of-bounds requires a scalar epilogue iteration for correctness.
   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
 
+  /// \brief Initialize the LoopAccessInfo used for dependence checking.
+  void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
@@ -883,6 +887,8 @@
   PredicatedScalarEvolution &PSE;
   Loop *TheLoop;
   DominatorTree *DT;
+  LoopInfo *LI;
+  const LoopAccessInfo *LAI;
 
   /// True if the loop may contain non-reversed interleaved groups with
   /// out-of-bounds accesses. We ensure we don't speculatively access memory
@@ -892,6 +898,10 @@
   /// Holds the relationships between the members and the interleave group.
   DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
 
+  /// Holds dependences among the memory accesses in the loop. It maps a source
+  /// access to a set of dependent sink accesses.
+  DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
+
   /// \brief The descriptor for a strided memory access.
   struct StrideDescriptor {
     StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size,
@@ -931,6 +941,40 @@
   void collectConstStridedAccesses(
       MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
       const ValueToValueMap &Strides);
+
+  /// \brief Returns true if \p Stride is allowed in an interleaved group.
+  static bool isStrideAllowed(int Stride) {
+    unsigned Factor = std::abs(Stride);
+    return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
+  }
+
+  /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
+  bool areDependencesValid() const {
+    return LAI && LAI->getDepChecker().getDependences();
+  }
+
+  /// \brief Returns true if \p Source and \p Sink can be reordered.
+  ///
+  /// We can reorder Source and Sink if we know there isn't a dependence from
+  /// Source to Sink. Note that this determination is conservative since some
+  /// dependences, such as some backward dependences, could potentially be
+  /// reordered.
+  bool canReorderMemAccesses(Instruction *Source, Instruction *Sink) const {
+    return areDependencesValid() && (!Dependences.count(Source) ||
+                                     !Dependences.lookup(Source).count(Sink));
+  }
+
+  /// \brief Collect the dependences from LoopAccessInfo.
+  ///
+  /// We process the dependences once during the interleaved access analysis to
+  /// enable constant-time dependence queries.
+  void collectDependences() {
+    if (!areDependencesValid())
+      return;
+    auto *Deps = LAI->getDepChecker().getDependences();
+    for (auto Dep : *Deps)
+      Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
+  }
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
@@ -1261,13 +1305,14 @@
                             DominatorTree *DT, TargetLibraryInfo *TLI,
                             AliasAnalysis *AA, Function *F,
                             const TargetTransformInfo *TTI,
-                            LoopAccessAnalysis *LAA,
+                            LoopAccessAnalysis *LAA, LoopInfo *LI,
                             LoopVectorizationRequirements *R,
                             LoopVectorizeHints *H)
       : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
-        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
-        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
-        Requirements(R), Hints(H) {}
+        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr),
+        InterleaveInfo(PSE, L, DT, LI), Induction(nullptr),
+        WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R),
+        Hints(H) {}
 
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
@@ -1873,7 +1918,7 @@
 
   // Check if it is legal to vectorize the loop.
   LoopVectorizationRequirements Requirements;
-  LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
+  LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, LI,
                                 &Requirements, &Hints);
   if (!LVL.canVectorize()) {
     DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
@@ -4879,6 +4924,7 @@
 
 bool LoopVectorizationLegality::canVectorizeMemory() {
   LAI = &LAA->getInfo(TheLoop, Strides);
+  InterleaveInfo.setLAI(LAI);
   auto &OptionalReport = LAI->getReport();
   if (OptionalReport)
     emitAnalysis(VectorizationReport(*OptionalReport));
@@ -4993,7 +5039,17 @@
   // Holds load/store instructions in program order.
   SmallVector<Instruction *, 16> AccessList;
 
-  for (auto *BB : TheLoop->getBlocks()) {
+  // Since it's desired that the load/store instructions be maintained in
+  // "program order" for the interleaved access analysis, we have to visit the
+  // blocks in the loop in reverse postorder (i.e., in a topological order).
+  // Such an ordering will ensure that any load/store that may be executed
+  // before a second load/store will precede the second load/store in the
+  // AccessList.
+  LoopBlocksDFS DFS(TheLoop);
+  DFS.perform(LI);
+  for (LoopBlocksDFS::RPOIterator I = DFS.beginRPO(), E = DFS.endRPO(); I != E;
+       ++I) {
+    BasicBlock *BB = *I;
     bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 
     for (auto &I : *BB) {
@@ -5018,13 +5074,6 @@
     Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
     int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);
 
-    // The factor of the corresponding interleave group.
-    unsigned Factor = std::abs(Stride);
-
-    // Ignore the access if the factor is too small or too large.
-    if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
-      continue;
-
     const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
     PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
     unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());
@@ -5038,26 +5087,29 @@
   }
 }
 
-// Analyze interleaved accesses and collect them into interleave groups.
+// Analyze interleaved accesses and collect them into interleaved load and
+// store groups.
 //
-// Notice that the vectorization on interleaved groups will change instruction
-// orders and may break dependences. But the memory dependence check guarantees
-// that there is no overlap between two pointers of different strides, element
-// sizes or underlying bases.
+// When generating code for an interleaved load group, we effectively hoist all
+// loads in the group to the location of the first load in program order. When
+// generating code for an interleaved store group, we sink all stores to the
+// location of the last store. This code motion can change the order of load
+// and store instructions and may break dependences.
 //
-// For pointers sharing the same stride, element size and underlying base, no
-// need to worry about Read-After-Write dependences and Write-After-Read
-// dependences.
+// The code generation strategy mentioned above ensures that we won't violate
+// any write-after-read (WAR) dependences.
 //
-// E.g. The RAW dependence: A[i] = a;
-//                          b = A[i];
-// This won't exist as it is a store-load forwarding conflict, which has
-// already been checked and forbidden in the dependence check.
+// E.g., for the WAR dependence:  a = A[i];  // (1)
+//                                A[i] = b;  // (2)
 //
-// E.g. The WAR dependence:  a = A[i];  // (1)
-//                           A[i] = b;  // (2)
-// The store group of (2) is always inserted at or below (2), and the load group
-// of (1) is always inserted at or above (1). The dependence is safe.
+// The store group of (2) is always inserted at or below (2), and the load
+// group of (1) is always inserted at or above (1). Thus, the instructions will
+// never be reordered. All other dependences are checked to ensure the
+// correctness of the instruction reordering.
+//
+// The algorithm visits all memory accesses in the loop in bottom-up program
+// order. Program order is established by traversing the blocks in the loop in
+// reverse postorder when collecting the accesses.
 void InterleavedAccessInfo::analyzeInterleaving(
     const ValueToValueMap &Strides) {
   DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
@@ -5069,6 +5121,9 @@
   if (StrideAccesses.empty())
     return;
 
+  // Collect the dependences in the loop.
+  collectDependences();
+
   // Holds all interleaved store groups temporarily.
   SmallSetVector<InterleaveGroup *, 4> StoreGroups;
   // Holds all interleaved load groups temporarily.
@@ -5079,34 +5134,51 @@
   //   1. A and B have the same stride.
   //   2. A and B have the same memory object size.
   //   3. B belongs to the group according to the distance.
-  //
-  // The bottom-up order can avoid breaking the Write-After-Write dependences
-  // between two pointers of the same base.
-  // E.g.  A[i] = a;   (1)
-  //       A[i] = b;   (2)
-  //       A[i+1] = c  (3)
-  // We form the group (2)+(3) in front, so (1) has to form groups with accesses
-  // above (1), which guarantees that (1) is always above (2).
   for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E;
        ++I) {
     Instruction *A = I->first;
     StrideDescriptor DesA = I->second;
 
-    InterleaveGroup *Group = getInterleaveGroup(A);
-    if (!Group) {
-      DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
-      Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
+    // Initialize a group for A if it has an allowable stride. Even if we don't
+    // create a group for A, we continue with the bottom-up algorithm to ensure
+    // we don't break any of A's dependences.
+    InterleaveGroup *Group = nullptr;
+    if (isStrideAllowed(DesA.Stride)) {
+      Group = getInterleaveGroup(A);
+      if (!Group) {
+        DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
+        Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
+      }
+      if (A->mayWriteToMemory())
+        StoreGroups.insert(Group);
+      else
+        LoadGroups.insert(Group);
     }
 
-    if (A->mayWriteToMemory())
-      StoreGroups.insert(Group);
-    else
-      LoadGroups.insert(Group);
-
     for (auto II = std::next(I); II != E; ++II) {
       Instruction *B = II->first;
       StrideDescriptor DesB = II->second;
 
+      // If instructions A and B cannot be reordered because they're in a
+      // dependence relation, we can't add additional instructions to A's group
+      // since this may cause illegal code motion. If B is already in a group,
+      // release it. We check that B is a write because our code motion won't
+      // ever violate WAR dependences.
+      if (B->mayWriteToMemory() && !canReorderMemAccesses(B, A)) {
+        if (isInterleaved(B)) {
+          InterleaveGroup *StoreGroup = getInterleaveGroup(B);
+          StoreGroups.remove(StoreGroup);
+          releaseGroup(StoreGroup);
+        }
+        break;
+      }
+
+      // At this point, we've checked for code motion that would break
+      // dependences. If A or B has a stride that's not allowed, there's
+      // nothing left to do.
+      if (!isStrideAllowed(DesA.Stride) || !isStrideAllowed(DesB.Stride))
+        continue;
+
       // Ignore if B is already in a group or B is a different memory operation.
       if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
         continue;
Index: test/Transforms/LoopVectorize/interleaved-accesses.ll
===================================================================
--- test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -555,4 +555,108 @@
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
+; Check vectorization of interleaved access groups in the presence of
+; dependences. When generating code, interleaved loads are inserted at the
+; location of the first load in a group, and interleaved stores are inserted at
+; the location of the last store. However, all dependent accesses must still
+; occur in their original program order. Thus, we prevent group formation for
+; dependent accesses.
+;
+; In the following tests, the stores are scalarized because the required code
+; motion would break dependences. The remaining interleaved load groups have
+; gaps, so we must execute at least one iteration of the scalar epilogue loop.
+
+; void PR27626_0(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i].x = z;
+;     p[i].y = p[i].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_0(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.end, label %scalar.ph
+
+%pair.i32 = type { i32, i32 }
+define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  store i32 %z, i32* %p_i.x, align 8
+  %0 = load i32, i32* %p_i.x, align 8
+  store i32 %0, i32 *%p_i.y, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; void PR27626_1(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i + 1].x = z;
+;     p[i].y = p[i].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_1(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.end, label %scalar.ph
+
+define void @PR27626_1(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %i_plus_1 = add nuw nsw i64 %i, 1
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i_plus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  store i32 %z, i32* %p_i_plus_1.x, align 8
+  %0 = load i32, i32* %p_i.x, align 8
+  store i32 %0, i32 *%p_i.y, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
 attributes #0 = { "unsafe-fp-math"="true" }
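Note: the standalone C++ sketch below is not part of the patch. It only illustrates, with hypothetical names (Access, Dependence, DependenceMap, canReorder), the bookkeeping idea behind collectDependences() and canReorderMemAccesses(): dependences are gathered once into a source-to-sinks map so that each reorderability query made during the bottom-up group construction is a constant-time lookup.

// Standalone illustration of the patch's dependence bookkeeping. The names
// used here are illustrative only and do not correspond to LLVM APIs.
#include <cstdio>
#include <unordered_map>
#include <unordered_set>
#include <vector>

using Access = int; // Stand-in for Instruction *.

struct Dependence {
  Access Source; // Earlier access in program order.
  Access Sink;   // Later access that depends on Source.
};

class DependenceMap {
  std::unordered_map<Access, std::unordered_set<Access>> Dependences;

public:
  // Populate the map once, mirroring what collectDependences() does with the
  // dependences reported by the dependence checker.
  explicit DependenceMap(const std::vector<Dependence> &Deps) {
    for (const Dependence &D : Deps)
      Dependences[D.Source].insert(D.Sink);
  }

  // Constant-time query, mirroring canReorderMemAccesses(): Source and Sink
  // may be reordered only if no dependence runs from Source to Sink.
  bool canReorder(Access Source, Access Sink) const {
    auto It = Dependences.find(Source);
    return It == Dependences.end() || !It->second.count(Sink);
  }
};

int main() {
  // Accesses 0..2 in program order; access 2 reads what access 1 wrote
  // (like p[i].y = p[i].x following p[i].x = z in PR27626_0).
  DependenceMap DM({{1, 2}});
  std::printf("reorder(0,2): %d\n", DM.canReorder(0, 2)); // 1: no dependence
  std::printf("reorder(1,2): %d\n", DM.canReorder(1, 2)); // 0: would break RAW
  return 0;
}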