Index: include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- include/llvm/Analysis/LoopAccessAnalysis.h
+++ include/llvm/Analysis/LoopAccessAnalysis.h
@@ -264,6 +264,14 @@
   SmallVector<Instruction *, 4>
   getInstructionsForAccess(Value *Ptr, bool isWrite) const;
 
+  /// \brief Check the dependence for two accesses with the same stride \p
+  /// Stride. \p Distance is the positive distance between the accesses and \p
+  /// TypeByteSize is the type size in bytes of the accesses.
+  ///
+  /// \returns true if the accesses are independent.
+  static bool areStridedAccessesIndependent(unsigned Distance, unsigned Stride,
+                                            unsigned TypeByteSize);
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks, and
   /// applies dynamic knowledge to simplify SCEV expressions and convert them
Index: lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- lib/Analysis/LoopAccessAnalysis.cpp
+++ lib/Analysis/LoopAccessAnalysis.cpp
@@ -1113,13 +1113,9 @@
   return false;
 }
 
-/// \brief Check the dependence for two accesses with the same stride \p Stride.
-/// \p Distance is the positive distance and \p TypeByteSize is type size in
-/// bytes.
-///
-/// \returns true if they are independent.
-static bool areStridedAccessesIndependent(unsigned Distance, unsigned Stride,
-                                          unsigned TypeByteSize) {
+bool MemoryDepChecker::areStridedAccessesIndependent(unsigned Distance,
+                                                     unsigned Stride,
+                                                     unsigned TypeByteSize) {
   assert(Stride > 1 && "The stride must be greater than 1");
   assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
   assert(Distance > 0 && "The distance must be non-zero");
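(Editorial aside, not part of the patch: the sketch below mirrors the divisibility test the existing static helper is assumed to implement, so the newly promoted MemoryDepChecker::areStridedAccessesIndependent can be read on concrete numbers. Two accesses with a common element stride are independent when their distance, measured in elements, is not a multiple of that stride.)

// Standalone illustration; the helper name is hypothetical and the logic is
// an assumption about the existing implementation, not a quote of it.
#include <cassert>

static bool stridedAccessesIndependent(unsigned Distance, unsigned Stride,
                                       unsigned TypeByteSize) {
  assert(Stride > 1 && TypeByteSize > 0 && Distance > 0);
  // A distance that is not a whole number of elements proves nothing here.
  if (Distance % TypeByteSize)
    return false;
  unsigned ScaledDist = Distance / TypeByteSize;
  // The two access streams are disjoint iff they fall into different residue
  // classes modulo the stride.
  return ScaledDist % Stride != 0;
}

int main() {
  // for (i = 0; i < n; i += 4) A[i + 3] = A[i];   // i32 elements
  // Scaled distance 3 is not a multiple of stride 4: independent.
  assert(stridedAccessesIndependent(/*Distance=*/12, /*Stride=*/4,
                                    /*TypeByteSize=*/4));
  // for (i = 0; i < n; i += 4) A[i + 4] = A[i];
  // Scaled distance 4 is a multiple of stride 4: the streams overlap.
  assert(!stridedAccessesIndependent(16, 4, 4));
  return 0;
}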
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -833,8 +833,8 @@
 class InterleavedAccessInfo {
 public:
   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
-                        DominatorTree *DT)
-      : PSE(PSE), TheLoop(L), DT(DT), RequiresScalarEpilogue(false) {}
+                        DominatorTree *DT, LoopInfo *LI)
+      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), RequiresScalarEpilogue(false) {}
 
   ~InterleavedAccessInfo() {
     SmallSet<InterleaveGroup *, 4> DelSet;
@@ -875,6 +875,12 @@
   /// out-of-bounds requires a scalar epilogue iteration for correctness.
   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
 
+  /// \brief Returns true if \p Stride is allowed in an interleaved group.
+  static bool isStrideAllowed(int Stride) {
+    unsigned Factor = std::abs(Stride);
+    return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
+  }
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
@@ -883,6 +889,7 @@
   PredicatedScalarEvolution &PSE;
   Loop *TheLoop;
   DominatorTree *DT;
+  LoopInfo *LI;
 
   /// True if the loop may contain non-reversed interleaved groups with
   /// out-of-bounds accesses. We ensure we don't speculatively access memory
@@ -931,6 +938,12 @@
   void collectConstStridedAccesses(
       MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
       const ValueToValueMap &Strides);
+
+  /// \brief Returns true if memory accesses \p A and \p B can be reordered. We
+  /// use the provided StrideDescriptors and MemoryDepChecker to determine if
+  /// any dependences would be violated.
+  bool canReorderMemAccs(Instruction *A, Instruction *B, StrideDescriptor &DesA,
+                         StrideDescriptor &DesB);
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
@@ -1261,13 +1274,14 @@
                             DominatorTree *DT, TargetLibraryInfo *TLI,
                             AliasAnalysis *AA, Function *F,
                             const TargetTransformInfo *TTI,
-                            LoopAccessAnalysis *LAA,
+                            LoopAccessAnalysis *LAA, LoopInfo *LI,
                             LoopVectorizationRequirements *R,
                             LoopVectorizeHints *H)
       : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
-        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
-        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
-        Requirements(R), Hints(H) {}
+        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr),
+        InterleaveInfo(PSE, L, DT, LI), Induction(nullptr),
+        WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R),
+        Hints(H) {}
 
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
@@ -1873,7 +1887,7 @@
 
     // Check if it is legal to vectorize the loop.
     LoopVectorizationRequirements Requirements;
-    LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
+    LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, LI,
                                   &Requirements, &Hints);
     if (!LVL.canVectorize()) {
      DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
@@ -4991,7 +5005,17 @@
 
   // Holds load/store instructions in program order.
   SmallVector<Instruction *, 16> AccessList;
-  for (auto *BB : TheLoop->getBlocks()) {
+  // Since it's desired that the load/store instructions be maintained in
+  // "program order" for the interleaved access analysis, we have to visit the
+  // blocks in the loop in reverse postorder (i.e., in a topological order).
+  // Such an ordering will ensure that any load/store that may be executed
+  // before a second load/store will precede the second load/store in the
+  // AccessList.
+  LoopBlocksDFS DFS(TheLoop);
+  DFS.perform(LI);
+  for (LoopBlocksDFS::RPOIterator I = DFS.beginRPO(), E = DFS.endRPO(); I != E;
+       ++I) {
+    BasicBlock *BB = *I;
     bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 
     for (auto &I : *BB) {
@@ -5016,13 +5040,6 @@
 
       Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
       int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);
-      // The factor of the corresponding interleave group.
-      unsigned Factor = std::abs(Stride);
-
-      // Ignore the access if the factor is too small or too large.
-      if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
-        continue;
-
       const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
       PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
       unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());
@@ -5036,26 +5053,98 @@
   }
 }
 
-// Analyze interleaved accesses and collect them into interleave groups.
+bool InterleavedAccessInfo::canReorderMemAccs(Instruction *A, Instruction *B,
+                                              StrideDescriptor &DesA,
+                                              StrideDescriptor &DesB) {
+
+  // Lambda expression for getting the pointer operand of the memory accesses.
+  auto getPointerOperand = [&](Instruction *I) -> Value * {
+    if (auto *Load = dyn_cast<LoadInst>(I))
+      return Load->getPointerOperand();
+    if (auto *Store = dyn_cast<StoreInst>(I))
+      return Store->getPointerOperand();
+    return nullptr;
+  };
+
+  // If neither instruction writes to memory, they can safely be reordered.
+  if (!A->mayWriteToMemory() && !B->mayWriteToMemory())
+    return true;
+
+  // If the distance is non-constant, the vectorized loop should be guarded
+  // with run-time pointer aliasing checks. The checks should ensure that the
+  // memory locations A and B access throughout the loop are disjoint. Because
+  // the memory must be disjoint if the checks succeed, it's safe to reorder.
+  auto *Distance =
+      dyn_cast<SCEVConstant>(PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
+  if (!Distance)
+    return true;
+
+  // LAA allows non-positive dependence distances for vectorization. However,
+  // since we may reorder the instructions, the direction of the dependence
+  // doesn't really matter, so we take the absolute value. If the distance is
+  // zero, the instructions can't be reordered.
+  auto NonNegDistance = std::abs(Distance->getAPInt().getSExtValue());
+  if (NonNegDistance == 0)
+    return false;
+
+  // At this point, we know there is a non-zero distance between the accesses.
+  // If they are strided, we can try to prove them independent. Otherwise, they
+  // are dependent. First, we get the interleave factor of the accesses. If
+  // they aren't interleaved or they have different strides, don't reorder.
+  auto FactorA = std::abs(DesA.Stride);
+  auto FactorB = std::abs(DesB.Stride);
+  if (FactorA != FactorB || FactorA <= 1)
+    return false;
+
+  // Get the types of the pointer operands. If they're not the same, don't
+  // reorder.
+  auto *PtrTyA = cast<PointerType>(getPointerOperand(A)->getType());
+  auto *PtrTyB = cast<PointerType>(getPointerOperand(B)->getType());
+  if (PtrTyA != PtrTyB)
+    return false;
+
+  // We need the module's data layout to determine the size of the type the
+  // accesses read from and write to.
+  auto &DL = A->getModule()->getDataLayout();
+
+  // Finally, try to prove independence. If the strided accesses are
+  // independent, they can be reordered.
+  return MemoryDepChecker::areStridedAccessesIndependent(
+      NonNegDistance, FactorA, DL.getTypeAllocSize(PtrTyA->getElementType()));
+}
+
+// Analyze interleaved accesses and collect them into interleaved load and
+// store groups.
+//
+// When generating code for an interleaved load group, we effectively hoist all
+// loads in the group to the location of the first load in program order. When
+// generating code for an interleaved store group, we sink all stores to the
+// location of the last store. This code motion can change the order of load
+// and store instructions and may break dependences. The inserted run-time
+// pointer aliasing checks ensure the absence of some dependences. However, we
+// need to explicitly check that we won't violate the dependences allowed for
+// vectorization.
 //
-// Notice that the vectorization on interleaved groups will change instruction
-// orders and may break dependences. But the memory dependence check guarantees
-// that there is no overlap between two pointers of different strides, element
-// sizes or underlying bases.
+// A vectorizable loop can legally contain dependences at constant non-positive
+// distances and at constant positive distances if they are greater than or
+// equal to the threshold used to compute the vectorization factor. Dependences
+// with non-constant distances require aliasing checks, so we can assume these
+// dependences are not present in the vectorized loop.
 //
-// For pointers sharing the same stride, element size and underlying base, no
-// need to worry about Read-After-Write dependences and Write-After-Read
-// dependences.
+// The code generation strategy mentioned above ensures that we won't violate
+// any write-after-read (WAR) dependences.
 //
-// E.g.  The RAW dependence:  A[i] = a;
-//                            b = A[i];
-// This won't exist as it is a store-load forwarding conflict, which has
-// already been checked and forbidden in the dependence check.
+// E.g., for the WAR dependence:  a = A[i];   // (1)
+//                                A[i] = b;   // (2)
 //
-// E.g.  The WAR dependence:  a = A[i];  // (1)
-//                            A[i] = b;  // (2)
-// The store group of (2) is always inserted at or below (2), and the load group
-// of (1) is always inserted at or above (1). The dependence is safe.
+// The store group of (2) is always inserted at or below (2), and the load
+// group of (1) is always inserted at or above (1). Thus, the instructions will
+// never be reordered. All other dependences are checked to ensure the
+// correctness of the instruction reordering.
+//
+// The algorithm visits all memory accesses in the loop in bottom-up program
+// order. Program order is established by traversing the blocks in the loop in
+// reverse postorder when collecting the accesses.
 void InterleavedAccessInfo::analyzeInterleaving(
     const ValueToValueMap &Strides) {
   DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
@@ -5077,34 +5166,51 @@
   //  1. A and B have the same stride.
   //  2. A and B have the same memory object size.
   //  3. B belongs to the group according to the distance.
-  //
-  // The bottom-up order can avoid breaking the Write-After-Write dependences
-  // between two pointers of the same base.
-  // E.g.  A[i]   = a;   (1)
-  //       A[i]   = b;   (2)
-  //       A[i+1] = c    (3)
-  // We form the group (2)+(3) in front, so (1) has to form groups with accesses
-  // above (1), which guarantees that (1) is always above (2).
   for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E;
        ++I) {
     Instruction *A = I->first;
     StrideDescriptor DesA = I->second;
 
-    InterleaveGroup *Group = getInterleaveGroup(A);
-    if (!Group) {
-      DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
-      Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
+    // Initialize a group for A if it has an allowable stride. Even if we don't
+    // create a group for A, we continue with the bottom-up algorithm to ensure
+    // we don't break any of A's dependences.
+    InterleaveGroup *Group = nullptr;
+    if (isStrideAllowed(DesA.Stride)) {
+      Group = getInterleaveGroup(A);
+      if (!Group) {
+        DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
+        Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
+      }
+      if (A->mayWriteToMemory())
+        StoreGroups.insert(Group);
+      else
+        LoadGroups.insert(Group);
     }
 
-    if (A->mayWriteToMemory())
-      StoreGroups.insert(Group);
-    else
-      LoadGroups.insert(Group);
-
     for (auto II = std::next(I); II != E; ++II) {
       Instruction *B = II->first;
       StrideDescriptor DesB = II->second;
 
+      // If instructions A and B cannot be reordered because they're in a
+      // dependence relation, we can't add additional instructions to A's group
+      // since this may cause illegal code motion. If B is already in a group,
+      // release it. We check that B is a write because our code motion won't
+      // ever violate WAR dependences.
+      if (B->mayWriteToMemory() && !canReorderMemAccs(A, B, DesA, DesB)) {
+        if (isInterleaved(B)) {
+          InterleaveGroup *StoreGroup = getInterleaveGroup(B);
+          StoreGroups.remove(StoreGroup);
+          releaseGroup(StoreGroup);
+        }
+        break;
+      }
+
+      // At this point, we've checked for code motion that would break
+      // dependences. If A or B has a stride that's not allowed, there's
+      // nothing left to do.
+      if (!isStrideAllowed(DesA.Stride) || !isStrideAllowed(DesB.Stride))
+        continue;
+
       // Ignore if B is already in a group or B is a different memory operation.
       if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
         continue;
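(Editorial aside, not part of the patch: stripped of the IR and SCEV plumbing, canReorderMemAccs reduces to the decision chain sketched below. The struct, the function names, and the std::optional stand-in for a dependence distance that may or may not fold to a constant are hypothetical; comparing element sizes replaces the pointer-type comparison used by the real code.)

#include <cstdlib>
#include <optional>

struct AccessDesc {
  bool Writes;       // does the instruction write to memory?
  int Stride;        // access stride in elements (signed)
  unsigned ElemSize; // size of the accessed element in bytes
};

// Independent iff the element distance is not a multiple of the common stride
// (see the earlier sketch).
static bool stridedAccessesIndependent(unsigned Distance, unsigned Stride,
                                       unsigned ElemSize) {
  return Distance % ElemSize == 0 && (Distance / ElemSize) % Stride != 0;
}

static bool canReorder(const AccessDesc &A, const AccessDesc &B,
                       std::optional<long> ByteDistance) {
  if (!A.Writes && !B.Writes)
    return true;              // two reads always commute
  if (!ByteDistance)
    return true;              // non-constant distance: run-time alias checks
                              // keep the locations disjoint
  long Dist = std::abs(*ByteDistance);
  if (Dist == 0)
    return false;             // same location: order must be preserved
  unsigned FactorA = std::abs(A.Stride), FactorB = std::abs(B.Stride);
  if (FactorA != FactorB || FactorA <= 1 || A.ElemSize != B.ElemSize)
    return false;             // mixed strides or element types: give up
  return stridedAccessesIndependent(Dist, FactorA, A.ElemSize);
}

// Example: a stride-2 i32 store and a stride-2 i32 load 4 bytes apart
// (different fields of the same struct) may be reordered; the same two
// accesses 0 bytes apart may not.
//   canReorder({true, 2, 4}, {false, 2, 4}, 4) -> true
//   canReorder({true, 2, 4}, {false, 2, 4}, 0) -> false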
Index: test/Transforms/LoopVectorize/interleaved-accesses.ll
===================================================================
--- test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -555,4 +555,108 @@
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
+; Check vectorization of interleaved access groups in the presence of
+; dependences. When generating code, interleaved loads are inserted at the
+; location of the first load in a group, and interleaved stores are inserted
+; at the location of the last store. However, dependent accesses must still
+; execute in their original program order, so we prevent group formation for
+; dependent accesses.
+;
+; In the following tests, the stores are scalarized because the required code
+; motion would break dependences. The remaining interleaved load groups have
+; gaps, so we must execute at least one iteration of the scalar epilogue loop.
+
+; void PR27626_0(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i].x = z;
+;     p[i].y = p[i].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_0(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.end, label %scalar.ph
+
+%pair.i32 = type { i32, i32 }
+define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  store i32 %z, i32* %p_i.x, align 8
+  %0 = load i32, i32* %p_i.x, align 8
+  store i32 %0, i32 *%p_i.y, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; void PR27626_1(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i + 1].x = z;
+;     p[i].y = p[i].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_1(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.end, label %scalar.ph
+
+define void @PR27626_1(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %i_plus_1 = add nuw nsw i64 %i, 1
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i_plus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  store i32 %z, i32* %p_i_plus_1.x, align 8
+  %0 = load i32, i32* %p_i.x, align 8
+  store i32 %0, i32 *%p_i.y, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
 attributes #0 = { "unsafe-fp-math"="true" }
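(Editorial aside, not part of the patch: plugging the numbers from the two tests into the independence test sketched earlier shows why the store groups are released while the load groups survive.)

#include <cassert>

int main() {
  // PR27626_0: the store to p[i].x and the load of p[i].x are 0 bytes apart,
  // so that pair may never be reordered; the store group is released and the
  // stores are scalarized, which is what the CHECK lines above expect.
  //
  // The load of p[i].x and the store to p[i].y are 4 bytes apart with stride 2
  // and i32 elements. Scaled distance 4 / 4 = 1 is not a multiple of 2, so
  // these two access streams never touch the same element.
  assert((4 / 4) % 2 != 0);

  // PR27626_1: the store to p[i + 1].x and the load of p[i].x are 8 bytes
  // apart (the same field of adjacent pairs). Scaled distance 8 / 4 = 2 is a
  // multiple of 2, so independence cannot be proven and the stores again stay
  // scalar.
  assert((8 / 4) % 2 == 0);
  return 0;
}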