Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -847,8 +847,9 @@
 class InterleavedAccessInfo {
 public:
   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
-                        DominatorTree *DT)
-      : PSE(PSE), TheLoop(L), DT(DT), RequiresScalarEpilogue(false) {}
+                        DominatorTree *DT, LoopInfo *LI)
+      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr),
+        RequiresScalarEpilogue(false) {}
 
   ~InterleavedAccessInfo() {
     SmallSet<InterleaveGroup *, 4> DelSet;
@@ -889,6 +890,9 @@
   /// out-of-bounds requires a scalar epilogue iteration for correctness.
   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
 
+  /// \brief Initialize the LoopAccessInfo used for dependence checking.
+  void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
@@ -897,6 +901,8 @@
   PredicatedScalarEvolution &PSE;
   Loop *TheLoop;
   DominatorTree *DT;
+  LoopInfo *LI;
+  const LoopAccessInfo *LAI;
 
   /// True if the loop may contain non-reversed interleaved groups with
   /// out-of-bounds accesses. We ensure we don't speculatively access memory
@@ -906,6 +912,10 @@
   /// Holds the relationships between the members and the interleave group.
   DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
 
+  /// Holds dependences among the memory accesses in the loop. It maps a source
+  /// access to a set of dependent sink accesses.
+  DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
+
   /// \brief The descriptor for a strided memory access.
   struct StrideDescriptor {
     StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size,
@@ -920,6 +930,9 @@
     unsigned Align; // The alignment of this access.
   };
 
+  /// \brief A type for holding instructions and their stride descriptors.
+  typedef std::pair<Instruction *, StrideDescriptor> StrideEntry;
+
   /// \brief Create a new interleave group with the given instruction \p Instr,
   /// stride \p Stride and alignment \p Align.
   ///
@@ -945,6 +958,78 @@
   void collectConstStridedAccesses(
       MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
       const ValueToValueMap &Strides);
+
+  /// \brief Returns true if \p Stride is allowed in an interleaved group.
+  static bool isStrided(int Stride) {
+    unsigned Factor = std::abs(Stride);
+    return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
+  }
+
+  /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
+  bool areDependencesValid() const {
+    return LAI && LAI->getDepChecker().getDependences();
+  }
+
+  /// \brief Returns true if memory accesses \p B and \p A can be reordered, if
+  /// necessary, when constructing interleaved groups.
+  ///
+  /// \p B must precede \p A in program order. We return false if reordering is
+  /// not necessary or is prevented because \p B and \p A may be dependent.
+  bool canReorderMemAccessesForInterleavedGroups(StrideEntry *B,
+                                                 StrideEntry *A) const {
+
+    // Code motion for interleaved accesses can potentially hoist strided loads
+    // and sink strided stores. The code below checks the legality of the
+    // following two conditions:
+    //
+    // 1. Potentially moving a strided load (A) before any store (B) that
+    //    precedes A, or
+    //
+    // 2. Potentially moving a strided store (B) after any load or store (A)
+    //    that B precedes.
+    //
+    // It's legal to reorder B and A if we know there isn't a dependence from B
+    // to A. Note that this determination is conservative since some
+    // dependences could potentially be reordered safely.
+
+    // B is potentially the source of a dependence.
+    auto *Src = B->first;
+    auto SrcDes = B->second;
+
+    // A is potentially the sink of a dependence.
+    auto *Sink = A->first;
+    auto SinkDes = A->second;
+
+    // Code motion for interleaved accesses can't violate WAR dependences.
+    // Thus, reordering is legal if the source isn't a write.
+    if (!Src->mayWriteToMemory())
+      return true;
+
+    // At least one of the accesses must be strided.
+    if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
+      return true;
+
+    // If dependence information is not available from LoopAccessInfo,
+    // conservatively assume the instructions can't be reordered.
+    if (!areDependencesValid())
+      return false;
+
+    // If we know there is a dependence from source to sink, assume the
+    // instructions can't be reordered. Otherwise, reordering is legal.
+    return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
+  }
+
+  /// \brief Collect the dependences from LoopAccessInfo.
+  ///
+  /// We process the dependences once during the interleaved access analysis to
+  /// enable constant-time dependence queries.
+  void collectDependences() {
+    if (!areDependencesValid())
+      return;
+    auto *Deps = LAI->getDepChecker().getDependences();
+    for (auto Dep : *Deps)
+      Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
+  }
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
@@ -1275,13 +1360,14 @@
                               DominatorTree *DT, TargetLibraryInfo *TLI,
                               AliasAnalysis *AA, Function *F,
                               const TargetTransformInfo *TTI,
-                              LoopAccessAnalysis *LAA,
+                              LoopAccessAnalysis *LAA, LoopInfo *LI,
                               LoopVectorizationRequirements *R,
                               LoopVectorizeHints *H)
       : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
-        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
-        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
-        Requirements(R), Hints(H) {}
+        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr),
+        InterleaveInfo(PSE, L, DT, LI), Induction(nullptr),
+        WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R),
+        Hints(H) {}
 
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
@@ -1888,7 +1974,7 @@
 
     // Check if it is legal to vectorize the loop.
     LoopVectorizationRequirements Requirements;
-    LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
+    LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, LI,
                                   &Requirements, &Hints);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
@@ -4997,6 +5083,7 @@
 
 bool LoopVectorizationLegality::canVectorizeMemory() {
   LAI = &LAA->getInfo(TheLoop, getSymbolicStrides());
+  InterleaveInfo.setLAI(LAI);
   auto &OptionalReport = LAI->getReport();
   if (OptionalReport)
     emitAnalysis(VectorizationReport(*OptionalReport));
@@ -5111,7 +5198,17 @@
 
   // Holds load/store instructions in program order.
   SmallVector<Instruction *, 16> AccessList;
-  for (auto *BB : TheLoop->getBlocks()) {
+  // Since it's desired that the load/store instructions be maintained in
+  // "program order" for the interleaved access analysis, we have to visit the
+  // blocks in the loop in reverse postorder (i.e., in a topological order).
+  // Such an ordering will ensure that any load/store that may be executed
+  // before a second load/store will precede the second load/store in the
+  // AccessList.
+  LoopBlocksDFS DFS(TheLoop);
+  DFS.perform(LI);
+  for (LoopBlocksDFS::RPOIterator I = DFS.beginRPO(), E = DFS.endRPO(); I != E;
+       ++I) {
+    BasicBlock *BB = *I;
     bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 
     for (auto &I : *BB) {
@@ -5136,13 +5233,6 @@
       Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
       int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);
 
-      // The factor of the corresponding interleave group.
-      unsigned Factor = std::abs(Stride);
-
-      // Ignore the access if the factor is too small or too large.
-      if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
-        continue;
-
       const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
       PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
       unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());
@@ -5156,26 +5246,42 @@
   }
 }
 
-// Analyze interleaved accesses and collect them into interleave groups.
+// Analyze interleaved accesses and collect them into interleaved load and
+// store groups.
+//
+// When generating code for an interleaved load group, we effectively hoist all
+// loads in the group to the location of the first load in program order. When
+// generating code for an interleaved store group, we sink all stores to the
+// location of the last store. This code motion can change the order of load
+// and store instructions and may break dependences.
+//
+// The code generation strategy mentioned above ensures that we won't violate
+// any write-after-read (WAR) dependences.
+//
+// E.g., for the WAR dependence:  a = A[i];  // (1)
+//                                A[i] = b;  // (2)
 //
-// Notice that the vectorization on interleaved groups will change instruction
-// orders and may break dependences. But the memory dependence check guarantees
-// that there is no overlap between two pointers of different strides, element
-// sizes or underlying bases.
+// The store group of (2) is always inserted at or below (2), and the load
+// group of (1) is always inserted at or above (1). Thus, the instructions will
+// never be reordered. All other dependences are checked to ensure the
+// correctness of the instruction reordering.
 //
-// For pointers sharing the same stride, element size and underlying base, no
-// need to worry about Read-After-Write dependences and Write-After-Read
+// The algorithm visits all memory accesses in the loop in bottom-up program
+// order. Program order is established by traversing the blocks in the loop in
+// reverse postorder when collecting the accesses.
+//
+// We visit the memory accesses in bottom-up order because it can simplify the
+// construction of store groups in the presence of write-after-write (WAW)
// dependences.
 //
-// E.g. The RAW dependence:  A[i] = a;
-//                           b = A[i];
-// This won't exist as it is a store-load forwarding conflict, which has
-// already been checked and forbidden in the dependence check.
+// E.g., for the WAW dependence:  A[i] = a;      // (1)
+//                                A[i] = b;      // (2)
+//                                A[i + 1] = c;  // (3)
 //
-// E.g. The WAR dependence:  a = A[i];  // (1)
-//                           A[i] = b;  // (2)
-// The store group of (2) is always inserted at or below (2), and the load group
-// of (1) is always inserted at or above (1). The dependence is safe.
+// We will first create a store group with (3) and (2). (1) can't be added to
+// this group because it and (2) are dependent. However, (1) can be grouped
+// with other accesses that may precede it in program order. Note that a
+// bottom-up order does not imply that WAW dependences should not be checked.
 void InterleavedAccessInfo::analyzeInterleaving(
     const ValueToValueMap &Strides) {
   DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
@@ -5187,6 +5293,9 @@
 
   if (StrideAccesses.empty())
     return;
 
+  // Collect the dependences in the loop.
+  collectDependences();
+
   // Holds all interleaved store groups temporarily.
   SmallSetVector<InterleaveGroup *, 4> StoreGroups;
   // Holds all interleaved load groups temporarily.
@@ -5197,33 +5306,75 @@
   // 1. A and B have the same stride.
   // 2. A and B have the same memory object size.
   // 3. B belongs to the group according to the distance.
-  //
-  // The bottom-up order can avoid breaking the Write-After-Write dependences
-  // between two pointers of the same base.
-  // E.g.  A[i]   = a;   (1)
-  //       A[i]   = b;   (2)
-  //       A[i+1] = c    (3)
-  // We form the group (2)+(3) in front, so (1) has to form groups with accesses
-  // above (1), which guarantees that (1) is always above (2).
-  for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E;
-       ++I) {
-    Instruction *A = I->first;
-    StrideDescriptor DesA = I->second;
-
-    InterleaveGroup *Group = getInterleaveGroup(A);
-    if (!Group) {
-      DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
-      Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
+  for (auto AI = StrideAccesses.rbegin(), E = StrideAccesses.rend(); AI != E;
+       ++AI) {
+    Instruction *A = AI->first;
+    StrideDescriptor DesA = AI->second;
+
+    // Initialize a group for A if it has an allowable stride. Even if we don't
+    // create a group for A, we continue with the bottom-up algorithm to ensure
+    // we don't break any of A's dependences.
+    InterleaveGroup *Group = nullptr;
+    if (isStrided(DesA.Stride)) {
+      Group = getInterleaveGroup(A);
+      if (!Group) {
+        DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A
+                     << '\n');
+        Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
+      }
+      if (A->mayWriteToMemory())
+        StoreGroups.insert(Group);
+      else
+        LoadGroups.insert(Group);
     }
 
-    if (A->mayWriteToMemory())
-      StoreGroups.insert(Group);
-    else
-      LoadGroups.insert(Group);
-
+    for (auto BI = std::next(AI); BI != E; ++BI) {
+      Instruction *B = BI->first;
+      StrideDescriptor DesB = BI->second;
+
+      // Our code motion strategy implies that we can't have dependences
+      // between accesses in an interleaved group and other accesses located
+      // between the first and last member of the group. Note that this also
+      // means that a group can't have more than one member at a given offset.
+      // The accesses in a group can have dependences with other accesses, but
+      // we must ensure we don't extend the boundaries of the group such that
+      // we encompass those dependent accesses.
+      //
+      // For example, assume we have the sequence of accesses shown below in a
+      // stride-2 loop:
+      //
+      //  (1, 2) is a group | A[i]   = a;  // (1)
+      //                    | A[i-1] = b;  // (2) |
+      //                      A[i-3] = c;  // (3)
+      //                      A[i]   = d;  // (4) | (2, 4) is not a group
+      //
+      // Because accesses (2) and (3) are dependent, we can group (2) with (1)
+      // but not with (4). If we did, the dependent access (3) would be within
+      // the boundaries of the (2, 4) group.
+      if (!canReorderMemAccessesForInterleavedGroups(&*BI, &*AI)) {
+
+        // If a dependence exists and B is already in a group, we know that B
+        // must be a store since B precedes A and WAR dependences are allowed.
+        // Thus, B would be sunk below A. We release B's group to prevent this
+        // illegal code motion. B will then be free to form another group with
+        // instructions that precede it.
+        if (isInterleaved(B)) {
+          InterleaveGroup *StoreGroup = getInterleaveGroup(B);
+          StoreGroups.remove(StoreGroup);
+          releaseGroup(StoreGroup);
+        }
+
+        // If a dependence exists and B is not already in a group (or it was
+        // and we just released it), A might be hoisted above B (if A is a
+        // load) or another store might be sunk below B (if A is a store). In
+        // either case, we can't add additional instructions to A's group. A
+        // will only form a group with instructions that it precedes.
+        break;
+      }
-    for (auto II = std::next(I); II != E; ++II) {
-      Instruction *B = II->first;
-      StrideDescriptor DesB = II->second;
+      // At this point, we've checked for illegal code motion. If either A or B
+      // isn't strided, there's nothing left to do.
+      if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
+        continue;
 
       // Ignore if B is already in a group or B is a different memory operation.
       if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
Index: test/Transforms/LoopVectorize/interleaved-accesses.ll
===================================================================
--- test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -555,4 +555,309 @@
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
+; Check vectorization of interleaved access groups in the presence of
+; dependences (PR27626). The following tests check that we don't reorder
+; dependent loads and stores when generating code for interleaved access
+; groups. Stores should be scalarized because the required code motion would
+; break dependences, and the remaining interleaved load groups should have
+; gaps.
+
+; PR27626_0: Ensure a strided store is not moved after a dependent (zero
+; distance) strided load.
+
+; void PR27626_0(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i].x = z;
+;     p[i].y = p[i].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_0(
+; CHECK: min.iters.checked:
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK:   store i32 %[[X1]], {{.*}}
+; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK:   store i32 %[[X2]], {{.*}}
+; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK:   store i32 %[[X3]], {{.*}}
+; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK:   store i32 %[[X4]], {{.*}}
+
+%pair.i32 = type { i32, i32 }
+define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  store i32 %z, i32* %p_i.x, align 4
+  %0 = load i32, i32* %p_i.x, align 4
+  store i32 %0, i32 *%p_i.y, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; PR27626_1: Ensure a strided load is not moved before a dependent (zero
+; distance) strided store.
+
+; void PR27626_1(struct pair *p, int n) {
+;   int s = 0;
+;   for (int i = 0; i < n; i++) {
+;     p[i].y = p[i].x;
+;     s += p[i].y
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_1(
+; CHECK: min.iters.checked:
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK:   %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 0
+; CHECK:   store i32 %[[X1:.+]], {{.*}}
+; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 2
+; CHECK:   store i32 %[[X2:.+]], {{.*}}
+; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 4
+; CHECK:   store i32 %[[X3:.+]], {{.*}}
+; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 6
+; CHECK:   store i32 %[[X4:.+]], {{.*}}
+; CHECK:   %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK:   %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32>
+; CHECK:   add nsw <4 x i32> %[[S1]], %[[Phi]]
+
+define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  %0 = load i32, i32* %p_i.x, align 4
+  store i32 %0, i32* %p_i.y, align 4
+  %1 = load i32, i32* %p_i.y, align 4
+  %2 = add nsw i32 %1, %s
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %3 = phi i32 [ %2, %for.body ]
+  ret i32 %3
+}
+
+; PR27626_2: Ensure a strided store is not moved after a dependent (negative
+; distance) strided load.
+
+; void PR27626_2(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i].x = z;
+;     p[i].y = p[i - 1].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_2(
+; CHECK: min.iters.checked:
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK:   store i32 %[[X1]], {{.*}}
+; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK:   store i32 %[[X2]], {{.*}}
+; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK:   store i32 %[[X3]], {{.*}}
+; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK:   store i32 %[[X4]], {{.*}}
+
+define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %i_minus_1 = add nuw nsw i64 %i, -1
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  store i32 %z, i32* %p_i.x, align 4
+  %0 = load i32, i32* %p_i_minus_1.x, align 4
+  store i32 %0, i32 *%p_i.y, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; PR27626_3: Ensure a strided load is not moved before a dependent (negative
+; distance) strided store.
+
+; void PR27626_3(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i + 1].y = p[i].x;
+;     s += p[i].y;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_3(
+; CHECK: min.iters.checked:
+; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK:   %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 0
+; CHECK:   store i32 %[[X1:.+]], {{.*}}
+; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 2
+; CHECK:   store i32 %[[X2:.+]], {{.*}}
+; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 4
+; CHECK:   store i32 %[[X3:.+]], {{.*}}
+; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 6
+; CHECK:   store i32 %[[X4:.+]], {{.*}}
+; CHECK:   %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK:   %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32>
+; CHECK:   add nsw <4 x i32> %[[S1]], %[[Phi]]
+
+define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
+  %i_plus_1 = add nuw nsw i64 %i, 1
+  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+  %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
+  %0 = load i32, i32* %p_i.x, align 4
+  store i32 %0, i32* %p_i_plus_1.y, align 4
+  %1 = load i32, i32* %p_i.y, align 4
+  %2 = add nsw i32 %1, %s
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %3 = phi i32 [ %2, %for.body ]
+  ret i32 %3
+}
+
+; PR27626_4: Ensure we form an interleaved group for strided stores in the
+; presence of a write-after-write dependence. We create a group for
+; (2) and (3) while excluding (1).
+
+; void PR27626_4(int *a, int x, int y, int z, int n) {
+;   for (int i = 0; i < n; i += 2) {
+;     a[i] = x;      // (1)
+;     a[i] = y;      // (2)
+;     a[i + 1] = z;  // (3)
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_4(
+; CHECK: vector.ph:
+; CHECK:   %[[INS_Y:.+]] = insertelement <4 x i32> undef, i32 %y, i32 0
+; CHECK:   %[[SPLAT_Y:.+]] = shufflevector <4 x i32> %[[INS_Y]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:   %[[INS_Z:.+]] = insertelement <4 x i32> undef, i32 %z, i32 0
+; CHECK:   %[[SPLAT_Z:.+]] = shufflevector <4 x i32> %[[INS_Z]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   %[[VEC:.+]] = shufflevector <4 x i32> %[[SPLAT_Y]], <4 x i32> %[[SPLAT_Z]], <8 x i32>
+; CHECK:   store <8 x i32> %[[VEC]], {{.*}}
+
+define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %i_plus_1 = add i64 %i, 1
+  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
+  %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
+  store i32 %x, i32* %a_i, align 4
+  store i32 %y, i32* %a_i, align 4
+  store i32 %z, i32* %a_i_plus_1, align 4
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; PR27626_5: Ensure we do not form an interleaved group for strided stores in
+; the presence of a write-after-write dependence.
+
+; void PR27626_5(int *a, int x, int y, int z, int n) {
+;   for (int i = 3; i < n; i += 2) {
+;     a[i - 1] = x;
+;     a[i - 3] = y;
+;     a[i] = z;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_5(
+; CHECK: vector.body:
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   store i32 %x, {{.*}}
+; CHECK:   store i32 %y, {{.*}}
+; CHECK:   store i32 %y, {{.*}}
+; CHECK:   store i32 %y, {{.*}}
+; CHECK:   store i32 %y, {{.*}}
+; CHECK:   store i32 %z, {{.*}}
+; CHECK:   store i32 %z, {{.*}}
+; CHECK:   store i32 %z, {{.*}}
+; CHECK:   store i32 %z, {{.*}}
+
+define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
+  %i_minus_1 = sub i64 %i, 1
+  %i_minus_3 = sub i64 %i_minus_1, 2
+  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
+  %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
+  %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
+  store i32 %x, i32* %a_i_minus_1, align 4
+  store i32 %y, i32* %a_i_minus_3, align 4
+  store i32 %z, i32* %a_i, align 4
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
 attributes #0 = { "unsafe-fp-math"="true" }
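
The dependence bookkeeping in the patch can be illustrated outside of LLVM with ordinary standard-library containers. The sketch below is only an illustration under stated assumptions: the Access struct, the factor limit of 8, and the canReorder helper are hypothetical stand-ins, not LLVM APIs. It mirrors the source-to-sinks map that collectDependences() builds once up front and the constant-time query performed by canReorderMemAccessesForInterleavedGroups(), using the same convention that B precedes A in program order.

    // Illustrative sketch only: models the patch's dependence bookkeeping with
    // standard containers. Access, canReorder, and the interleave factor limit
    // are hypothetical stand-ins for the LLVM types used in LoopVectorize.cpp.
    #include <cassert>
    #include <cstdlib>
    #include <set>
    #include <unordered_map>

    struct Access {
      int Id;        // Position in program order.
      bool MayWrite; // True for stores.
      int Stride;    // Zero if the access is not strided.
    };

    // Mirrors InterleavedAccessInfo::isStrided() with an assumed factor limit.
    static bool isStrided(int Stride, int MaxFactor = 8) {
      int Factor = std::abs(Stride);
      return Factor >= 2 && Factor <= MaxFactor;
    }

    // Maps a dependence source to the set of its dependent sinks, as
    // collectDependences() does once before the grouping loop runs.
    using DependenceMap = std::unordered_map<int, std::set<int>>;

    // B precedes A in program order. Returns true if the two accesses may be
    // reordered while forming interleaved groups, following the same checks
    // as canReorderMemAccessesForInterleavedGroups().
    static bool canReorder(const Access &B, const Access &A,
                           const DependenceMap &Deps) {
      if (!B.MayWrite)
        return true; // Only WAR can result, and WAR is never violated.
      if (!isStrided(B.Stride) && !isStrided(A.Stride))
        return true; // Neither access is moved by the group code motion.
      auto It = Deps.find(B.Id); // Constant-time dependence query.
      return It == Deps.end() || !It->second.count(A.Id);
    }

    int main() {
      // a = A[i];   // (1) strided load
      // A[i] = b;   // (2) strided store; (1) -> (2) is a WAR dependence
      // c = A[i];   // (3) strided load; (2) -> (3) is a RAW dependence
      Access Load1{1, false, 2}, Store2{2, true, 2}, Load3{3, false, 2};

      DependenceMap Deps;
      Deps[2].insert(3); // Record the RAW dependence from (2) to (3).

      assert(canReorder(Load1, Store2, Deps));  // WAR: reordering allowed.
      assert(!canReorder(Store2, Load3, Deps)); // RAW: reordering refused.
      return 0;
    }

The first assertion corresponds to the WAR situation in the patch's comments, where group insertion points make reordering safe, and the second corresponds to the zero-distance store/load case exercised by PR27626_0 and PR27626_1, where the recorded dependence keeps the accesses in order.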