Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -833,7 +833,7 @@
 public:
   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
                         DominatorTree *DT)
-      : PSE(PSE), TheLoop(L), DT(DT) {}
+      : PSE(PSE), TheLoop(L), DT(DT), RequiresScalarEpilogue(false) {}

   ~InterleavedAccessInfo() {
     SmallSet<InterleaveGroup *, 4> DelSet;
@@ -862,6 +862,10 @@
     return nullptr;
   }

+  /// \brief Returns true if an interleaved group that may access memory
+  /// out-of-bounds requires a scalar epilogue iteration for correctness.
+  bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
@@ -871,6 +875,11 @@
   Loop *TheLoop;
   DominatorTree *DT;

+  /// True if the loop may contain non-reversed interleaved groups with
+  /// out-of-bounds accesses. We ensure we don't speculatively access memory
+  /// out-of-bounds by executing at least one scalar epilogue iteration.
+  bool RequiresScalarEpilogue;
+
   /// Holds the relationships between the members and the interleave group.
   DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
@@ -1336,6 +1345,12 @@
     return InterleaveInfo.getInterleaveGroup(Instr);
   }

+  /// \brief Returns true if an interleaved group requires a scalar iteration
+  /// to handle accesses with gaps.
+  bool requiresScalarEpilogue() const {
+    return InterleaveInfo.requiresScalarEpilogue();
+  }
+
   unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }

   bool hasStride(Value *V) { return StrideSet.count(V); }
@@ -2867,12 +2882,26 @@
   Value *TC = getOrCreateTripCount(L);
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

-  // Now we need to generate the expression for N - (N % VF), which is
-  // the part that the vectorized body will execute.
-  // The loop step is equal to the vectorization factor (num of SIMD elements)
-  // times the unroll factor (num of SIMD instructions).
+  // Now we need to generate the expression for the part of the loop that the
+  // vectorized body will execute. This is equal to N - (N % Step) if scalar
+  // iterations are not required for correctness, or N - Step, otherwise. Step
+  // is equal to the vectorization factor (number of SIMD elements) times the
+  // unroll factor (number of SIMD instructions).
   Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
+
+  // If there is a non-reversed interleaved group that may speculatively access
+  // memory out-of-bounds, we need to ensure that there will be at least one
+  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
+  // the trip count, we set the remainder to be equal to the step. If the step
+  // does not evenly divide the trip count, no adjustment is necessary since
+  // there will already be scalar iterations. Note that the minimum iterations
+  // check ensures that N >= Step.
+  if (VF > 1 && Legal->requiresScalarEpilogue()) {
+    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
+    R = Builder.CreateSelect(IsZero, Step, R);
+  }
+
   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

   return VectorTripCount;
@@ -5104,11 +5133,20 @@
     if (Group->getNumMembers() != Group->getFactor())
       releaseGroup(Group);

-  // Remove interleaved load groups that don't have the first and last member.
-  // This guarantees that we won't do speculative out of bounds loads.
+  // If there is a non-reversed interleaved load group with gaps, we will need
+  // to execute at least one scalar epilogue iteration. This will ensure that
+  // we don't speculatively access memory out-of-bounds. Note that we only need
+  // to look for a member at index factor - 1, since every group must have a
+  // member at index zero.
   for (InterleaveGroup *Group : LoadGroups)
-    if (!Group->getMember(0) || !Group->getMember(Group->getFactor() - 1))
-      releaseGroup(Group);
+    if (!Group->getMember(Group->getFactor() - 1)) {
+      if (Group->isReverse()) {
+        releaseGroup(Group);
+      } else {
+        DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
+        RequiresScalarEpilogue = true;
+      }
+    }
 }

 LoopVectorizationCostModel::VectorizationFactor
Index: test/Transforms/LoopVectorize/interleaved-accesses.ll
===================================================================
--- test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -284,18 +284,24 @@
 }

 ; Check vectorization on an interleaved load group of factor 2 with 1 gap
-; (missing the load of odd elements).
+; (missing the load of odd elements). Because the vectorized loop would
+; speculatively access memory out-of-bounds, we must execute at least one
+; iteration of the scalar loop.

-; void even_load(int *A, int *B) {
+; void even_load_static_tc(int *A, int *B) {
 ;  for (unsigned i = 0; i < 1024; i+=2)
 ;     B[i/2] = A[i] * 2;
 ; }

-; CHECK-LABEL: @even_load(
-; CHECK-NOT: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
-; CHECK-NOT: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-LABEL: @even_load_static_tc(
+; CHECK: vector.body:
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: icmp eq i64 %index.next, 508
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph

-define void @even_load(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
+define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
 entry:
   br label %for.body
@@ -315,6 +321,93 @@
   br i1 %cmp, label %for.body, label %for.cond.cleanup
 }

+; Check vectorization on an interleaved load group of factor 2 with 1 gap
+; (missing the load of odd elements). Because the vectorized loop would
+; speculatively access memory out-of-bounds, we must execute at least one
+; iteration of the scalar loop.
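+
+; As a rough worked example of the trip-count adjustment (assuming VF = 4 and
+; UF = 1, so the vector step is 4): @even_load_static_tc above runs 512 scalar
+; iterations, so n.mod.vf = 512 % 4 = 0; because the load group has a gap, the
+; select rewrites the remainder to the step, giving n.vec = 512 - 4 = 508 (the
+; 'icmp eq i64 %index.next, 508' check) and leaving scalar iterations for the
+; epilogue. In @even_load_dynamic_tc below, the same adjustment appears as the
+; icmp/select on %n.mod.vf in the min.iters.checked block.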
+
+; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
+;  for (unsigned i = 0; i < N; i+=2)
+;     B[i/2] = A[i] * 2;
+; }
+
+; CHECK-LABEL: @even_load_dynamic_tc(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: icmp eq i64 %index.next, %n.vec
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph
+
+define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx, align 4
+  %mul = shl nsw i32 %tmp, 1
+  %tmp1 = lshr exact i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
+  store i32 %mul, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp ult i64 %indvars.iv.next, %N
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Check vectorization on a reverse interleaved load group of factor 2 with 1
+; gap and a reverse interleaved store group of factor 2. The interleaved load
+; group should be removed since it has a gap and is reverse.
+
+; struct pair {
+;  int x;
+;  int y;
+; };
+;
+; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) {
+;  for (int i = 1023; i >= 0; i--) {
+;    int a = X + i;
+;    int b = P2[i].y - i;
+;    P1[i].x = a;
+;    P2[i].y = b;
+;  }
+; }
+
+; CHECK-LABEL: @load_gap_reverse(
+; CHECK-NOT: %wide.vec = load <8 x i64>, <8 x i64>* %{{.*}}, align 8
+; CHECK-NOT: %strided.vec = shufflevector <8 x i64> %wide.vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+
+%pair = type { i64, i64 }
+define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
+  %0 = add nsw i64 %X, %i
+  %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
+  %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
+  %3 = load i64, i64* %2, align 8
+  %4 = sub nsw i64 %3, %i
+  store i64 %0, i64* %1, align 8
+  store i64 %4, i64* %2, align 8
+  %i.next = add nsw i64 %i, -1
+  %cond = icmp sgt i64 %i, 0
+  br i1 %cond, label %for.body, label %for.exit
+
+for.exit:
+  ret void
+}
+
 ; Check vectorization on interleaved access groups identified from mixed
 ; loads/stores.
 ; void mixed_load2_store2(int *A, int *B) {