Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -833,7 +833,7 @@ public: InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, DominatorTree *DT) - : PSE(PSE), TheLoop(L), DT(DT) {} + : PSE(PSE), TheLoop(L), DT(DT), MayContainOOBMemAccs(false) {} ~InterleavedAccessInfo() { SmallSet DelSet; @@ -862,6 +862,10 @@ return nullptr; } + /// \brief Returns true if an interleaved group may contain any out-of-bounds + /// memory accesses. + bool mayContainOOBMemAccs() const { return MayContainOOBMemAccs; } + private: /// A wrapper around ScalarEvolution, used to add runtime SCEV checks. /// Simplifies SCEV expressions in the context of existing SCEV assumptions. @@ -871,6 +875,11 @@ Loop *TheLoop; DominatorTree *DT; + /// True if the loop may contain interleaved groups with out-of-bounds + /// accesses. We ensure we don't speculatively access memory out-of-bounds by + /// peeling off the last vector iteration. + bool MayContainOOBMemAccs; + /// Holds the relationships between the members and the interleave group. DenseMap InterleaveGroupMap; @@ -1336,6 +1345,12 @@ return InterleaveInfo.getInterleaveGroup(Instr); } + /// \brief Returns true if an interleaved group requires a scalar iteration + /// to handle accesses with gaps. + bool requiresScalarIteration() const { + return InterleaveInfo.mayContainOOBMemAccs(); + } + unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } bool hasStride(Value *V) { return StrideSet.count(V); } @@ -2873,6 +2888,13 @@ // times the unroll factor (num of SIMD instructions). Constant *Step = ConstantInt::get(TC->getType(), VF * UF); Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); + + // If there is an interleaved group with gaps, we need to ensure that there + // will be at least one iteration of the scalar loop. Add one vector step to + // the remainder. + if (VF > 1 && Legal->requiresScalarIteration()) + R = Builder.CreateAdd(R, Step, "n.adj"); + VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); return VectorTripCount; @@ -5104,11 +5126,17 @@ if (Group->getNumMembers() != Group->getFactor()) releaseGroup(Group); - // Remove interleaved load groups that don't have the first and last member. - // This guarantees that we won't do speculative out of bounds loads. + // If there is an interleaved load group with gaps, we will need to execute + // at least one scalar iteration. This will ensure that we don't + // speculatively access memory out-of-bounds. Note that we only need to look + // for a member at index factor - 1, since every group must have a member at + // index zero. for (InterleaveGroup *Group : LoadGroups) - if (!Group->getMember(0) || !Group->getMember(Group->getFactor() - 1)) - releaseGroup(Group); + if (!Group->getMember(Group->getFactor() - 1)) { + DEBUG(dbgs() << "LV: Interleaved group requires scalar iteration.\n"); + MayContainOOBMemAccs = true; + break; + } } LoopVectorizationCostModel::VectorizationFactor Index: test/Transforms/LoopVectorize/interleaved-accesses.ll =================================================================== --- test/Transforms/LoopVectorize/interleaved-accesses.ll +++ test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -284,18 +284,24 @@ } ; Check vectorization on an interleaved load group of factor 2 with 1 gap -; (missing the load of odd elements). +; (missing the load of odd elements). Because the vectorized loop would +; speculatively access memory out-of-bounds, we must execute at least one +; iteration of the scalar loop. -; void even_load(int *A, int *B) { +; void even_load_static_tc(int *A, int *B) { ; for (unsigned i = 0; i < 1024; i+=2) ; B[i/2] = A[i] * 2; ; } -; CHECK-LABEL: @even_load( -; CHECK-NOT: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 -; CHECK-NOT: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK-LABEL: @even_load_static_tc( +; CHECK: vector.body: +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: icmp eq i64 %index.next, 508 +; CHECK: middle.block: +; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph -define void @even_load(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) { +define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) { entry: br label %for.body @@ -315,6 +321,48 @@ br i1 %cmp, label %for.body, label %for.cond.cleanup } +; Check vectorization on an interleaved load group of factor 2 with 1 gap +; (missing the load of odd elements). Because the vectorized loop would +; speculatively access memory out-of-bounds, we must execute at least one +; iteration of the scalar loop. + +; void even_load_dynamic_tc(int *A, int *B, unsigned N) { +; for (unsigned i = 0; i < N; i+=2) +; B[i/2] = A[i] * 2; +; } + +; CHECK-LABEL: @even_load_dynamic_tc( +; CHECK: min.iters.checked: +; CHECK: %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3 +; CHECK: %n.adj = or i64 %n.mod.vf, 4 +; CHECK: %n.vec = sub i64 %[[N]], %n.adj +; CHECK: vector.body: +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: icmp eq i64 %index.next, %n.vec +; CHECK: middle.block: +; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph + +define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %tmp, 1 + %tmp1 = lshr exact i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1 + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, %N + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + ; Check vectorization on interleaved access groups identified from mixed ; loads/stores. ; void mixed_load2_store2(int *A, int *B) {