diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -19,8 +19,10 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
 
 using namespace llvm;
+using namespace PatternMatch;
 
 #define LV_NAME "loop-vectorize"
 #define DEBUG_TYPE LV_NAME
@@ -885,6 +887,12 @@
       if (C->canTrap())
         return false;
     }
+
+    // We can predicate blocks with calls to assume, as long as we drop them in
+    // case we flatten the CFG via predication.
+    if (match(&I, m_Intrinsic<Intrinsic::assume>()))
+      continue;
+
     // We might be able to hoist the load.
     if (I.mayReadFromMemory()) {
       auto *LI = dyn_cast<LoadInst>(&I);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -118,6 +118,7 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Use.h"
 #include "llvm/IR/User.h"
@@ -153,6 +154,8 @@
 
 using namespace llvm;
 
+using namespace PatternMatch;
+
 #define LV_NAME "loop-vectorize"
 #define DEBUG_TYPE LV_NAME
 
@@ -7280,6 +7283,15 @@
 }
 
 void VPReplicateRecipe::execute(VPTransformState &State) {
+  // If we flatten the control flow, drop assume calls.
+  if (match(Ingredient, m_Intrinsic<Intrinsic::assume>()) &&
+      State.CFG.PrevVPBB &&
+      State.CFG.PrevBB == State.Builder.GetInsertBlock()) {
+    LLVM_DEBUG(dbgs() << "LV: Dropping " << *Ingredient
+                      << " because we merged the containing block\n");
+    return;
+  }
+
   if (State.Instance) { // Generate a single instance.
     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
     // Insert scalar instance packing it into a vector.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -194,11 +194,12 @@
                     << " in BB:" << NewBB->getName() << '\n');
 
   State->CFG.VPBB2IRBB[this] = NewBB;
-  State->CFG.PrevVPBB = this;
 
   for (VPRecipeBase &Recipe : Recipes)
     Recipe.execute(*State);
 
+  State->CFG.PrevVPBB = this;
+
   VPValue *CBV;
   if (EnableVPlanNativePath && (CBV = getCondBit())) {
     Value *IRCBV = CBV->getUnderlyingValue();
diff --git a/llvm/test/Transforms/LoopVectorize/predicate-assume.ll b/llvm/test/Transforms/LoopVectorize/predicate-assume.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/predicate-assume.ll
@@ -0,0 +1,102 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -force-vector-width=4 -debug -S %s 2>&1 | FileCheck %s
+
+; Test case for PR43620. Make sure we can vectorize with predication in the
+; presence of assume calls. When generating code and flattening the CFG via
+; predication, we drop the assume calls, as sketched below.
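+;
+; A minimal sketch (for illustration; the blocks are taken from this test) of
+; why the assume has to be dropped: in the scalar loop the call only executes
+; when %cmp1 is false, i.e. it is guarded by control flow:
+;
+;   if.else:
+;     %cmp2 = icmp ult i64 %indvars.iv, 991232
+;     tail call void @llvm.assume(i1 %cmp2)
+;     br label %if.end5
+;
+; Once predication merges if.else into the flattened vector body, no guard
+; remains; keeping the call would assert %cmp2 unconditionally for every
+; iteration, which is unsound. VPReplicateRecipe::execute therefore detects
+; that the containing block was merged into its predecessor (PrevBB is still
+; the insert block) and skips the call.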
+
+; CHECK: digraph VPlan {
+; CHECK: N0 [label =
+; CHECK-NEXT: "for.body:\n" +
+; CHECK-NEXT: "WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next\l" +
+; CHECK-NEXT: "WIDEN\l" +
+; CHECK-NEXT: "  %cmp1 = icmp %indvars.iv, 495616\l"
+; CHECK-NEXT: ]
+; CHECK-NEXT: N0 -> N1 [ label=""]
+; CHECK-NEXT: N1 [label =
+; CHECK-NEXT: "if.else:\n" +
+; CHECK-NEXT: "WIDEN\l" +
+; CHECK-NEXT: "  %cmp2 = icmp %indvars.iv, 991232\l" +
+; CHECK-NEXT: "REPLICATE call %cmp2, @llvm.assume\l"
+; CHECK-NEXT: ]
+; CHECK-NEXT: N1 -> N2 [ label=""]
+; CHECK-NEXT: N2 [label =
+; CHECK-NEXT: "if.end5:\n" +
+; CHECK-NEXT: "EMIT %vp6568 = not %vp5040\l" +
+; CHECK-NEXT: "BLEND %x.0 = 4.200000e+01/%vp6568 2.300000e+01/%vp5040\l" +
+; CHECK-NEXT: "CLONE %arrayidx = getelementptr %a, %indvars.iv\l" +
+; CHECK-NEXT: "WIDEN %1 = load %arrayidx\l" +
+; CHECK-NEXT: "WIDEN\l" +
+; CHECK-NEXT: "  %mul = fmul %x.0, %1\l" +
+; CHECK-NEXT: "CLONE %arrayidx7 = getelementptr %b, %indvars.iv\l" +
+; CHECK-NEXT: "WIDEN store %mul, %arrayidx7\l"
+; CHECK-NEXT: ]
+; CHECK-NEXT: }
+
+define void @foo(float* noalias nocapture readonly %a, float* noalias nocapture %b, i32 %n) {
+; CHECK-LABEL: @foo(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult <4 x i64> [[VEC_IND]], <i64 495616, i64 495616, i64 495616, i64 495616>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult <4 x i64> [[VEC_IND]], <i64 991232, i64 991232, i64 991232, i64 991232>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x float> <float 2.300000e+01, float 2.300000e+01, float 2.300000e+01, float 2.300000e+01>, <4 x float> <float 4.200000e+01, float 4.200000e+01, float 4.200000e+01, float 4.200000e+01>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[PREDPHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast float* [[TMP13]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP11]], <4 x float>* [[TMP14]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], %n.vec
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop !0
+;
+entry:
+  %cmp15 = icmp eq i32 %n, 0
+  br i1 %cmp15, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %0 = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %if.end5
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %if.end5
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %if.end5 ]
+  %cmp1 = icmp ult i64 %indvars.iv, 495616
+  br i1 %cmp1, label %if.end5, label %if.else
+
+if.else:                                          ; preds = %for.body
+  %cmp2 = icmp ult i64 %indvars.iv, 991232
+  tail call void @llvm.assume(i1 %cmp2)
+  br label %if.end5
+
+if.end5:                                          ; preds = %for.body, %if.else
+  %x.0 = phi float [ 4.200000e+01, %if.else ], [ 2.300000e+01, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %1 = load float, float* %arrayidx, align 4
+  %mul = fmul float %x.0, %1
+  %arrayidx7 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  store float %mul, float* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp = icmp eq i64 %indvars.iv.next, %0
+  br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; Function Attrs: nounwind willreturn
+declare void @llvm.assume(i1)