diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5427,6 +5427,20 @@
   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
   for (auto *BB : TheLoop->blocks())
     for (auto &I : *BB) {
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
+        switch (II->getIntrinsicID()) {
+        case Intrinsic::sideeffect:
+        case Intrinsic::experimental_noalias_scope_decl:
+        case Intrinsic::assume:
+        case Intrinsic::lifetime_start:
+        case Intrinsic::lifetime_end:
+          if (TheLoop->hasLoopInvariantOperands(&I))
+            addToWorklistIfAllowed(&I);
+        default:
+          break;
+        }
+      }
+
       // If there's no pointer operand, there's nothing to do.
       auto *Ptr = getLoadStorePointerOperand(&I);
       if (!Ptr)
@@ -8943,6 +8957,36 @@
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
 
+  // Even if the instruction is not marked as uniform, there are certain
+  // intrinsic calls that can be effectively treated as such, so we check for
+  // them here. Conservatively, we only do this for scalable vectors, since
+  // for fixed-width VFs we can always fall back on full scalarization.
+  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
+    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
+    case Intrinsic::assume:
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+      // For scalable vectors if one of the operands is variant then we still
+      // want to mark as uniform, which will generate one instruction for just
+      // the first lane of the vector. We can't scalarize the call in the same
+      // way as for fixed-width vectors because we don't know how many lanes
+      // there are.
+      //
+      // The reasons for doing it this way for scalable vectors are:
+      //   1. For the assume intrinsic generating the instruction for the first
+      //      lane is still better than not generating any at all. For
+      //      example, the input may be a splat across all lanes.
+      //   2. For the lifetime start/end intrinsics the pointer operand only
+      //      does anything useful when the input comes from a stack object,
+      //      which suggests it should always be uniform. For non-stack objects
+      //      the effect is to poison the object, which still allows us to
+      //      remove the call.
+      IsUniform = true;
+    default:
+      break;
+    }
+  }
+
   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                        IsUniform, IsPredicated);
   setRecipe(I, Recipe);
diff --git a/llvm/test/Transforms/LoopVectorize/assume.ll b/llvm/test/Transforms/LoopVectorize/assume.ll
--- a/llvm/test/Transforms/LoopVectorize/assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/assume.ll
@@ -49,12 +49,8 @@
 ; CHECK: vector.body:
 ; CHECK: tail call void @llvm.assume(i1 [[MASKCOND]])
 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
 ; CHECK: tail call void @llvm.assume(i1 [[MASKCOND4]])
 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
 ; CHECK: for.body:
 entry:
   %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
diff --git a/llvm/test/Transforms/LoopVectorize/assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
copy from llvm/test/Transforms/LoopVectorize/assume.ll
copy to llvm/test/Transforms/LoopVectorize/scalable-assume.ll
--- a/llvm/test/Transforms/LoopVectorize/assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
@@ -1,20 +1,14 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S | FileCheck %s
+; RUN: opt < %s -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S | FileCheck %s
 
 define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
 ; CHECK-LABEL: @test1(
 ; CHECK: vector.body:
-; CHECK: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* {{.*}}, align 4
-; CHECK: [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* {{.*}}, align 4
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD]],
-; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD1]],
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP3]])
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP4]])
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP5]])
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP6]])
+; CHECK: [[FCMP1:%.*]] = fcmp ogt <vscale x 2 x float>
+; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt <vscale x 2 x float>
+; CHECK-NEXT: [[FCMP1L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP1]], i32 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1L0]])
+; CHECK-NEXT: [[FCMP2L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP2]], i32 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2L0]])
 entry:
   br label %for.body
 
@@ -29,7 +23,7 @@
   store float %add, float* %arrayidx5, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv, 1599
-  br i1 %exitcond, label %for.end, label %for.body
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
 
 for.end:                                          ; preds = %for.body
   ret void
@@ -41,47 +35,36 @@
 
 %struct.data = type { float*, float* }
 
-define void @test2(%struct.data* nocapture readonly %d) {
+define void @test2(float *%a, float *%b) {
 ; CHECK-LABEL: @test2(
 ; CHECK: entry:
-; CHECK: [[MASKCOND:%.*]] = icmp eq i64 %maskedptr, 0
-; CHECK: [[MASKCOND4:%.*]] = icmp eq i64 %maskedptr3, 0
+; CHECK: [[MASKCOND:%.*]] = icmp eq i64 %ptrint1, 0
+; CHECK: [[MASKCOND4:%.*]] = icmp eq i64 %ptrint2, 0
 ; CHECK: vector.body:
 ; CHECK: tail call void @llvm.assume(i1 [[MASKCOND]])
 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
 ; CHECK: tail call void @llvm.assume(i1 [[MASKCOND4]])
 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK: for.body:
 entry:
-  %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
-  %0 = load float*, float** %b, align 8
-  %ptrint = ptrtoint float* %0 to i64
-  %maskedptr = and i64 %ptrint, 31
-  %maskcond = icmp eq i64 %maskedptr, 0
-  %a = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 0
-  %1 = load float*, float** %a, align 8
-  %ptrint2 = ptrtoint float* %1 to i64
-  %maskedptr3 = and i64 %ptrint2, 31
-  %maskcond4 = icmp eq i64 %maskedptr3, 0
+  %ptrint1 = ptrtoint float* %a to i64
+  %maskcond = icmp eq i64 %ptrint1, 0
+  %ptrint2 = ptrtoint float* %b to i64
+  %maskcond4 = icmp eq i64 %ptrint2, 0
   br label %for.body
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   tail call void @llvm.assume(i1 %maskcond)
-  %arrayidx = getelementptr inbounds float, float* %0, i64 %indvars.iv
-  %2 = load float, float* %arrayidx, align 4
-  %add = fadd float %2, 1.000000e+00
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, 1.000000e+00
   tail call void @llvm.assume(i1 %maskcond4)
-  %arrayidx5 = getelementptr inbounds float, float* %1, i64 %indvars.iv
+  %arrayidx5 = getelementptr inbounds float, float* %b, i64 %indvars.iv
   store float %add, float* %arrayidx5, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv, 1599
-  br i1 %exitcond, label %for.end, label %for.body
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
 
 for.end:                                          ; preds = %for.body
   ret void
@@ -90,28 +73,17 @@
 ; Test case for PR43620. Make sure we can vectorize with predication in presence
 ; of assume calls. For now, check that we drop all assumes in predicated blocks
 ; in the vector body.
-define void @predicated_assume(float* noalias nocapture readonly %a, float* noalias nocapture %b, i32 %n) {
+define void @predicated_assume(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %n) {
 ; Check that the vector.body does not contain any assumes.
 ; CHECK-LABEL: @predicated_assume(
 ; CHECK: vector.body:
 ; CHECK-NOT: llvm.assume
 ; CHECK: for.body:
 entry:
-  %cmp15 = icmp eq i32 %n, 0
-  br i1 %cmp15, label %for.cond.cleanup, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  %0 = zext i32 %n to i64
   br label %for.body
 
-for.cond.cleanup.loopexit:                        ; preds = %if.end5
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  ret void
-
 for.body:                                         ; preds = %for.body.preheader, %if.end5
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %if.end5 ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end5 ]
   %cmp1 = icmp ult i64 %indvars.iv, 495616
   br i1 %cmp1, label %if.end5, label %if.else
 
@@ -123,11 +95,17 @@
 if.end5:                                          ; preds = %for.body, %if.else
   %x.0 = phi float [ 4.200000e+01, %if.else ], [ 2.300000e+01, %for.body ]
   %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
-  %1 = load float, float* %arrayidx, align 4
-  %mul = fmul float %x.0, %1
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %x.0, %0
   %arrayidx7 = getelementptr inbounds float, float* %b, i64 %indvars.iv
   store float %mul, float* %arrayidx7, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %cmp = icmp eq i64 %indvars.iv.next, %0
-  br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body
+  %cmp = icmp eq i64 %indvars.iv.next, %n
+  br i1 %cmp, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+
+for.cond.cleanup:                                 ; preds = %if.end5, %entry
+  ret void
 }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll b/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
@@ -0,0 +1,81 @@
+; RUN: opt -S -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure we can vectorize loops which contain lifetime markers.
+
+define void @test(i32 *%d) {
+; CHECK-LABEL: @test(
+; CHECK: entry:
+; CHECK: [[ALLOCA:%.*]] = alloca [1024 x i32], align 16
+; CHECK-NEXT: [[BC:%.*]] = bitcast [1024 x i32]* [[ALLOCA]] to i8*
+; CHECK: vector.body:
+; CHECK: call void @llvm.lifetime.end.p0i8(i64 4096, i8* [[BC]])
+; CHECK: store
+; CHECK: call void @llvm.lifetime.start.p0i8(i64 4096, i8* [[BC]])
+
+entry:
+  %arr = alloca [1024 x i32], align 16
+  %0 = bitcast [1024 x i32]* %arr to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
+  %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 8
+  store i32 100, i32* %arrayidx, align 8
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end, !llvm.loop !0
+
+for.end:
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
+  ret void
+}
+
+; CHECK-LABEL: @testloopvariant(
+; CHECK: entry:
+; CHECK: [[ALLOCA:%.*]] = alloca [1024 x i32], align 16
+; CHECK: vector.ph:
+; CHECK: [[TMP1:%.*]] = insertelement <vscale x 2 x [1024 x i32]*> poison, [1024 x i32]* %arr, i32 0
+; CHECK-NEXT: [[SPLAT_ALLOCA:%.*]] = shufflevector <vscale x 2 x [1024 x i32]*> [[TMP1]], <vscale x 2 x [1024 x i32]*> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK: [[BC_ALLOCA:%.*]] = bitcast <vscale x 2 x [1024 x i32]*> [[SPLAT_ALLOCA]] to <vscale x 2 x i8*>
+; CHECK-NEXT: [[ONE_LIFETIME:%.*]] = extractelement <vscale x 2 x i8*> [[BC_ALLOCA]], i32 0
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4096, i8* [[ONE_LIFETIME]])
+; CHECK: store
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4096, i8* [[ONE_LIFETIME]])
+
+define void @testloopvariant(i32 *%d) {
+entry:
+  %arr = alloca [1024 x i32], align 16
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = getelementptr [1024 x i32], [1024 x i32]* %arr, i32 0, i64 %indvars.iv
+  %1 = bitcast [1024 x i32]* %arr to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %1) #1
+  %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx, align 8
+  store i32 100, i32* %arrayidx, align 8
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %1) #1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll b/llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll
@@ -0,0 +1,127 @@
+; RUN: opt < %s -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s
+
+define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
+entry:
+  br label %for.body
+
+; CHECK-LABEL: @test1
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+02
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  %add = fadd float %0, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !5
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare void @llvm.experimental.noalias.scope.decl(metadata)
+
+%struct.data = type { float*, float* }
+
+define void @test2(float* %a, float* %b) {
+; CHECK-LABEL: @test2
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST:!.*]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST:!.*]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
+entry:
+  %ptrint = ptrtoint float* %b to i64
+  %maskcond = icmp eq i64 %ptrint, 0
+  %ptrint2 = ptrtoint float* %a to i64
+  %maskcond4 = icmp eq i64 %ptrint2, 0
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, 1.000000e+00
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !4)
+  %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !5
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @predicated_noalias_scope_decl(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %n) {
+
+; Check that the vector.body still contains a llvm.experimental.noalias.scope.decl
+
+; CHECK-LABEL: @predicated_noalias_scope_decl(
+; CHECK: vector.body:
+; CHECK: call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: scalar.ph:
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: if.else:
+; CHECK: call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: }
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %if.end5
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end5 ]
+  %cmp1 = icmp ult i64 %indvars.iv, 495616
+  br i1 %cmp1, label %if.end5, label %if.else
+
+if.else:                                          ; preds = %for.body
+  %cmp2 = icmp ult i64 %indvars.iv, 991232
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  br label %if.end5
+
+if.end5:                                          ; preds = %for.body, %if.else
+  %x.0 = phi float [ 4.200000e+01, %if.else ], [ 2.300000e+01, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %x.0, %0
+  %arrayidx7 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  store float %mul, float* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp = icmp eq i64 %indvars.iv.next, %n
+  br i1 %cmp, label %for.cond.cleanup, label %for.body, !llvm.loop !5
+
+for.cond.cleanup:                                 ; preds = %if.end5
+  ret void
+}
+
+!0 = !{ !1 }
+!1 = distinct !{ !1, !2 }
+!2 = distinct !{ !2 }
+!3 = distinct !{ !3, !2 }
+!4 = !{ !3 }
+!5 = distinct !{!5, !6}
+!6 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; CHECK: [[SCOPE0_LIST]] = !{[[SCOPE0:!.*]]}
+; CHECK: [[SCOPE0]] = distinct !{[[SCOPE0]], [[SCOPE0_DOM:!.*]]}
+; CHECK: [[SCOPE0_DOM]] = distinct !{[[SCOPE0_DOM]]}
+; CHECK: [[SCOPE4_LIST]] = !{[[SCOPE4:!.*]]}
+; CHECK: [[SCOPE4]] = distinct !{[[SCOPE4]], [[SCOPE0_DOM]]}