Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8906,6 +8906,42 @@
   }
 }
 
+bool VPRecipeBuilder::isUniformReplicate(Instruction *I, bool IsScalable) const {
+  switch (I->getOpcode()) {
+  case Instruction::Call: {
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(cast<CallInst>(I), TLI);
+    switch (ID) {
+    case Intrinsic::sideeffect:
+    case Intrinsic::experimental_noalias_scope_decl:
+      return true;
+    case Intrinsic::assume:
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+      if (OrigLoop->hasLoopInvariantOperands(I))
+        return true;
+      // For scalable vectors, if one of the operands is variant then we still
+      // want to mark the call as uniform, which will generate one instruction
+      // for just the first lane of the vector. We can't scalarize the call in
+      // the same way as for fixed-width vectors because we don't know how
+      // many lanes there are.
+      //
+      // The reasons for doing it this way for scalable vectors are:
+      //  1. For the assume intrinsic, generating the instruction for the
+      //     first lane may still be better than not generating any at all.
+      //     For example, the input may be a splat across all lanes.
+      //  2. For the lifetime start/end intrinsics, the pointer operand only
+      //     does anything useful when the input comes from an alloca, which
+      //     suggests it should always be uniform as we don't support
+      //     vectorizing allocas for scalable vectors. In the case when it is
+      //     not from an alloca, then the intrinsic would have no effect
+      //     anyway.
+      return IsScalable;
+    }
+  }
+  }
+  return false;
+}
+
 VPBasicBlock *VPRecipeBuilder::handleReplication(
     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
     VPlanPtr &Plan) {
@@ -8916,6 +8952,9 @@
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
 
+  if (!IsUniform)
+    IsUniform = isUniformReplicate(I, Range.Start.isScalable());
+
   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                        IsUniform, IsPredicated);
   setRecipe(I, Recipe);
Index: llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -171,6 +171,10 @@
       Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
       VPlanPtr &Plan);
 
+  /// Returns true if \p I should be treated as a uniform instruction, which
+  /// also uses the \p IsScalable flag to determine the result.
+  bool isUniformReplicate(Instruction *I, bool IsScalable) const;
+
   /// Add the incoming values from the backedge to reduction & first-order
   /// recurrence cross-iteration phis.
   void fixHeaderPhis();
Index: llvm/test/Transforms/LoopVectorize/assume.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/assume.ll
+++ llvm/test/Transforms/LoopVectorize/assume.ll
@@ -51,12 +51,8 @@
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; CHECK:         tail call void @llvm.assume(i1 [[MASKCOND]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
 ; CHECK:         tail call void @llvm.assume(i1 [[MASKCOND4]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND4]])
 entry:
   %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
   %0 = load float*, float** %b, align 8
Index: llvm/test/Transforms/LoopVectorize/scalable-assume.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-assume.ll
@@ -0,0 +1,125 @@
+; RUN: opt < %s -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S | FileCheck %s
+
+define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
+; CHECK-LABEL: @test1(
+; CHECK:       vector.body:
+; CHECK:         [[FCMP1:%.*]] = fcmp ogt <vscale x 2 x float>
+; CHECK-NEXT:    [[FCMP2:%.*]] = fcmp ogt <vscale x 2 x float>
+; CHECK-NEXT:    [[FCMP1L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP1]], i32 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[FCMP1L0]])
+; CHECK-NEXT:    [[FCMP2L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP2]], i32 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[FCMP2L0]])
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+02
+  tail call void @llvm.assume(i1 %cmp1)
+  %add = fadd float %0, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare void @llvm.assume(i1) #0
+
+attributes #0 = { nounwind willreturn }
+
+%struct.data = type { float*, float* }
+
+define void @test2(%struct.data* nocapture readonly %d) {
+; CHECK-LABEL: @test2(
+; CHECK:       entry:
+; CHECK:         [[MASKCOND:%.*]] = icmp eq i64 %maskedptr, 0
+; CHECK:         [[MASKCOND4:%.*]] = icmp eq i64 %maskedptr3, 0
+; CHECK:       vector.body:
+; CHECK:         tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK:         tail call void @llvm.assume(i1 [[MASKCOND4]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND4]])
+entry:
+  %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
+  %0 = load float*, float** %b, align 8
+  %ptrint = ptrtoint float* %0 to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  %a = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 0
+  %1 = load float*, float** %a, align 8
+  %ptrint2 = ptrtoint float* %1 to i64
+  %maskedptr3 = and i64 %ptrint2, 31
+  %maskcond4 = icmp eq i64 %maskedptr3, 0
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  tail call void @llvm.assume(i1 %maskcond)
+  %arrayidx = getelementptr inbounds float, float* %0, i64 %indvars.iv
+  %2 = load float, float* %arrayidx, align 4
+  %add = fadd float %2, 1.000000e+00
+  tail call void @llvm.assume(i1 %maskcond4)
+  %arrayidx5 = getelementptr inbounds float, float* %1, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Test case for PR43620. Make sure we can vectorize with predication in presence
+; of assume calls. For now, check that we drop all assumes in predicated blocks
+; in the vector body.
+define void @predicated_assume(float* noalias nocapture readonly %a, float* noalias nocapture %b, i32 %n) {
+; Check that the vector.body does not contain any assumes.
+; CHECK-LABEL: @predicated_assume(
+; CHECK:       vector.body:
+; CHECK-NOT:     llvm.assume
+; CHECK:       for.body:
+entry:
+  %cmp15 = icmp eq i32 %n, 0
+  br i1 %cmp15, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %0 = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %if.end5
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %if.end5
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %if.end5 ]
+  %cmp1 = icmp ult i64 %indvars.iv, 495616
+  br i1 %cmp1, label %if.end5, label %if.else
+
+if.else:                                          ; preds = %for.body
+  %cmp2 = icmp ult i64 %indvars.iv, 991232
+  tail call void @llvm.assume(i1 %cmp2)
+  br label %if.end5
+
+if.end5:                                          ; preds = %for.body, %if.else
+  %x.0 = phi float [ 4.200000e+01, %if.else ], [ 2.300000e+01, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %1 = load float, float* %arrayidx, align 4
+  %mul = fmul float %x.0, %1
+  %arrayidx7 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  store float %mul, float* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp = icmp eq i64 %indvars.iv.next, %0
+  br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
@@ -0,0 +1,81 @@
+; RUN: opt -S -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure we can vectorize loops which contain lifetime markers.
+
+define void @test(i32 *%d) {
+; CHECK-LABEL: @test(
+; CHECK:       entry:
+; CHECK:         [[ALLOCA:%.*]] = alloca [1024 x i32], align 16
+; CHECK-NEXT:    [[BC:%.*]] = bitcast [1024 x i32]* [[ALLOCA]] to i8*
+; CHECK:       vector.body:
+; CHECK:         call void @llvm.lifetime.end.p0i8(i64 4096, i8* [[BC]])
+; CHECK:         store
+; CHECK:         call void @llvm.lifetime.start.p0i8(i64 4096, i8* [[BC]])
+
+entry:
+  %arr = alloca [1024 x i32], align 16
+  %0 = bitcast [1024 x i32]* %arr to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
+  %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 8
+  store i32 100, i32* %arrayidx, align 8
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end, !llvm.loop !0
+
+for.end:
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
+  ret void
+}
+
+; CHECK-LABEL: @testloopvariant(
+; CHECK:       entry:
+; CHECK:         [[ALLOCA:%.*]] = alloca [1024 x i32], align 16
+; CHECK:       vector.ph:
+; CHECK:         [[TMP1:%.*]] = insertelement <vscale x 2 x [1024 x i32]*> poison, [1024 x i32]* %arr, i32 0
+; CHECK-NEXT:    [[SPLAT_ALLOCA:%.*]] = shufflevector <vscale x 2 x [1024 x i32]*> [[TMP1]], <vscale x 2 x [1024 x i32]*> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK:       vector.body:
+; CHECK:         [[BC_ALLOCA:%.*]] = bitcast <vscale x 2 x [1024 x i32]*> [[SPLAT_ALLOCA]] to <vscale x 2 x i8*>
+; CHECK-NEXT:    [[ONE_LIFETIME:%.*]] = extractelement <vscale x 2 x i8*> [[BC_ALLOCA]], i32 0
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 4096, i8* [[ONE_LIFETIME]])
+; CHECK:         store
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 4096, i8* [[ONE_LIFETIME]])
+
+define void @testloopvariant(i32 *%d) {
+entry:
+  %arr = alloca [1024 x i32], align 16
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = getelementptr [1024 x i32], [1024 x i32]* %arr, i32 0, i64 %indvars.iv
+  %1 = bitcast [1024 x i32]* %arr to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %1) #1
+  %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx, align 8
+  store i32 100, i32* %arrayidx, align 8
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %1) #1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll
@@ -0,0 +1,142 @@
+; RUN: opt < %s -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s
+
+define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
+entry:
+  br label %for.body
+
+; CHECK-LABEL: @test1
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+02
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  %add = fadd float %0, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !5
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare void @llvm.experimental.noalias.scope.decl(metadata)
+
+%struct.data = type { float*, float* }
+
+define void @test2(%struct.data* nocapture readonly %d) {
+entry:
+  %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
+  %0 = load float*, float** %b, align 8
+  %ptrint = ptrtoint float* %0 to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  %a = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 0
+  %1 = load float*, float** %a, align 8
+  %ptrint2 = ptrtoint float* %1 to i64
+  %maskedptr3 = and i64 %ptrint2, 31
+  %maskcond4 = icmp eq i64 %maskedptr3, 0
+  br label %for.body
+
+; CHECK-LABEL: @test2
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST:!.*]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST:!.*]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  %arrayidx = getelementptr inbounds float, float* %0, i64 %indvars.iv
+  %2 = load float, float* %arrayidx, align 4
+  %add = fadd float %2, 1.000000e+00
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !4)
+  %arrayidx5 = getelementptr inbounds float, float* %1, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !5
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @predicated_noalias_scope_decl(float* noalias nocapture readonly %a, float* noalias nocapture %b, i32 %n) {
+
+; Check that the vector.body still contains a llvm.experimental.noalias.scope.decl
+
+; CHECK-LABEL: @predicated_noalias_scope_decl(
+; CHECK: vector.body:
+; CHECK: call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: scalar.ph:
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: if.else:
+; CHECK: call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: }
+
+entry:
+  %cmp15 = icmp eq i32 %n, 0
+  br i1 %cmp15, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %0 = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %if.end5
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %if.end5
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %if.end5 ]
+  %cmp1 = icmp ult i64 %indvars.iv, 495616
+  br i1 %cmp1, label %if.end5, label %if.else
+
+if.else:                                          ; preds = %for.body
+  %cmp2 = icmp ult i64 %indvars.iv, 991232
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  br label %if.end5
+
+if.end5:                                          ; preds = %for.body, %if.else
+  %x.0 = phi float [ 4.200000e+01, %if.else ], [ 2.300000e+01, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %1 = load float, float* %arrayidx, align 4
+  %mul = fmul float %x.0, %1
+  %arrayidx7 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  store float %mul, float* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp = icmp eq i64 %indvars.iv.next, %0
+  br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !5
+}
+
+!0 = !{ !1 }
+!1 = distinct !{ !1, !2 }
+!2 = distinct !{ !2 }
+!3 = distinct !{ !3, !2 }
+!4 = !{ !3 }
+!5 = distinct !{!5, !6}
+!6 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; CHECK: [[SCOPE0_LIST]] = !{[[SCOPE0:!.*]]}
+; CHECK: [[SCOPE0]] = distinct !{[[SCOPE0]], [[SCOPE0_DOM:!.*]]}
+; CHECK: [[SCOPE0_DOM]] = distinct !{[[SCOPE0_DOM]]}
+; CHECK: [[SCOPE4_LIST]] = !{[[SCOPE4:!.*]]}
+; CHECK: [[SCOPE4]] = distinct !{[[SCOPE4]], [[SCOPE0_DOM]]}