diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -811,7 +811,7 @@
   LLVM_DEBUG(dbgs() << "Analyze temporal divergence: " << Context.print(&I)
                     << "\n");
-  if (!usesValueFromCycle(I, OuterDivCycle))
+  if (!isAlwaysUniform(I) && !usesValueFromCycle(I, OuterDivCycle))
     return;
 
   if (isAlwaysUniform(I))
     return;
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -73,8 +73,7 @@
 template <>
 bool llvm::GenericUniformityAnalysisImpl<SSAContext>::usesValueFromCycle(
     const Instruction &I, const Cycle &DefCycle) const {
-  if (isAlwaysUniform(I))
-    return false;
+  assert(!isAlwaysUniform(I));
   for (const Use &U : I.operands()) {
     if (auto *I = dyn_cast<Instruction>(&U)) {
       if (DefCycle.contains(I->getParent()))
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -102,7 +102,12 @@
     if (!Op.isReg() || !Op.readsReg())
       continue;
     auto Reg = Op.getReg();
-    assert(Reg.isVirtual());
+
+    // FIXME: Physical registers need to be checked separately
+    // assert(Reg.isVirtual());
+    if (Reg.isPhysical())
+      continue;
+
     auto *Def = F.getRegInfo().getVRegDef(Reg);
     if (DefCycle.contains(Def->getParent()))
       return true;
diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir
new file
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir
@@ -0,0 +1,229 @@
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx1030 -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+
+--- |
+  define amdgpu_cs void @f2() #0 {
+  bb:
+    br label %bb1, !amdgpu.uniform !0
+
+  bb1:                                              ; preds = %bb1, %bb
+    %i = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+    %i2 = icmp eq i32 %i, 0
+    br i1 %i2, label %bb3, label %bb1, !amdgpu.uniform !0
+
+  bb3:                                              ; preds = %bb1
+    %i4 = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+    ret void
+  }
+
+  define void @f1() #0 {
+  bb:
+    br label %bb1, !amdgpu.uniform !0
+
+  bb1:                                              ; preds = %Flow, %bb
+    %phi.broken = phi i32 [ %6, %Flow ], [ 0, %bb ]
+    %i = phi i32 [ 0, %bb ], [ %4, %Flow ]
+    %i2 = phi i32 [ 0, %bb ], [ %3, %Flow ]
+    %i5 = icmp sge i32 %i, 0
+    br label %bb3, !amdgpu.uniform !0
+
+  bb3:                                              ; preds = %bb1, %bb3
+    %i4 = phi i32 [ 0, %bb3 ], [ %i, %bb1 ]
+    br i1 %i5, label %bb6, label %bb3, !amdgpu.uniform !0
+
+  bb6:                                              ; preds = %bb3
+    %i4.lcssa = phi i32 [ %i4, %bb3 ]
+    %idxprom1 = sext i32 %i2 to i64
+    %i7 = getelementptr [2 x i32], ptr null, i64 0, i64 %idxprom1, !amdgpu.uniform !0
+    %i8 = load i32, ptr %i7, align 4
+    %i9 = icmp slt i32 %i8, 0
+    %0 = call { i1, i32 } @llvm.amdgcn.if.i32(i1 %i9)
+    %1 = extractvalue { i1, i32 } %0, 0
+    %2 = extractvalue { i1, i32 } %0, 1
+    br i1 %1, label %bb11, label %Flow
+
+  Flow:                                             ; preds = %bb11, %bb6
+    %3 = phi i32 [ %i12, %bb11 ], [ undef, %bb6 ]
+    %4 = phi i32 [ %i4.lcssa, %bb11 ], [ undef, %bb6 ]
+    %5 = phi i1 [ false, %bb11 ], [ true, %bb6 ]
+    call void @llvm.amdgcn.end.cf.i32(i32 %2)
+    %6 = call i32 @llvm.amdgcn.if.break.i32(i1 %5, i32 %phi.broken)
+    %7 = call i1 @llvm.amdgcn.loop.i32(i32 %6)
+    br i1 %7, label %bb10, label %bb1
+
+  bb10:                                             ; preds = %Flow
+    %.lcssa = phi i32 [ %6, %Flow ]
+    call void @llvm.amdgcn.end.cf.i32(i32 %.lcssa)
+    ret void
+
+  bb11:                                             ; preds = %bb6
+    %i12 = or i32 %i2, 1
+    br label %Flow, !amdgpu.uniform !0
+  }
+
+  declare i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32>, i32, i32, i32) #1
+
+  ; Function Attrs: convergent nocallback nofree nounwind willreturn
+  declare { i1, i32 } @llvm.amdgcn.if.i32(i1) #2
+
+  ; Function Attrs: convergent nocallback nofree nounwind willreturn
+  declare { i1, i32 } @llvm.amdgcn.else.i32.i32(i32) #2
+
+  ; Function Attrs: convergent nocallback nofree nounwind willreturn memory(none)
+  declare i32 @llvm.amdgcn.if.break.i32(i1, i32) #3
+
+  ; Function Attrs: convergent nocallback nofree nounwind willreturn
+  declare i1 @llvm.amdgcn.loop.i32(i32) #2
+
+  ; Function Attrs: convergent nocallback nofree nounwind willreturn
+  declare void @llvm.amdgcn.end.cf.i32(i32) #2
+
+  attributes #0 = { "target-cpu"="gfx1030" "uniform-work-group-size"="false" }
+  attributes #1 = { "target-cpu"="gfx1030" }
+  attributes #2 = { convergent nocallback nofree nounwind willreturn }
+  attributes #3 = { convergent nocallback nofree nounwind willreturn memory(none) }
+
+  !0 = !{}
+
+...
+---
+name:            f2
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: MachineUniformityInfo for function: f2
+  bb.1.bb:
+
+  bb.2.bb1:
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+    ADJCALLSTACKUP 0, 0, implicit-def $scc
+    %45:sreg_64(p4) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @llvm.amdgcn.raw.atomic.buffer.load.i32 + 4, target-flags(amdgpu-gotprel32-hi) @llvm.amdgcn.raw.atomic.buffer.load.i32 + 12, implicit-def $scc
+    %4:_(p0) = G_LOAD %45(p4) :: (dereferenceable invariant load (p0) from got, addrspace 4)
+    %5:_(p4) = G_IMPLICIT_DEF
+    %7:_(p4) = G_CONSTANT i64 36
+    %10:_(s64) = G_IMPLICIT_DEF
+    %11:_(s32) = G_IMPLICIT_DEF
+    %42:_(s32) = G_CONSTANT i32 0
+    $vgpr0 = COPY %42(s32)
+    $vgpr1 = COPY %42(s32)
+    $vgpr2 = COPY %42(s32)
+    $vgpr3 = COPY %42(s32)
+    $vgpr4 = COPY %42(s32)
+    $vgpr5 = COPY %42(s32)
+    $vgpr6 = COPY %42(s32)
+    %20:_(<4 x s32>) = COPY $private_rsrc_reg
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %20(<4 x s32>)
+    $sgpr4_sgpr5 = COPY %5(p4)
+    $sgpr6_sgpr7 = COPY %5(p4)
+    $sgpr8_sgpr9 = COPY %7(p4)
+    $sgpr10_sgpr11 = COPY %10(s64)
+    $sgpr12 = COPY %11(s32)
+    $sgpr13 = COPY %11(s32)
+    $sgpr14 = COPY %11(s32)
+    $sgpr15 = COPY %11(s32)
+    $vgpr31 = COPY %11(s32)
+    $sgpr30_sgpr31 = G_SI_CALL %4(p0), @llvm.amdgcn.raw.atomic.buffer.load.i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
+    %1:_(s32) = COPY $vgpr0
+    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+    ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s1) = G_ICMP
+    %21:_(s1) = G_ICMP intpred(ne), %1(s32), %42
+    G_BRCOND %21(s1), %bb.2
+    G_BR %bb.3
+
+  bb.3.bb3:
+    ADJCALLSTACKUP 0, 0, implicit-def $scc
+    %44:sreg_64(p4) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @llvm.amdgcn.raw.atomic.buffer.load.i32 + 4, target-flags(amdgpu-gotprel32-hi) @llvm.amdgcn.raw.atomic.buffer.load.i32 + 12, implicit-def $scc
+    %23:_(p0) = G_LOAD %44(p4) :: (dereferenceable invariant load (p0) from got, addrspace 4)
+    %24:_(p4) = G_IMPLICIT_DEF
+    %26:_(p4) = G_CONSTANT i64 36
+    %29:_(s64) = G_IMPLICIT_DEF
+    %30:_(s32) = G_IMPLICIT_DEF
+    %43:_(s32) = G_CONSTANT i32 0
+    $vgpr0 = COPY %43(s32)
+    $vgpr1 = COPY %43(s32)
+    $vgpr2 = COPY %43(s32)
+    $vgpr3 = COPY %43(s32)
+    $vgpr4 = COPY %43(s32)
+    $vgpr5 = COPY %43(s32)
+    $vgpr6 = COPY %43(s32)
+    %39:_(<4 x s32>) = COPY $private_rsrc_reg
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %39(<4 x s32>)
+    $sgpr4_sgpr5 = COPY %24(p4)
+    $sgpr6_sgpr7 = COPY %24(p4)
+    $sgpr8_sgpr9 = COPY %26(p4)
+    $sgpr10_sgpr11 = COPY %29(s64)
+    $sgpr12 = COPY %30(s32)
+    $sgpr13 = COPY %30(s32)
+    $sgpr14 = COPY %30(s32)
+    $sgpr15 = COPY %30(s32)
+    $vgpr31 = COPY %30(s32)
+    $sgpr30_sgpr31 = G_SI_CALL %23(p0), @llvm.amdgcn.raw.atomic.buffer.load.i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0
+    ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+    S_ENDPGM 0
+
+...
+
+---
+name:            f1
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: MachineUniformityInfo for function: f1
+  bb.1:
+    %3:_(s32) = G_CONSTANT i32 0
+    %25:_(s32) = G_IMPLICIT_DEF
+
+  bb.2:
+    %0:_(s32) = G_PHI %22(s32), %bb.5, %3(s32), %bb.1
+    %1:_(s32) = G_PHI %3(s32), %bb.1, %20(s32), %bb.5
+    %2:_(s32) = G_PHI %3(s32), %bb.1, %19(s32), %bb.5
+    %36:_(s32) = G_CONSTANT i32 0
+    %4:_(s1) = G_ICMP intpred(slt), %1(s32), %36
+
+  bb.3:
+    successors: %bb.4(0x04000000), %bb.3(0x7c000000)
+
+    %5:_(s32) = G_PHI %38(s32), %bb.3, %1(s32), %bb.2
+    %38:_(s32) = G_CONSTANT i32 0
+    G_BRCOND %4(s1), %bb.3
+    G_BR %bb.4
+
+  bb.4:
+    successors: %bb.7, %bb.5
+
+    %6:_(s32) = G_PHI %5(s32), %bb.3
+    %33:_(s1) = G_CONSTANT i1 true
+    %7:_(s64) = G_SEXT %2(s32)
+    %39:_(s32) = G_CONSTANT i32 2
+    %10:_(s64) = G_SHL %7, %39(s32)
+    %11:_(p0) = G_INTTOPTR %10(s64)
+    %13:_(s32) = G_LOAD %11(p0) :: (load (s32))
+    %37:_(s32) = G_CONSTANT i32 0
+    %14:sreg_32_xm0_xexec(s1) = G_ICMP intpred(slt), %13(s32), %37
+    %16:sreg_32_xm0_xexec(s32) = SI_IF %14(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.7
+
+  bb.5:
+    successors: %bb.6(0x04000000), %bb.2(0x7c000000)
+    ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s32) = G_PHI
+    ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s32) = G_PHI
+    ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s32) = G_PHI
+    ; CHECK-NOT: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break)
+    %19:_(s32) = G_PHI %18(s32), %bb.7, %25(s32), %bb.4
+    %20:_(s32) = G_PHI %6(s32), %bb.7, %25(s32), %bb.4
+    %21:_(s1) = G_PHI %34(s1), %bb.7, %33(s1), %bb.4
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %16(s32)
+    %22:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %21(s1), %0(s32)
+    SI_LOOP %22(s32), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.6
+
+  bb.6:
+    %24:_(s32) = G_PHI %22(s32), %bb.5
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %24(s32)
+    SI_RETURN
+
+  bb.7:
+    %34:_(s1) = G_CONSTANT i1 false
+    %35:_(s32) = G_CONSTANT i32 1
+    %18:_(s32) = G_OR %2, %35
+    G_BR %bb.5
+
+...