diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -359,8 +359,7 @@
   /// \returns Whether the tracked divergence state of \p DivVal changed.
   bool markDivergent(const InstructionT &I);
   bool markDivergent(ConstValueRefT DivVal);
-  bool markDefsDivergent(const InstructionT &Instr,
-                         bool AllDefsDivergent = true);
+  bool markDefsDivergent(const InstructionT &Instr);
 
   /// \brief Propagate divergence to all instructions in the region.
   /// Divergence is seeded by calls to \p markDivergent.
diff --git a/llvm/include/llvm/CodeGen/RegisterBankInfo.h b/llvm/include/llvm/CodeGen/RegisterBankInfo.h
--- a/llvm/include/llvm/CodeGen/RegisterBankInfo.h
+++ b/llvm/include/llvm/CodeGen/RegisterBankInfo.h
@@ -587,6 +587,11 @@
   /// Get the total number of register banks.
   unsigned getNumRegBanks() const { return NumRegBanks; }
 
+  /// Returns true if the register bank is considered divergent.
+  virtual bool isDivergentRegBank(const RegisterBank *RB) const {
+    return false;
+  }
+
   /// Get a register bank that covers \p RC.
   ///
   /// \pre \p RC is a user-defined register class (as opposed as one
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -20,7 +20,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RegisterBank.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/MC/LaneBitmask.h"
 #include "llvm/MC/MCRegisterInfo.h"
@@ -557,6 +557,12 @@
     return false;
   }
 
+  /// Returns true if the register is considered uniform.
+  virtual bool isUniformReg(const MachineRegisterInfo &MRI,
+                            const RegisterBankInfo &RBI, Register Reg) const {
+    return false;
+  }
+
   /// Physical registers that may be modified within a function but are
   /// guaranteed to be restored before any uses. This is useful for targets that
   /// have call sequences where a GOT register may be updated by the caller
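The two hooks above are meant to compose: an assigned register bank is authoritative, and isUniformReg() is the target's class-plus-type fallback. Below is a minimal sketch, not part of this patch, of how a consumer might use them together, mirroring what markDefsDivergent() does after this change; the helper name and the markDivergent callback are hypothetical, only the hook signatures come from the headers above.

// seedDivergentDefs: hypothetical helper illustrating the query order.
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"

using namespace llvm;

// Seed divergence for every virtual def of MI that the target cannot prove
// uniform.
static void seedDivergentDefs(const MachineInstr &MI,
                              const MachineFunction &MF,
                              function_ref<void(Register)> markDivergent) {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // getRegBankInfo() can be null on targets without GlobalISel; the patch
  // dereferences it unconditionally, presumably because the analysis is only
  // meaningful on targets that provide one.
  const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo();
  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
  for (const MachineOperand &Op : MI.operands()) {
    if (!Op.isReg() || !Op.isDef() || !Op.getReg().isVirtual())
      continue;
    // isUniformReg() consults the register bank first (via
    // RBI.isDivergentRegBank) and falls back to register class plus LLT.
    if (!TRI.isUniformReg(MRI, RBI, Op.getReg()))
      markDivergent(Op.getReg());
  }
}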
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -26,7 +26,7 @@
 
 template <>
 bool llvm::GenericUniformityAnalysisImpl<SSAContext>::markDefsDivergent(
-    const Instruction &Instr, bool AllDefsDivergent) {
+    const Instruction &Instr) {
   return markDivergent(&Instr);
 }
 
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -31,9 +31,10 @@
 
 template <>
 bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::markDefsDivergent(
-    const MachineInstr &Instr, bool AllDefsDivergent) {
+    const MachineInstr &Instr) {
   bool insertedDivergent = false;
   const auto &MRI = F.getRegInfo();
+  const auto &RBI = *F.getSubtarget().getRegBankInfo();
   const auto &TRI = *MRI.getTargetRegisterInfo();
   for (auto &op : Instr.operands()) {
     if (!op.isReg() || !op.isDef())
@@ -41,11 +42,8 @@
     if (!op.getReg().isVirtual())
       continue;
     assert(!op.getSubReg());
-    if (!AllDefsDivergent) {
-      auto *RC = MRI.getRegClassOrNull(op.getReg());
-      if (RC && !TRI.isDivergentRegClass(RC))
-        continue;
-    }
+    if (TRI.isUniformReg(MRI, RBI, op.getReg()))
+      continue;
     insertedDivergent |= markDivergent(op.getReg());
   }
   return insertedDivergent;
@@ -64,7 +62,8 @@
     }
     if (uniformity == InstructionUniformity::NeverUniform) {
-      markDefsDivergent(instr, /* AllDefsDivergent = */ false);
+      if (markDivergent(instr))
+        Worklist.push_back(&instr);
     }
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -168,6 +168,8 @@
 public:
   AMDGPURegisterBankInfo(const GCNSubtarget &STI);
 
+  bool isDivergentRegBank(const RegisterBank *RB) const override;
+
   unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
                     unsigned Size) const override;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -215,6 +215,10 @@
   return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
 }
 
+bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
+  return RB != &AMDGPU::SGPRRegBank;
+}
+
 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                           const RegisterBank &Src,
                                           unsigned Size) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -420,6 +420,7 @@
   let Size = 8;
   let isBranch = 1;
   let hasSideEffects = 1;
+  let IsNeverUniform = 1;
 }
 
 } // End isTerminator = 1
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -286,10 +286,17 @@
     return isVGPR(MRI, Reg) || isAGPR(MRI, Reg);
   }
 
+  // FIXME: SGPRs are assumed to be uniform, but this is not true for i1 SGPRs
+  // (such as VCC) which hold a wave-wide vector of boolean values. Examining
+  // just the register class is not sufficient; it needs to be combined with a
+  // value type. The next predicate isUniformReg() does this correctly.
   bool isDivergentRegClass(const TargetRegisterClass *RC) const override {
     return !isSGPRClass(RC);
   }
 
+  bool isUniformReg(const MachineRegisterInfo &MRI, const RegisterBankInfo &RBI,
+                    Register Reg) const override;
+
   ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
                                      unsigned EltSize) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2865,6 +2865,37 @@
   return MCRegister();
 }
 
+bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
+                                  const RegisterBankInfo &RBI,
+                                  Register Reg) const {
+  // First check register banks, since they provide accurate information.
+  auto *RB = MRI.getRegBankOrNull(Reg);
+  if (RB && RB->isValid()) {
+    return !RBI.isDivergentRegBank(RB);
+  }
+
+  // Then try to retrieve the register class, to be combined with the value
+  // type if available.
+  auto *RC = MRI.getRegClassOrNull(Reg);
+  if (!RC)
+    return false;
+
+  // Only SGPRs can be uniform, but not always.
+  if (!isSGPRClass(RC))
+    return false;
+
+  // The register has SGPR class. It is uniform iff it is not type s1. The
+  // type is unknown only for inline assembly, and presumably it is not s1 in
+  // the vast majority of cases. Returning an s1 def from inline assembly
+  // needs more work, but it is usually safe to assume a uniform SGPR at this
+  // point.
+  auto RegType = MRI.getType(Reg);
+  if (!RegType.isValid())
+    return true;
+
+  assert(RegType.isScalar() && "non-scalar types are divergent");
+  return RegType != LLT::scalar(1);
+}
+
 ArrayRef<int16_t>
 SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                  unsigned EltSize) const {
   const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
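To make the s1 special case concrete, here is an illustrative-only restatement of the class/type fallback above; the helper is invented for this note, and the LLT header path varies across LLVM versions.

// Sketch only: why an SGPR def is not automatically uniform. In a wave, an
// s1 value is one bit per lane; storing it in a scalar register (like VCC)
// makes that register a lane mask, i.e. a packed vector of booleans.
#include "llvm/Support/LowLevelTypeImpl.h" // LLT; moved in newer LLVM trees

using namespace llvm;

static bool sgprTypeLooksUniform(LLT Ty) {
  if (!Ty.isValid())
    return true; // no LLT recorded (e.g. inline asm): assume uniform
  return Ty != LLT::scalar(1); // s1 in an SGPR is a lane mask -> divergent
}

// sgprTypeLooksUniform(LLT::scalar(32)) -> true  (ordinary uniform scalar)
// sgprTypeLooksUniform(LLT::scalar(1))  -> false (wave-wide condition mask)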
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform-gmir.mir
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform-gmir.mir
@@ -86,8 +86,7 @@
   bb.0:
     liveins: $vgpr0
     ; CHECK-LABEL: MachineUniformityInfo for function: asm_sgpr
-    ; FIXME: This is backwards
-    ; CHECK: DIVERGENT: %1
+    ; CHECK-NOT: DIVERGENT: %1
 
     %0:_(s32) = COPY $vgpr0
     %2:vgpr_32 = COPY %0(s32)
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics.mir
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics.mir
@@ -12,11 +12,9 @@
     %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
     %5:vreg_64 = COPY %3
     %6:vreg_64 = COPY %3
-    ; CHECK: DIVERGENT
-    ; CHECK-SAME: FLAT_ATOMIC_SWAP_RTN
+    ; CHECK: DIVERGENT{{.*}}FLAT_ATOMIC_SWAP_RTN
    %4:vgpr_32 = FLAT_ATOMIC_SWAP_RTN killed %5, %2, 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ; CHECK: DIVERGENT
-    ; CHECK-SAME: FLAT_ATOMIC_SWAP_RTN
+    ; CHECK: DIVERGENT{{.*}}FLAT_ATOMIC_SWAP_RTN
    %7:vgpr_32 = FLAT_ATOMIC_SWAP_RTN killed %6, %2, 0, 1, implicit $exec, implicit $flat_scr ; No memoperands
    $vgpr0 = COPY %4
    SI_RETURN implicit $vgpr0
@@ -36,8 +34,7 @@
    %5:sreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
    %7:vreg_64 = COPY %4
    %8:vreg_64 = COPY %5
-    ; CHECK: DIVERGENT
-    ; CHECK-SAME: FLAT_ATOMIC_CMPSWAP_RTN
+    ; CHECK: DIVERGENT{{.*}}FLAT_ATOMIC_CMPSWAP_RTN
    %6:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN killed %7, killed %8, 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst seq_cst (s32))
    %9:sreg_64_xexec = V_CMP_EQ_U32_e64 %6, %2, implicit $exec
    %10:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec
@@ -57,8 +54,7 @@
    %0:vgpr_32 = IMPLICIT_DEF
    %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
    %5:vreg_64 = COPY %3
-    ; CHECK: DIVERGENT
-    ; CHECK-SAME: GLOBAL_ATOMIC_INC_RTN
+    ; CHECK: DIVERGENT{{.*}}GLOBAL_ATOMIC_INC_RTN
    %4:vgpr_32 = GLOBAL_ATOMIC_INC_RTN killed %5, %2, 0, 1, implicit $exec :: (load store (s32), addrspace 1)
    $vgpr0 = COPY %4
    SI_RETURN implicit $vgpr0
@@ -78,8 +74,7 @@
    %5:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
    %7:vreg_64 = COPY %5
    %8:vreg_64 = COPY %4
-    ; CHECK: DIVERGENT
-    ; CHECK-SAME: GLOBAL_ATOMIC_INC_X2_RTN
+    ; CHECK: DIVERGENT{{.*}}GLOBAL_ATOMIC_INC_X2_RTN
    %6:vreg_64 = GLOBAL_ATOMIC_INC_X2_RTN killed %7, killed %8, 0, 1, implicit $exec :: (load store (s64), addrspace 1)
    %9:vgpr_32 = COPY %6.sub1
    %10:vgpr_32 = COPY %6.sub0
@@ -99,8 +94,7 @@
    %0:vgpr_32 = IMPLICIT_DEF
    %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
    %5:vreg_64 = COPY %3
-    ; CHECK: DIVERGENT
-    ; CHECK-SAME: GLOBAL_ATOMIC_DEC_RTN
+    ; CHECK: DIVERGENT{{.*}}GLOBAL_ATOMIC_DEC_RTN
    %4:vgpr_32 = GLOBAL_ATOMIC_DEC_RTN killed %5, %2, 0, 1, implicit $exec :: (load store (s32), addrspace 1)
    $vgpr0 = COPY %4
    SI_RETURN implicit $vgpr0
@@ -121,8 +115,7 @@
    %5:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
    %7:vreg_64 = COPY %5
    %8:vreg_64 = COPY %4
-    ; CHECK: DIVERGENT
-    ; CHECK-SAME: GLOBAL_ATOMIC_DEC_X2_RTN
+    ; CHECK: DIVERGENT{{.*}}GLOBAL_ATOMIC_DEC_X2_RTN
    %6:vreg_64 = GLOBAL_ATOMIC_DEC_X2_RTN killed %7, killed %8, 0, 1, implicit $exec :: (load store (s64), addrspace 1)
    %9:vgpr_32 = COPY %6.sub1
    %10:vgpr_32 = COPY %6.sub0
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir
rename from llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-diverge-gmir.mir
rename to llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-diverge-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir
@@ -49,7 +49,7 @@
     %4:_(s32) = G_PHI %1(s32), %bb.0, %7(s32), %bb.2
     %5:_(s1) = G_ICMP intpred(slt), %1(s32), %2(s32)
-    G_BRCOND %5(s1), %bb.3
+    G_BRCOND %5(s1), %bb.3 ; Divergent exit
     G_BR %bb.2
 
   bb.2:
     successors: %bb.4, %bb.1
@@ -57,7 +57,7 @@
     %6:_(s32) = G_CONSTANT i32 1
     %7:_(s32) = G_ADD %6(s32), %4(s32)
     %8:_(s1) = G_ICMP intpred(sgt), %2(s32), %1(s32)
-    G_BRCOND %8(s1), %bb.4
+    G_BRCOND %8(s1), %bb.4 ; Divergent exit
     G_BR %bb.1
 
   bb.3:
     successors: %bb.4, %bb.5
@@ -69,7 +69,7 @@
 
   bb.4:
     successors: %bb.5
-    %10:_(s32) = G_PHI %21(s32), %bb.3, %22(s32), %bb.2
+    %10:_(s32) = G_PHI %21(s32), %bb.3, %22(s32), %bb.2 ; Temporal divergent phi
     G_BR %bb.5
 
   bb.5:
     %11:_(s32) = G_PHI %20(s32), %bb.3, %22(s32), %bb.4
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir
@@ -97,7 +97,11 @@
   bb.0:
     ; CHECK-LABEL: MachineUniformityInfo for function: writelane
     ; CHECK: DIVERGENT: %4
-    ; CHECK: DIVERGENT: %5
+
+    ; Note how %5 is the result of a vector compare, but it is reported as
+    ; uniform because it is stored in an sreg.
+    ; CHECK-NOT: DIVERGENT: %5
+
    %0:vgpr_32 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir
@@ -0,0 +1,122 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+
+---
+
+# CHECK-LABEL: MachineUniformityInfo for function: _amdgpu_ps_main
+name: _amdgpu_ps_main
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_32, preferred-register: '' }
+  - { id: 1, class: sreg_64_xexec, preferred-register: '' }
+liveins:
+  - { reg: '$sgpr4', virtual-reg: '%0' }
+body: |
+
+  ; CHECK-LABEL: BLOCK bb.0
+  ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = COPY $vgpr
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+    %2:_(s32) = COPY $sgpr2
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s32) = G_IMPLICIT_DEF
+    %5:_(s32) = G_CONSTANT i32 1
+    %6:_(s64) = G_CONSTANT i64 0
+    %7:_(s32) = G_FCONSTANT float -5.000000e-01
+    %8:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.getpc)
+    %9:_(s64) = G_CONSTANT i64 -4294967296
+    %10:_(s64) = G_AND %8, %9
+    %11:_(s64) = G_ZEXT %2(s32)
+    %12:_(s64) = G_OR %10, %11
+    %13:_(p4) = G_INTTOPTR %12(s64)
+    %14:_(<4 x s32>) = G_LOAD %13(p4) :: (invariant load (<4 x s32>), addrspace 4)
+    %15:_(s32) = G_CONSTANT i32 0
+    %16:_(s32) = G_AMDGPU_S_BUFFER_LOAD %14(<4 x s32>), %15(s32), 0 :: (dereferenceable invariant load (s32))
+    %17:_(s32) = G_FCONSTANT float 1.250000e-01
+    %18:_(s32) = nnan nsz arcp contract afn reassoc G_FMUL %16, %17
+    %19:_(s32) = nnan nsz arcp contract afn reassoc G_FRINT %18
+    %20:_(s32) = nnan nsz arcp contract afn reassoc G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %19(s32)
+
+  ; CHECK-LABEL: BLOCK bb.1
+  bb.1:
+    successors: %bb.4(0x40000000), %bb.2(0x40000000)
+
+    %21:_(s64) = G_PHI %1(s64), %bb.5, %6(s64), %bb.0
+    %22:_(s32) = G_PHI %5(s32), %bb.0, %23(s32), %bb.5
+    %24:_(s32) = G_PHI %7(s32), %bb.0, %25(s32), %bb.5
+    %26:_(s1) = G_CONSTANT i1 true
+    %27:_(s32) = G_CONSTANT i32 31
+    %28:_(s32) = G_AND %22, %27
+    %29:_(s32) = G_CONSTANT i32 0
+    %30:_(s1) = G_ICMP intpred(ne), %28(s32), %29
+    G_BRCOND %30(s1), %bb.4
+    G_BR %bb.2
+
+  ; CHECK-LABEL: BLOCK bb.2
+  bb.2:
+    successors: %bb.3(0x40000000), %bb.5(0x40000000)
+
+    %31:_(s32) = G_PHI %32(s32), %bb.4, %4(s32), %bb.1
+    %33:_(s1) = G_PHI %34(s1), %bb.4, %26(s1), %bb.1
+    %35:_(s1) = G_CONSTANT i1 true
+    %36:_(s1) = G_XOR %33, %35
+    G_BRCOND %36(s1), %bb.5
+    G_BR %bb.3
+
+  ; CHECK-LABEL: BLOCK bb.3
+  bb.3:
+    successors: %bb.5(0x80000000)
+
+    %37:_(s32) = G_FCONSTANT float 0x3FD99999A0000000
+    %38:_(s32) = nnan nsz arcp contract afn reassoc G_FADD %24, %37
+    G_BR %bb.5
+
+  ; CHECK-LABEL: BLOCK bb.4
+  bb.4:
+    successors: %bb.2(0x80000000)
+
+    %34:_(s1) = G_CONSTANT i1 false
+    %39:_(s32) = G_SITOFP %22(s32)
+    %40:_(s32) = nnan nsz arcp contract afn reassoc G_FMUL %39, %20
+    %41:_(s32) = nnan nsz arcp contract afn reassoc G_FFLOOR %40
+    %42:_(s32) = G_FNEG %41
+    %43:_(s32) = G_FMA %42, %19, %39
+    %44:_(s32) = G_FCONSTANT float 0x3F847AE140000000
+    %45:_(s1) = G_FCMP floatpred(ugt), %43(s32), %44
+    %46:_(s32) = G_FCONSTANT float 1.000000e+02
+    %47:_(s32) = nnan nsz arcp contract afn reassoc G_FADD %24, %46
+    %32:_(s32) = G_SELECT %45(s1), %24, %47
+    G_BR %bb.2
+
+  ; CHECK-LABEL: BLOCK bb.5
+  ; CHECK: TERMINATORS
+  ; CHECK: DIVERGENT: SI_LOOP
+  ; CHECK: DIVERGENT: G_BR
+  bb.5:
+    successors: %bb.6(0x40000000), %bb.1(0x40000000)
+
+    %25:_(s32) = G_PHI %38(s32), %bb.3, %31(s32), %bb.2
+    %48:_(s32) = G_SITOFP %22(s32)
+    %49:_(s1) = G_FCMP floatpred(oge), %48(s32), %3
+    %50:_(s32) = G_CONSTANT i32 1
+    %23:_(s32) = nuw nsw G_ADD %22, %50
+    %51:_(s32) = G_CONSTANT i32 799
+    %52:_(s1) = G_ICMP intpred(eq), %22(s32), %51
+    %53:_(s1) = G_OR %49, %52
+    %1:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %53(s1), %21(s64)
+    SI_LOOP %1(s64), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.6
+
+  ; CHECK-LABEL: BLOCK bb.6
+  ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI
+  bb.6:
+    %54:_(s32) = G_PHI %25(s32), %bb.5
+    %55:_(s64) = G_PHI %1(s64), %bb.5
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %55(s64)
+    %56:_(s32) = G_FCONSTANT float 1.000000e+00
+    %57:_(s32) = G_FCONSTANT float 0.000000e+00
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 12, 15, %54(s32), %54(s32), %57(s32), %56(s32), -1, 0
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir
rename from llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge.mir
rename to llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir
@@ -1,23 +1,18 @@
 # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+# This test was generated using SelectionDAG, where the compilation flow does
+# not match the assumptions made in MachineUA. For now, this test mostly serves
+# the purpose of catching any crash when invoking MachineUA. The test should
+# be deleted when it is clear that it is not actually testing anything useful.
+
 ---
 # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge
 # CHECK-LABEL: BLOCK bb.0
 # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vgpr_32(s32) = COPY $vgpr0
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_64 = V_CMP_GT_I32_e64
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_64 = V_CMP_LT_I32_e64
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vreg_1 = COPY
-# CHECK: DIVERGENT: %{{[0-9]*}}:sreg_64 = SI_IF
-# CHECK: DIVERGENT: S_BRANCH %bb.1
 # CHECK-LABEL: BLOCK bb.2
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_32 = PHI %{{[0-9]*}}:sreg_32, %bb.0, %{{[0-9]*}}:sreg_32, %bb.1
 # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vreg_1 = PHI %{{[0-9]*}}:vreg_1, %bb.0, %{{[0-9]*}}:sreg_64, %bb.1
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_64 = COPY %{{[0-9]*}}:vreg_1
-# CHECK: DIVERGENT: %{{[0-9]*}}:sreg_64 = SI_IF %{{[0-9]*}}:sreg_64, %bb.4
-# CHECK: DIVERGENT: S_BRANCH %bb.3
 # CHECK-LABEL: BLOCK bb.3
 # CHECK-LABEL: BLOCK bb.4
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vgpr_32 = PHI %{{[0-9]*}}:sreg_32, %bb.2, %{{[0-9]*}}:sreg_32, %bb.3
 
 name:            hidden_diverge
 tracksRegLiveness: true
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/irreducible-1.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/irreducible-1.mir
rename from llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/irreducible-1.mir
rename to llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/irreducible-1.mir
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/irreducible-1.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/irreducible-1.mir
@@ -1,11 +1,11 @@
 # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+# This test was generated using SelectionDAG, where the compilation flow does
+# not match the assumptions made in MachineUA. For now, this test mostly serves
+# the purpose of catching any crash when invoking MachineUA. The test should
+# be deleted when it is clear that it is not actually testing anything useful.
+
 # CHECK-LABEL: MachineUniformityInfo for function: irreducible
-# CHECK: CYCLES ASSSUMED DIVERGENT:
-# CHECK:   depth=1: entries(bb.2 bb.1) bb.3 bb.5 bb.4
-# CHECK: CYCLES WITH DIVERGENT EXIT:
-# CHECK-DAG:  depth=1: entries(bb.2 bb.1) bb.3 bb.5 bb.4
-# CHECK-DAG:  depth=2: entries(bb.3 bb.1) bb.5 bb.4
 
 ---
 name:            irreducible
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/temporal_diverge.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/temporal_diverge.ll
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/temporal_diverge.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/temporal_diverge.ll
@@ -20,6 +20,7 @@
 ; CHECK: DIVERGENT: br i1 %div.exitx,
 
 X:
+; CHECK: DIVERGENT: %div.user =
   %div.user = add i32 %uni.inc, 5
   ret void
 }
@@ -47,16 +48,19 @@
 ; CHECK: DIVERGENT: br i1 %div.exitx,
 
 X:
+; CHECK: DIVERGENT: %div.user =
   %div.user = add i32 %uni.inc, 5
   br i1 %uni.cond, label %G, label %Y
 
 Y:
+; CHECK: DIVERGENT: %div.alsouser =
   %div.alsouser = add i32 %uni.inc, 5
   ret void
 }
 
-; temporal-uniform use of a valud, definition and users are carried by a surrounding divergent loop
+; temporal-uniform use of a value, definition and users are carried by a
+; surrounding divergent loop
 define amdgpu_kernel void @temporal_uniform_indivloop(i32 %n, i32 %a, i32 %b) #0 {
 ; CHECK-LABEL: for function 'temporal_uniform_indivloop':
 ; CHECK-NOT: DIVERGENT: %uni.
@@ -73,7 +77,7 @@
 H:
   %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ]
   %uni.inc = add i32 %uni.merge.h, 1
-  br i1 %uni.cond, label %X, label %H ; divergent branch
+  br i1 %uni.cond, label %X, label %H
 
 X:
   %uni.user = add i32 %uni.inc, 5
@@ -83,6 +87,7 @@
 ; CHECK: DIVERGENT: br i1 %div.exity,
 
 Y:
+; CHECK: DIVERGENT: %div.alsouser =
   %div.alsouser = add i32 %uni.inc, 5
   ret void
 }
@@ -111,6 +116,7 @@
   br label %G
 
 G:
+; C HECK: DIVERGENT: %div.user =
   %div.user = add i32 %uni.inc, 5
   br i1 %uni.cond, label %G, label %Y
 
@@ -118,7 +124,8 @@
   ret void
 }
 
-; temporal-divergent use of value carried by divergent loop, user is inside sibling loop, defs and use are carried by a uniform loop
+; temporal-divergent use of value carried by divergent loop, user is inside
+; sibling loop, defs and use are carried by a uniform loop
 define amdgpu_kernel void @temporal_diverge_loopuser_nested(i32 %n, i32 %a, i32 %b) #0 {
 ; CHECK-LABEL: for function 'temporal_diverge_loopuser_nested':
 ; CHECK-NOT: DIVERGENT: %uni.
@@ -141,6 +148,7 @@
   br label %G
 
 G:
+; C HECK: DIVERGENT: %div.user =
   %div.user = add i32 %uni.inc, 5
   br i1 %uni.cond, label %G, label %Y
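For completeness, here is a sketch of how a downstream machine pass could consume the analysis these tests print. It assumes the legacy MachineUniformityAnalysisPass wrapper (the analysis behind the print-machine-uniformity pass used above) and its getUniformityInfo() accessor as found in this era of the tree; the pass itself is hypothetical.

// DivergenceQueryExample: hypothetical consumer of MachineUniformityInfo.
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"

using namespace llvm;

namespace {
struct DivergenceQueryExample : public MachineFunctionPass {
  static char ID;
  DivergenceQueryExample() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineUniformityAnalysisPass>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    const MachineUniformityInfo &MUI =
        getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
    unsigned NumDivergentDefs = 0;
    for (const MachineBasicBlock &MBB : MF)
      for (const MachineInstr &MI : MBB)
        for (const MachineOperand &Op : MI.defs())
          // Defs seeded by markDefsDivergent() plus everything reached by
          // propagation (including temporal divergence) report as divergent.
          if (Op.isReg() && Op.getReg().isVirtual() &&
              MUI.isDivergent(Op.getReg()))
            ++NumDivergentDefs;
    (void)NumDivergentDefs; // e.g. feed a statistic or a sinking legality check
    return false;
  }
};
char DivergenceQueryExample::ID = 0;
} // end anonymous namespace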