diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -359,8 +359,7 @@ /// \returns Whether the tracked divergence state of \p DivVal changed. bool markDivergent(const InstructionT &I); bool markDivergent(ConstValueRefT DivVal); - bool markDefsDivergent(const InstructionT &Instr, - bool AllDefsDivergent = true); + bool markDefsDivergent(const InstructionT &Instr); /// \brief Propagate divergence to all instructions in the region. /// Divergence is seeded by calls to \p markDivergent. diff --git a/llvm/include/llvm/CodeGen/RegisterBankInfo.h b/llvm/include/llvm/CodeGen/RegisterBankInfo.h --- a/llvm/include/llvm/CodeGen/RegisterBankInfo.h +++ b/llvm/include/llvm/CodeGen/RegisterBankInfo.h @@ -587,6 +587,11 @@ /// Get the total number of register banks. unsigned getNumRegBanks() const { return NumRegBanks; } + /// Returns true if the register bank is considered divergent. + virtual bool isDivergentRegBank(const RegisterBank *RB) const { + return false; + } + /// Get a register bank that covers \p RC. /// /// \pre \p RC is a user-defined register class (as opposed as one diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -20,7 +20,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/RegisterBank.h" #include "llvm/IR/CallingConv.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCRegisterInfo.h" @@ -557,6 +557,12 @@ return false; } + /// Returns true if the register is considered uniform. 
+ virtual bool isUniformReg(const MachineRegisterInfo &MRI, + const RegisterBankInfo &RBI, Register Reg) const { + return false; + } + /// Physical registers that may be modified within a function but are /// guaranteed to be restored before any uses. This is useful for targets that /// have call sequences where a GOT register may be updated by the caller diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp --- a/llvm/lib/Analysis/UniformityAnalysis.cpp +++ b/llvm/lib/Analysis/UniformityAnalysis.cpp @@ -26,7 +26,7 @@ template <> bool llvm::GenericUniformityAnalysisImpl::markDefsDivergent( - const Instruction &Instr, bool AllDefsDivergent) { + const Instruction &Instr) { return markDivergent(&Instr); } diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp --- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp +++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp @@ -31,9 +31,10 @@ template <> bool llvm::GenericUniformityAnalysisImpl::markDefsDivergent( - const MachineInstr &Instr, bool AllDefsDivergent) { + const MachineInstr &Instr) { bool insertedDivergent = false; const auto &MRI = F.getRegInfo(); + const auto &RBI = *F.getSubtarget().getRegBankInfo(); const auto &TRI = *MRI.getTargetRegisterInfo(); for (auto &op : Instr.operands()) { if (!op.isReg() || !op.isDef()) @@ -41,11 +42,8 @@ if (!op.getReg().isVirtual()) continue; assert(!op.getSubReg()); - if (!AllDefsDivergent) { - auto *RC = MRI.getRegClassOrNull(op.getReg()); - if (RC && !TRI.isDivergentRegClass(RC)) - continue; - } + if (TRI.isUniformReg(MRI, RBI, op.getReg())) + continue; insertedDivergent |= markDivergent(op.getReg()); } return insertedDivergent; @@ -64,7 +62,8 @@ } if (uniformity == InstructionUniformity::NeverUniform) { - markDefsDivergent(instr, /* AllDefsDivergent = */ false); + if (markDivergent(instr)) + Worklist.push_back(&instr); } } } diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -168,6 +168,8 @@ public: AMDGPURegisterBankInfo(const GCNSubtarget &STI); + bool isDivergentRegBank(const RegisterBank *RB) const override; + unsigned copyCost(const RegisterBank &A, const RegisterBank &B, unsigned Size) const override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -215,6 +215,10 @@ return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID; } +bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const { + return RB != &AMDGPU::SGPRRegBank; +} + unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, const RegisterBank &Src, unsigned Size) const { diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -420,6 +420,7 @@ let Size = 8; let isBranch = 1; let hasSideEffects = 1; + let IsNeverUniform = 1; } } // End isTerminator = 1 diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -286,10 +286,17 @@ return isVGPR(MRI, Reg) || isAGPR(MRI, Reg); } + // FIXME: SGPRs are assumed to be uniform, but this is not true for i1 SGPRs + // (such as VCC) which hold a wave-wide vector of boolean values. Examining + // just the register class is not sufficient; it needs to be combined with a + // value type. The next predicate isUniformReg() does this correctly. 
bool isDivergentRegClass(const TargetRegisterClass *RC) const override { return !isSGPRClass(RC); } + bool isUniformReg(const MachineRegisterInfo &MRI, const RegisterBankInfo &RBI, + Register Reg) const override; + ArrayRef getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2865,6 +2865,16 @@ return MCRegister(); } +bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI, + const RegisterBankInfo &RBI, + Register Reg) const { + auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo()); + if (!RB) + return false; + + return !RBI.isDivergentRegBank(RB); +} + ArrayRef SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const { const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC); diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform-gmir.mir --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform-gmir.mir @@ -86,8 +86,7 @@ bb.0: liveins: $vgpr0 ; CHECK-LABEL: MachineUniformityInfo for function: asm_sgpr - ; FIXME: This is backwards - ; CHECK: DIVERGENT: %1 + ; CHECK-NOT: DIVERGENT: %1 %0:_(s32) = COPY $vgpr0 %2:vgpr_32 = COPY %0(s32) diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics.mir --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics.mir @@ -12,11 +12,9 @@ %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 %5:vreg_64 = COPY %3 %6:vreg_64 = COPY %3 - ; CHECK: DIVERGENT - ; CHECK-SAME: FLAT_ATOMIC_SWAP_RTN + ; CHECK: 
DIVERGENT{{.*}}FLAT_ATOMIC_SWAP_RTN %4:vgpr_32 = FLAT_ATOMIC_SWAP_RTN killed %5, %2, 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) - ; CHECK: DIVERGENT - ; CHECK-SAME: FLAT_ATOMIC_SWAP_RTN + ; CHECK: DIVERGENT{{.*}}FLAT_ATOMIC_SWAP_RTN %7:vgpr_32 = FLAT_ATOMIC_SWAP_RTN killed %6, %2, 0, 1, implicit $exec, implicit $flat_scr ; No memopernads $vgpr0 = COPY %4 SI_RETURN implicit $vgpr0 @@ -36,8 +34,7 @@ %5:sreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1 %7:vreg_64 = COPY %4 %8:vreg_64 = COPY %5 - ; CHECK: DIVERGENT - ; CHECK-SAME: FLAT_ATOMIC_CMPSWAP_RTN + ; CHECK: DIVERGENT{{.*}}FLAT_ATOMIC_CMPSWAP_RTN %6:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN killed %7, killed %8, 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst seq_cst (s32)) %9:sreg_64_xexec = V_CMP_EQ_U32_e64 %6, %2, implicit $exec %10:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec @@ -57,8 +54,7 @@ %0:vgpr_32 = IMPLICIT_DEF %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 %5:vreg_64 = COPY %3 - ; CHECK: DIVERGENT - ; CHECK-SAME: GLOBAL_ATOMIC_INC_RTN + ; CHECK: DIVERGENT{{.*}}GLOBAL_ATOMIC_INC_RTN %4:vgpr_32 = GLOBAL_ATOMIC_INC_RTN killed %5, %2, 0, 1, implicit $exec :: (load store (s32), addrspace 1) $vgpr0 = COPY %4 SI_RETURN implicit $vgpr0 @@ -78,8 +74,7 @@ %5:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 %7:vreg_64 = COPY %5 %8:vreg_64 = COPY %4 - ; CHECK: DIVERGENT - ; CHECK-SAME: GLOBAL_ATOMIC_INC_X2_RTN + ; CHECK: DIVERGENT{{.*}}GLOBAL_ATOMIC_INC_X2_RTN %6:vreg_64 = GLOBAL_ATOMIC_INC_X2_RTN killed %7, killed %8, 0, 1, implicit $exec :: (load store (s64), addrspace 1) %9:vgpr_32 = COPY %6.sub1 %10:vgpr_32 = COPY %6.sub0 @@ -99,8 +94,7 @@ %0:vgpr_32 = IMPLICIT_DEF %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 %5:vreg_64 = COPY %3 - ; CHECK: DIVERGENT - ; CHECK-SAME: GLOBAL_ATOMIC_DEC_RTN + ; CHECK: DIVERGENT{{.*}}GLOBAL_ATOMIC_DEC_RTN %4:vgpr_32 = GLOBAL_ATOMIC_DEC_RTN killed %5, %2, 0, 1, implicit $exec 
:: (load store (s32), addrspace 1) $vgpr0 = COPY %4 SI_RETURN implicit $vgpr0 @@ -121,8 +115,7 @@ %5:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 %7:vreg_64 = COPY %5 %8:vreg_64 = COPY %4 - ; CHECK: DIVERGENT - ; CHECK-SAME: GLOBAL_ATOMIC_DEC_X2_RTN + ; CHECK: DIVERGENT{{.*}}GLOBAL_ATOMIC_DEC_X2_RTN %6:vreg_64 = GLOBAL_ATOMIC_DEC_X2_RTN killed %7, killed %8, 0, 1, implicit $exec :: (load store (s64), addrspace 1) %9:vgpr_32 = COPY %6.sub1 %10:vgpr_32 = COPY %6.sub0 diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir rename from llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-diverge-gmir.mir rename to llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir @@ -49,7 +49,7 @@ %4:_(s32) = G_PHI %1(s32), %bb.0, %7(s32), %bb.2 %5:_(s1) = G_ICMP intpred(slt), %1(s32), %2(s32) - G_BRCOND %5(s1), %bb.3 + G_BRCOND %5(s1), %bb.3 ; Divergent exit G_BR %bb.2 bb.2: successors: %bb.4, %bb.1 @@ -57,7 +57,7 @@ %6:_(s32) = G_CONSTANT i32 1 %7:_(s32) = G_ADD %6(s32), %4(s32) %8:_(s1) = G_ICMP intpred(sgt), %2(s32), %1(s32) - G_BRCOND %8(s1), %bb.4 + G_BRCOND %8(s1), %bb.4 ; Divergent exit G_BR %bb.1 bb.3: successors: %bb.4, %bb.5 @@ -69,7 +69,7 @@ bb.4: successors: %bb.5 - %10:_(s32) = G_PHI %21(s32), %bb.3, %22(s32), %bb.2 + %10:_(s32) = G_PHI %21(s32), %bb.3, %22(s32), %bb.2 ; Temporal divergent phi G_BR %bb.5 bb.5: %11:_(s32) = G_PHI %20(s32), %bb.3, %22(s32), %bb.4 diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir @@ -97,7 +97,11 @@ 
bb.0: ; CHECK-LABEL: MachineUniformityInfo for function: writelane ; CHECK: DIVERGENT: %4 - ; CHECK: DIVERGENT: %5 + + ; Note how %5 is the result of a vector compare, but it is reported as + ; uniform because it is stored in an sreg. + ; CHECK-NOT: DIVERGENT: %5 + %0:vgpr_32 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir @@ -0,0 +1,385 @@ +# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s + +--- +# CHECK-LABEL: MachineUniformityInfo for function: temporal_diverge +name: temporal_diverge +alignment: 1 +legalized: true +tracksRegLiveness: true +registers: + - { id: 3, class: _ } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sgpr_32 } +liveins: + - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' } + - { reg: '$vgpr0', virtual-reg: '%4' } + - { reg: '$sgpr2', virtual-reg: '%5' } + - { reg: '$sgpr3', virtual-reg: '%6' } +body: | + bb.1: + liveins: $sgpr0_sgpr1 + + %15:_(s64) = G_CONSTANT i64 0 + + bb.2: + successors: %bb.3, %bb.2 + + %11:_(s64) = G_PHI %12(s64), %bb.2, %15(s64), %bb.1 + %18:_(s1) = G_CONSTANT i1 false + %12:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %18(s1), %11(s64) + ; CHECK: DIVERGENT: SI_LOOP + SI_LOOP %12(s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.3 + + bb.3: + ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI + %14:_(s64) = G_PHI %12(s64), %bb.2 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s64) + S_ENDPGM 0 + +... 
+--- +# CHECK-LABEL: MachineUniformityInfo for function: phi_at_exit +name: phi_at_exit +alignment: 1 +legalized: true +tracksRegLiveness: true +registers: + - { id: 3, class: _ } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sgpr_32 } +liveins: + - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' } + - { reg: '$vgpr0', virtual-reg: '%4' } + - { reg: '$sgpr2', virtual-reg: '%5' } + - { reg: '$sgpr3', virtual-reg: '%6' } +body: | + bb.1: + successors: %bb.2, %bb.3 + liveins: $sgpr0_sgpr1 + + %3:_(p4) = COPY $sgpr0_sgpr1 + %7:_(p4) = COPY %3(p4) + %8:_(s64) = G_CONSTANT i64 40 + %9:_(p4) = G_PTR_ADD %7, %8(s64) + %10:_(s32) = G_LOAD %9(p4) :: (load (s32), addrspace 4) + %11:_(s32) = G_CONSTANT i32 0 + %12:_(s1) = G_ICMP intpred(sge), %10(s32), %11 + G_BRCOND %12(s1), %bb.3 + G_BR %bb.2 + + bb.2: + %24:_(s64) = G_CONSTANT i64 0 + %14:_(s1) = G_CONSTANT i1 false + G_BR %bb.4 + + bb.3: + G_BR %bb.6 + + bb.4: + successors: %bb.5, %bb.4 + + %15:_(s64) = G_PHI %24(s64), %bb.2, %16(s64), %bb.4 + %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %14(s1), %15(s64) + ; CHECK: DIVERGENT: SI_LOOP + SI_LOOP %16(s64), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.5 + + bb.5: + ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI + %18:_(s64) = G_PHI %16(s64), %bb.4 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64) + G_BR %bb.3 + + bb.6: + S_ENDPGM 0 + +... 
+--- +# CHECK-LABEL: MachineUniformityInfo for function: phi_after_exit +name: phi_after_exit +alignment: 1 +legalized: true +tracksRegLiveness: true +registers: + - { id: 3, class: _ } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sgpr_32 } +liveins: + - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' } + - { reg: '$vgpr0', virtual-reg: '%4' } + - { reg: '$sgpr2', virtual-reg: '%5' } + - { reg: '$sgpr3', virtual-reg: '%6' } +body: | + bb.1: + successors: %bb.2, %bb.3 + liveins: $sgpr0_sgpr1 + + %3:_(p4) = COPY $sgpr0_sgpr1 + %7:_(p4) = COPY %3(p4) + %8:_(s64) = G_CONSTANT i64 40 + %9:_(p4) = G_PTR_ADD %7, %8(s64) + %10:_(s32) = G_LOAD %9(p4) :: (dereferenceable invariant load (s32), addrspace 4) + %11:_(s32) = G_CONSTANT i32 0 + %12:_(s1) = G_ICMP intpred(sge), %10(s32), %11 + G_BRCOND %12(s1), %bb.3 + G_BR %bb.2 + + bb.2: + %24:_(s64) = G_CONSTANT i64 0 + %14:_(s1) = G_CONSTANT i1 false + G_BR %bb.4 + + bb.3: + G_BR %bb.6 + + bb.4: + successors: %bb.5, %bb.4 + + %15:_(s64) = G_PHI %24(s64), %bb.2, %16(s64), %bb.4 + %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %14(s1), %15(s64) + ; CHECK: DIVERGENT: SI_LOOP + SI_LOOP %16(s64), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.5 + + bb.5: + ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI + %18:_(s64) = G_PHI %16(s64), %bb.4 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64) + G_BR %bb.3 + + bb.6: + S_ENDPGM 0 + +... 
+--- +# CHECK-LABEL: MachineUniformityInfo for function: temporal_diverge_inloop +name: temporal_diverge_inloop +alignment: 1 +legalized: true +tracksRegLiveness: true +registers: + - { id: 3, class: _ } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sgpr_32 } +liveins: + - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' } + - { reg: '$vgpr0', virtual-reg: '%4' } + - { reg: '$sgpr2', virtual-reg: '%5' } + - { reg: '$sgpr3', virtual-reg: '%6' } +body: | + bb.1: + liveins: $sgpr0_sgpr1 + + %3:_(p4) = COPY $sgpr0_sgpr1 + %7:_(p4) = COPY %3(p4) + %8:_(s64) = G_CONSTANT i64 40 + %9:_(p4) = G_PTR_ADD %7, %8(s64) + %10:_(s32) = G_LOAD %9(p4) :: (dereferenceable invariant load (s32), addrspace 4) + %12:_(s32) = G_CONSTANT i32 0 + %13:_(s1) = G_ICMP intpred(slt), %10(s32), %12 + + bb.2: + %25:_(s64) = G_CONSTANT i64 0 + + bb.3: + successors: %bb.4, %bb.3 + + %15:_(s64) = G_PHI %25(s64), %bb.2, %16(s64), %bb.3 + %24:_(s1) = G_CONSTANT i1 false + %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %24(s1), %15(s64) + ; CHECK: DIVERGENT: SI_LOOP + SI_LOOP %16(s64), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.4 + + bb.4: + ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI + successors: %bb.5, %bb.2 + + %18:_(s64) = G_PHI %16(s64), %bb.3 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64) + G_BRCOND %13(s1), %bb.2 + G_BR %bb.5 + + bb.5: + S_ENDPGM 0 + +... 
+--- +# CHECK-LABEL: MachineUniformityInfo for function: temporal_uniform_indivloop +name: temporal_uniform_indivloop +alignment: 1 +legalized: true +tracksRegLiveness: true +registers: + - { id: 3, class: _ } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sgpr_32 } +liveins: + - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' } + - { reg: '$vgpr0', virtual-reg: '%4' } + - { reg: '$sgpr2', virtual-reg: '%5' } + - { reg: '$sgpr3', virtual-reg: '%6' } +body: | + bb.1: + liveins: $sgpr0_sgpr1 + + %3:_(p4) = COPY $sgpr0_sgpr1 + %19:_(s64) = G_CONSTANT i64 0 + %7:_(p4) = COPY %3(p4) + %8:_(s64) = G_CONSTANT i64 40 + %9:_(p4) = G_PTR_ADD %7, %8(s64) + %10:_(s32) = G_LOAD %9(p4) :: (dereferenceable invariant load (s32), addrspace 4) + %12:_(s32) = G_CONSTANT i32 0 + %13:_(s1) = G_ICMP intpred(sge), %10(s32), %12 + + bb.2: + %15:_(s64) = G_PHI %16(s64), %bb.4, %19(s64), %bb.1 + %24:_(s1) = G_CONSTANT i1 true + %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %24(s1), %15(s64) + + bb.3: + successors: %bb.4, %bb.3 + + G_BRCOND %13(s1), %bb.3 + G_BR %bb.4 + + bb.4: + successors: %bb.5, %bb.2 + + ; CHECK: DIVERGENT: SI_LOOP + SI_LOOP %16(s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.5 + + bb.5: + ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI + %18:_(s64) = G_PHI %16(s64), %bb.4 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64) + S_ENDPGM 0 + +... 
+--- +# CHECK-LABEL: MachineUniformityInfo for function: temporal_diverge_loopuser +name: temporal_diverge_loopuser +alignment: 1 +legalized: true +tracksRegLiveness: true +registers: + - { id: 3, class: _ } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sgpr_32 } +liveins: + - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' } + - { reg: '$vgpr0', virtual-reg: '%4' } + - { reg: '$sgpr2', virtual-reg: '%5' } + - { reg: '$sgpr3', virtual-reg: '%6' } +body: | + bb.1: + liveins: $sgpr0_sgpr1 + + %3:_(p4) = COPY $sgpr0_sgpr1 + %19:_(s64) = G_CONSTANT i64 0 + + bb.2: + successors: %bb.3, %bb.2 + + %10:_(s64) = G_PHI %11(s64), %bb.2, %19(s64), %bb.1 + %24:_(s1) = G_CONSTANT i1 false + %11:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %24(s1), %10(s64) + ; CHECK: DIVERGENT: SI_LOOP + SI_LOOP %11(s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.3 + + bb.3: + ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI + ; CHECK-NOT: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI + %13:_(s64) = G_PHI %11(s64), %bb.2 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %13(s64) + %14:_(p4) = COPY %3(p4) + %15:_(s64) = G_CONSTANT i64 40 + %16:_(p4) = G_PTR_ADD %14, %15(s64) + %17:_(s32) = G_LOAD %16(p4) :: (dereferenceable invariant load (s32), addrspace 4) + %25:_(s32) = G_CONSTANT i32 0 + %18:_(s1) = G_ICMP intpred(slt), %17(s32), %25 + + bb.4: + successors: %bb.5, %bb.4 + + G_BRCOND %18(s1), %bb.4 + G_BR %bb.5 + + bb.5: + S_ENDPGM 0 + +... 
+--- +# CHECK-LABEL: MachineUniformityInfo for function: temporal_diverge_loopuser_nested +name: temporal_diverge_loopuser_nested +alignment: 1 +legalized: true +tracksRegLiveness: true +registers: + - { id: 3, class: _ } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sgpr_32 } +liveins: + - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' } + - { reg: '$vgpr0', virtual-reg: '%4' } + - { reg: '$sgpr2', virtual-reg: '%5' } + - { reg: '$sgpr3', virtual-reg: '%6' } +body: | + bb.1: + liveins: $sgpr0_sgpr1 + + %3:_(p4) = COPY $sgpr0_sgpr1 + %7:_(p4) = COPY %3(p4) + %8:_(s64) = G_CONSTANT i64 40 + %9:_(p4) = G_PTR_ADD %7, %8(s64) + %10:_(s32) = G_LOAD %9(p4) :: (dereferenceable invariant load (s32), addrspace 4) + %12:_(s32) = G_CONSTANT i32 0 + %13:_(s1) = G_ICMP intpred(sge), %10(s32), %12 + + bb.2: + %23:_(s64) = G_CONSTANT i64 0 + + bb.3: + successors: %bb.4, %bb.3 + + %15:_(s64) = G_PHI %23(s64), %bb.2, %16(s64), %bb.3 + %25:_(s1) = G_CONSTANT i1 false + %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %25(s1), %15(s64) + ; CHECK: DIVERGENT: SI_LOOP + SI_LOOP %16(s64), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.4 + + bb.4: + ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI + %18:_(s64) = G_PHI %16(s64), %bb.3 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64) + + bb.5: + + bb.6: + successors: %bb.8, %bb.5 + + G_BRCOND %13(s1), %bb.8 + G_BR %bb.5 + + bb.7: + S_ENDPGM 0 + + bb.8: + successors: %bb.7, %bb.2 + + %24:_(s1) = G_CONSTANT i1 false + G_BRCOND %24(s1), %bb.7 + G_BR %bb.2 + +... 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir rename from llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge.mir rename to llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir @@ -1,23 +1,18 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s +# This test was generated using SelectionDAG, where the compilation flow does +# not match the assumptions made in MachineUA. For now, this test mostly serves +# the purpose of catching any crash when invoking MachineUA. The test should +# be deleted when it is clear that it is not actually testing anything useful. + --- # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vgpr_32(s32) = COPY $vgpr0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_64 = V_CMP_GT_I32_e64 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_64 = V_CMP_LT_I32_e64 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vreg_1 = COPY -# CHECK: DIVERGENT: %{{[0-9]*}}:sreg_64 = SI_IF -# CHECK: DIVERGENT: S_BRANCH %bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_32 = PHI %{{[0-9]*}}:sreg_32, %bb.0, %{{[0-9]*}}:sreg_32, %bb.1 # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vreg_1 = PHI %{{[0-9]*}}:vreg_1, %bb.0, %{{[0-9]*}}:sreg_64, %bb.1 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_64 = COPY %{{[0-9]*}}:vreg_1 -# CHECK: DIVERGENT: %{{[0-9]*}}:sreg_64 = SI_IF %{{[0-9]*}}:sreg_64, %bb.4 -# CHECK: DIVERGENT: S_BRANCH %bb.3 # CHECK-LABEL: BLOCK bb.3 # CHECK-LABEL: BLOCK bb.4 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vgpr_32 = PHI %{{[0-9]*}}:sreg_32, %bb.2, %{{[0-9]*}}:sreg_32, %bb.3 name: 
hidden_diverge tracksRegLiveness: true diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/irreducible-1.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/irreducible-1.mir rename from llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/irreducible-1.mir rename to llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/irreducible-1.mir --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/irreducible-1.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/irreducible-1.mir @@ -1,11 +1,11 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s +# This test was generated using SelectionDAG, where the compilation flow does +# not match the assumptions made in MachineUA. For now, this test mostly serves +# the purpose of catching any crash when invoking MachineUA. The test should +# be deleted when it is clear that it is not actually testing anything useful. + # CHECK-LABEL: MachineUniformityInfo for function: irreducible -# CHECK: CYCLES ASSSUMED DIVERGENT: -# CHECK: depth=1: entries(bb.2 bb.1) bb.3 bb.5 bb.4 -# CHECK: CYCLES WITH DIVERGENT EXIT: -# CHECK-DAG: depth=1: entries(bb.2 bb.1) bb.3 bb.5 bb.4 -# CHECK-DAG: depth=2: entries(bb.3 bb.1) bb.5 bb.4 --- name: irreducible diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/temporal_diverge.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/temporal_diverge.ll --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/temporal_diverge.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/temporal_diverge.ll @@ -101,10 +101,12 @@ ; CHECK: DIVERGENT: br i1 %div.exitx, X: +; CHECK: DIVERGENT: %div.user = %div.user = add i32 %uni.inc, 5 br i1 %uni.cond, label %G, label %Y Y: +; CHECK: DIVERGENT: %div.alsouser = %div.alsouser = add i32 %uni.inc, 5 ret void } @@ -128,7 +130,7 @@ H: %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ] %uni.inc = add i32 %uni.merge.h, 1 - br i1 %uni.cond, label %X, label %H ; divergent 
branch + br i1 %uni.cond, label %X, label %H X: %uni.user = add i32 %uni.inc, 5 @@ -167,6 +169,7 @@ br label %G G: +; COM: CHECK: DIVERGENT: %div.user = %div.user = add i32 %uni.inc, 5 br i1 %uni.cond, label %G, label %Y ; CHECK: DIVERGENT: %div.user = @@ -175,7 +178,8 @@ ret void } -; temporal-divergent use of value carried by divergent loop, user is inside sibling loop, defs and use are carried by a uniform loop +; temporal-divergent use of value carried by divergent loop, user is inside +; sibling loop, defs and use are carried by a uniform loop define amdgpu_kernel void @temporal_diverge_loopuser_nested(i32 %n, i32 %a, i32 %b) #0 { ; CHECK-LABEL: for function 'temporal_diverge_loopuser_nested': ; CHECK-NOT: DIVERGENT: %uni.