Index: llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -77,6 +77,32 @@
   return false;
 }
 
+// See if there is a def between \p AndIdx and \p SelIdx that needs to live
+// beyond \p AndIdx.
+static bool isDefBetween(const LiveRange &LR, SlotIndex AndIdx,
+                         SlotIndex SelIdx) {
+  LiveQueryResult AndLRQ = LR.Query(AndIdx);
+  return (!AndLRQ.isKill() && AndLRQ.valueIn() != LR.Query(SelIdx).valueOut());
+}
+
+// FIXME: Why do we bother trying to handle physical registers here?
+static bool isDefBetween(const SIRegisterInfo &TRI,
+                         LiveIntervals *LIS, Register Reg,
+                         const MachineInstr &Sel, const MachineInstr &And) {
+  SlotIndex AndIdx = LIS->getInstructionIndex(And);
+  SlotIndex SelIdx = LIS->getInstructionIndex(Sel);
+
+  if (Reg.isVirtual())
+    return isDefBetween(LIS->getInterval(Reg), AndIdx, SelIdx);
+
+  for (MCRegUnitIterator UI(Reg, &TRI); UI.isValid(); ++UI) {
+    if (isDefBetween(LIS->getRegUnit(*UI), AndIdx, SelIdx))
+      return true;
+  }
+
+  return false;
+}
+
 // Optimize sequence
 //    %sel = V_CNDMASK_B32_e64 0, 1, %cc
 //    %cmp = V_CMP_NE_U32 1, %1
@@ -158,10 +184,16 @@
       Op1->getImm() != 0 || Op2->getImm() != 1)
     return AMDGPU::NoRegister;
 
+  Register CCReg = CC->getReg();
+
+  // If there was a def between the select and the and, we would need to move it
+  // to fold this.
+  if (isDefBetween(*TRI, LIS, CCReg, *Sel, *And))
+    return AMDGPU::NoRegister;
+
   LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t'
                     << *And);
 
-  Register CCReg = CC->getReg();
   LIS->RemoveMachineInstrFromMaps(*And);
   MachineInstr *Andn2 =
       BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc),
Index: llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir
@@ -0,0 +1,208 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -run-pass=si-optimize-exec-masking-pre-ra,si-optimize-exec-masking-pre-ra -o - %s | FileCheck %s

+# FIXME: Second run of the pass is a workaround for a bug in
+# -run-pass. The verifier doesn't detect broken LiveIntervals, see bug
+# 46873
+
+
+# Cannot fold this without moving the def of %7 after the and.
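+# %7 is redefined by the S_MOV_B64 0 between the V_CNDMASK_B32 (the select)
+# and the S_AND_B64 (the and), so an S_ANDN2_B64 rewritten in place of the
+# and would read the wrong value of %7.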
+---
+name: no_fold_andn2_select_condition_live_out_phi
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: no_fold_andn2_select_condition_live_out_phi
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.2(0x80000000)
+  ; CHECK: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
+  ; CHECK: undef %1.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK: S_BRANCH %bb.2
+  ; CHECK: bb.1:
+  ; CHECK: S_ENDPGM 0
+  ; CHECK: bb.2:
+  ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec
+  ; CHECK: V_CMP_NE_U32_e32 1, [[V_CNDMASK_B32_e64_]], implicit-def $vcc, implicit $exec
+  ; CHECK: %1.sub1:vreg_64 = COPY %1.sub0
+  ; CHECK: DS_WRITE_B64_gfx9 undef %3:vgpr_32, %1, 0, 0, implicit $exec :: (store 8, addrspace 3)
+  ; CHECK: ATOMIC_FENCE 4, 2
+  ; CHECK: [[S_MOV_B64_1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+  ; CHECK: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
+  ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+  ; CHECK: S_BRANCH %bb.2
+  bb.0:
+    successors: %bb.2
+
+    %7:sreg_64_xexec = S_MOV_B64 -1
+    undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_ENDPGM 0
+
+  bb.2:
+    successors: %bb.1, %bb.2
+
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %7, implicit $exec
+    V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec
+    %5.sub1:vreg_64 = COPY %5.sub0
+    DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
+    ATOMIC_FENCE 4, 2
+    %7:sreg_64_xexec = S_MOV_B64 0
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+    S_BRANCH %bb.2
+
+...
+
+# It's OK to fold this, since the phi def is after the andn2 insert point.
+---
+name: fold_andn2_select_condition_live_out_phi_reorder
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: fold_andn2_select_condition_live_out_phi_reorder
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.2(0x80000000)
+  ; CHECK: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
+  ; CHECK: undef %1.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK: S_BRANCH %bb.2
+  ; CHECK: bb.1:
+  ; CHECK: S_ENDPGM 0
+  ; CHECK: bb.2:
+  ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK: %1.sub1:vreg_64 = COPY %1.sub0
+  ; CHECK: DS_WRITE_B64_gfx9 undef %3:vgpr_32, %1, 0, 0, implicit $exec :: (store 8, addrspace 3)
+  ; CHECK: ATOMIC_FENCE 4, 2
+  ; CHECK: $vcc = S_ANDN2_B64 $exec, [[S_MOV_B64_]], implicit-def dead $scc
+  ; CHECK: [[S_MOV_B64_1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+  ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+  ; CHECK: S_BRANCH %bb.2
+  bb.0:
+    successors: %bb.2
+
+    %7:sreg_64_xexec = S_MOV_B64 -1
+    undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_ENDPGM 0
+
+  bb.2:
+    successors: %bb.1, %bb.2
+
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %7, implicit $exec
+    V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec
+    %5.sub1:vreg_64 = COPY %5.sub0
+    DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
+    ATOMIC_FENCE 4, 2
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    %7:sreg_64_xexec = S_MOV_B64 0
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+    S_BRANCH %bb.2
+
+...
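+
+# The same two cases, but with the select condition in a physical register
+# ($sgpr4_sgpr5). The pass checks physical registers through their register
+# units; see the FIXME on isDefBetween about handling physical registers.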
+
+---
+name: no_fold_andn2_select_condition_live_out_phi_physreg
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: no_fold_andn2_select_condition_live_out_phi_physreg
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.2(0x80000000)
+  ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 -1
+  ; CHECK: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK: S_BRANCH %bb.2
+  ; CHECK: bb.1:
+  ; CHECK: S_ENDPGM 0
+  ; CHECK: bb.2:
+  ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK: liveins: $sgpr4_sgpr5
+  ; CHECK: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr4_sgpr5, implicit $exec
+  ; CHECK: V_CMP_NE_U32_e32 1, [[V_CNDMASK_B32_e64_]], implicit-def $vcc, implicit $exec
+  ; CHECK: %0.sub1:vreg_64 = COPY %0.sub0
+  ; CHECK: DS_WRITE_B64_gfx9 undef %2:vgpr_32, %0, 0, 0, implicit $exec :: (store 8, addrspace 3)
+  ; CHECK: ATOMIC_FENCE 4, 2
+  ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 0
+  ; CHECK: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
+  ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+  ; CHECK: S_BRANCH %bb.2
+  bb.0:
+    successors: %bb.2
+
+    $sgpr4_sgpr5 = S_MOV_B64 -1
+    undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_ENDPGM 0
+
+  bb.2:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr4_sgpr5
+
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr4_sgpr5, implicit $exec
+    V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec
+    %5.sub1:vreg_64 = COPY %5.sub0
+    DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
+    ATOMIC_FENCE 4, 2
+    $sgpr4_sgpr5 = S_MOV_B64 0
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+    S_BRANCH %bb.2
+
+...
+
+---
+name: fold_andn2_select_condition_live_out_phi_physreg_reorder
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: fold_andn2_select_condition_live_out_phi_physreg_reorder
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.2(0x80000000)
+  ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 -1
+  ; CHECK: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK: S_BRANCH %bb.2
+  ; CHECK: bb.1:
+  ; CHECK: S_ENDPGM 0
+  ; CHECK: bb.2:
+  ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK: liveins: $sgpr4_sgpr5
+  ; CHECK: %0.sub1:vreg_64 = COPY %0.sub0
+  ; CHECK: DS_WRITE_B64_gfx9 undef %2:vgpr_32, %0, 0, 0, implicit $exec :: (store 8, addrspace 3)
+  ; CHECK: ATOMIC_FENCE 4, 2
+  ; CHECK: $vcc = S_ANDN2_B64 $exec, $sgpr4_sgpr5, implicit-def dead $scc
+  ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 0
+  ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+  ; CHECK: S_BRANCH %bb.2
+  bb.0:
+    successors: %bb.2
+
+    $sgpr4_sgpr5 = S_MOV_B64 -1
+    undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_ENDPGM 0
+
+  bb.2:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr4_sgpr5
+
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr4_sgpr5, implicit $exec
+    V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec
+    %5.sub1:vreg_64 = COPY %5.sub0
+    DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
+    ATOMIC_FENCE 4, 2
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    $sgpr4_sgpr5 = S_MOV_B64 0
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+    S_BRANCH %bb.2
+
+...
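
Reviewer note: below is a minimal standalone sketch of the intervening-def check this patch adds. It models slot indexes as plain unsigned integers and a register's defs as a sorted vector; DefIndexes, SelIdx, AndIdx, and the slot numbers are invented for illustration and are not the LLVM LiveIntervals API. The real isDefBetween() also permits the fold when the and kills the register (the !AndLRQ.isKill() clause), a case this sketch ignores.

// Sketch only: approximates the patch's isDefBetween() with integer slot
// indexes instead of LiveIntervals live-range queries.
#include <algorithm>
#include <cassert>
#include <vector>

// Sorted slot indexes at which the register of interest is defined.
using DefIndexes = std::vector<unsigned>;

// True if the register is redefined after the select but before the and,
// i.e. the value the and would read differs from the one the select used.
static bool isDefBetween(const DefIndexes &Defs, unsigned SelIdx,
                         unsigned AndIdx) {
  // Find the first def strictly after the select.
  auto It = std::upper_bound(Defs.begin(), Defs.end(), SelIdx);
  // A def before the and clobbers the condition the select consumed.
  return It != Defs.end() && *It < AndIdx;
}

int main() {
  // no_fold_andn2_select_condition_live_out_phi: %7 is defined by
  // S_MOV_B64 -1 (slot 10) and redefined by S_MOV_B64 0 (slot 60), which
  // falls between the V_CNDMASK_B32 (slot 30) and the S_AND_B64 (slot 70),
  // so the fold must be rejected.
  assert(isDefBetween({10, 60}, /*SelIdx=*/30, /*AndIdx=*/70));

  // fold_andn2_select_condition_live_out_phi_reorder: the S_MOV_B64 0 now
  // follows the and (slot 80), so the fold is safe.
  assert(!isDefBetween({10, 80}, /*SelIdx=*/30, /*AndIdx=*/70));
  return 0;
}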