diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -382,6 +382,19 @@ /// to which instructions should be sunk. virtual bool shouldSink(const MachineInstr &MI) const { return true; } + /// Return an iterator to the appropriate position for inserting instructions + /// to be sunk into a given basic block (by default, just past any PHIs). + /// + /// Override this if the target has instructions which must remain at the + /// beginning of a block. + virtual MachineBasicBlock::iterator + sinkPosition(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator InsertPos = MBB.begin(); + while (InsertPos != MBB.end() && InsertPos->isPHI()) + ++InsertPos; + return InsertPos; + } + + /// Return false if the instruction should not be hoisted by MachineLICM. /// /// MachineLICM determines on its own whether the instruction is safe to diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -1392,10 +1392,8 @@ return false; } - // Determine where to insert into. Skip phi nodes. - MachineBasicBlock::iterator InsertPos = SuccToSinkTo->begin(); - while (InsertPos != SuccToSinkTo->end() && InsertPos->isPHI()) - ++InsertPos; + // Determine where to insert into (target hook; skips PHIs by default). + MachineBasicBlock::iterator InsertPos = TII->sinkPosition(*SuccToSinkTo); // Collect debug users of any vreg that this inst defines. 
SmallVector DbgUsersToSink; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -188,6 +188,9 @@ bool isIgnorableUse(const MachineOperand &MO) const override; + MachineBasicBlock::iterator + sinkPosition(MachineBasicBlock &MBB) const override; + bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, int64_t &Offset2) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -172,6 +172,17 @@ isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent()); } +MachineBasicBlock::iterator +SIInstrInfo::sinkPosition(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator InsertPos = TargetInstrInfo::sinkPosition(MBB); + // The insertion point must be after any leading control flow instructions + // which modify EXEC (stopping at the first terminator). + while (InsertPos != MBB.end() && !InsertPos->isTerminator() && + InsertPos->modifiesRegister(AMDGPU::EXEC, &RI)) + ++InsertPos; + return InsertPos; +} + bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const { diff --git a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir @@ -0,0 +1,185 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX10 %s + +--- +name: _amdgpu_hs_main +alignment: 1 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GFX10-LABEL: name: _amdgpu_hs_main + ; GFX10: bb.0: + ; GFX10-NEXT: successors: %bb.1(0x40000000), 
%bb.2(0x40000000) + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX10-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[S_MOV_B32_1]], [[S_MOV_B32_]], implicit $exec + ; GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 255 + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[DEF]], killed [[S_MOV_B32_2]], implicit-def dead $scc + ; GFX10-NEXT: [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], killed [[S_AND_B32_]], implicit $exec + ; GFX10-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_GE_U32_e64_]], -1, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_]], $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_AND_B32_1]], implicit-def $scc + ; GFX10-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_1]] + ; GFX10-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX10-NEXT: S_BRANCH %bb.1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: bb.1: + ; GFX10-NEXT: successors: %bb.2(0x80000000) + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: S_BRANCH %bb.2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: bb.2: + ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.8(0x40000000) + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc + ; GFX10-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[DEF]], 524296, implicit-def dead $scc + ; GFX10-NEXT: [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], killed [[S_BFE_U32_]], implicit $exec + ; GFX10-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_GE_U32_e64_1]], -1, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_2]], $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = 
S_XOR_B32 $exec_lo, [[S_AND_B32_2]], implicit-def $scc + ; GFX10-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_2]] + ; GFX10-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec + ; GFX10-NEXT: S_BRANCH %bb.3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: bb.3: + ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000) + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX10-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 8 + ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[DEF1]], 8, 5, implicit $exec + ; GFX10-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 5 + ; GFX10-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_4]], implicit $exec + ; GFX10-NEXT: [[S_XOR_B32_4:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], -1, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_4]], $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_XOR_B32_5:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_AND_B32_3]], implicit-def $scc + ; GFX10-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_3]] + ; GFX10-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec + ; GFX10-NEXT: S_BRANCH %bb.4 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: bb.4: + ; GFX10-NEXT: successors: %bb.5(0x80000000) + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: S_BRANCH %bb.5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: bb.5: + ; GFX10-NEXT: successors: %bb.6(0x40000000), %bb.7(0x40000000) + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_5]], implicit-def $scc + ; GFX10-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[S_MOV_B32_3]], [[DEF1]], implicit $exec + ; GFX10-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 31 + ; GFX10-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_5]], implicit $exec + ; GFX10-NEXT: [[S_XOR_B32_6:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_1]], -1, implicit-def $scc + ; GFX10-NEXT: 
[[S_AND_B32_4:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_6]], $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_XOR_B32_7:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_AND_B32_4]], implicit-def $scc + ; GFX10-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_4]] + ; GFX10-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec + ; GFX10-NEXT: S_BRANCH %bb.6 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: bb.6: + ; GFX10-NEXT: successors: %bb.7(0x80000000) + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: S_BRANCH %bb.7 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: bb.7: + ; GFX10-NEXT: successors: %bb.8(0x80000000) + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_7]], implicit-def $scc + ; GFX10-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; GFX10-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 4 + ; GFX10-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHL_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[S_MOV_B32_7]], killed [[S_MOV_B32_6]], implicit $exec + ; GFX10-NEXT: S_BRANCH %bb.8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: bb.8: + ; GFX10-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_3]], implicit-def $scc + ; GFX10-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + + %0:sgpr_32 = IMPLICIT_DEF + %1:sreg_32 = S_MOV_B32 0 + %2:sreg_32 = S_MOV_B32 -1 + %3:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed %2, %1, implicit $exec + %4:sreg_32 = S_MOV_B32 255 + %5:sreg_32 = S_AND_B32 %0, killed %4, implicit-def dead $scc + %6:sreg_32 = V_CMP_GE_U32_e64 %3, killed %5, implicit $exec + %7:sreg_32 = S_XOR_B32 %6, -1, implicit-def $scc + %8:sreg_32 = S_AND_B32 %7, $exec_lo, implicit-def $scc + %9:sreg_32 = S_XOR_B32 $exec_lo, %8, implicit-def $scc + $exec_lo = S_MOV_B32_term %8 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3(0x40000000), %bb.8(0x40000000) + + $exec_lo = S_OR_B32 $exec_lo, %9, implicit-def $scc + %10:sreg_32 = S_BFE_U32 %0, 524296, implicit-def 
dead $scc + %11:sreg_32 = V_CMP_GE_U32_e64 %3, killed %10, implicit $exec + %12:sreg_32 = S_XOR_B32 %11, -1, implicit-def $scc + %13:sreg_32 = S_AND_B32 %12, $exec_lo, implicit-def $scc + %14:sreg_32 = S_XOR_B32 $exec_lo, %13, implicit-def $scc + $exec_lo = S_MOV_B32_term %13 + S_CBRANCH_EXECZ %bb.8, implicit $exec + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4(0x40000000), %bb.5(0x40000000) + + %15:vgpr_32 = IMPLICIT_DEF + %16:sreg_32 = S_MOV_B32 8 + %17:vgpr_32 = V_LSHRREV_B32_e64 %16, %15, implicit $exec + %18:vgpr_32 = V_BFE_U32_e64 %15, 8, 5, implicit $exec + %19:sreg_32 = S_MOV_B32 5 + %20:sreg_32 = V_CMP_NE_U32_e64 %18, killed %19, implicit $exec + %21:sreg_32 = S_XOR_B32 %20, -1, implicit-def $scc + %22:sreg_32 = S_AND_B32 %21, $exec_lo, implicit-def $scc + %23:sreg_32 = S_XOR_B32 $exec_lo, %22, implicit-def $scc + $exec_lo = S_MOV_B32_term %22 + S_CBRANCH_EXECZ %bb.5, implicit $exec + S_BRANCH %bb.4 + + bb.4: + successors: %bb.5(0x80000000) + + S_BRANCH %bb.5 + + bb.5: + successors: %bb.6(0x40000000), %bb.7(0x40000000) + + $exec_lo = S_OR_B32 $exec_lo, %23, implicit-def $scc + %24:sreg_32 = S_MOV_B32 31 + %25:sreg_32 = V_CMP_NE_U32_e64 %18, killed %24, implicit $exec + %26:sreg_32 = S_XOR_B32 %25, -1, implicit-def $scc + %27:sreg_32 = S_AND_B32 %26, $exec_lo, implicit-def $scc + %28:sreg_32 = S_XOR_B32 $exec_lo, %27, implicit-def $scc + $exec_lo = S_MOV_B32_term %27 + S_CBRANCH_EXECZ %bb.7, implicit $exec + S_BRANCH %bb.6 + + bb.6: + successors: %bb.7(0x80000000) + + S_BRANCH %bb.7 + + bb.7: + successors: %bb.8(0x80000000) + + $exec_lo = S_OR_B32 $exec_lo, %28, implicit-def $scc + %29:sreg_32 = S_MOV_B32 16 + %30:sreg_32 = S_MOV_B32 4 + %31:vgpr_32 = nuw nsw V_LSHL_ADD_U32_e64 %17, %30, killed %29, implicit $exec + S_BRANCH %bb.8 + + bb.8: + $exec_lo = S_OR_B32 $exec_lo, %14, implicit-def $scc + S_ENDPGM 0 + +...