diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5282,7 +5282,7 @@
       .addReg(Exec)
       .addReg(SaveExec);
 
-  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
 }
 
 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -312,6 +312,14 @@
   let hasSideEffects = 1;
 }
 
+def SI_WATERFALL_LOOP : CFPseudoInstSI <
+  (outs),
+  (ins brtarget:$target), [], 1> {
+  let Size = 8;
+  let isBranch = 1;
+  let Defs = [];
+}
+
 def SI_LOOP : CFPseudoInstSI <
   (outs), (ins SReg_1:$saved, brtarget:$target),
   [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> {
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -91,6 +91,7 @@
   void emitElse(MachineInstr &MI);
   void emitIfBreak(MachineInstr &MI);
   void emitLoop(MachineInstr &MI);
+  void emitWaterfallLoop(MachineInstr &MI);
 
   MachineBasicBlock *emitEndCf(MachineInstr &MI);
 
@@ -418,6 +419,24 @@
   MI.eraseFromParent();
 }
 
+/// Lower the SI_WATERFALL_LOOP pseudo \p MI to an S_CBRANCH_EXECNZ that
+/// branches to the pseudo's single explicit operand, its branch target.
+void SILowerControlFlow::emitWaterfallLoop(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  MachineInstr *Branch =
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+          .add(MI.getOperand(0));
+
+  // ReplaceMachineInstrInMaps already registers Branch in MI's place, so a
+  // further InsertMachineInstrInMaps(*Branch) would register it a second time.
+  if (LIS)
+    LIS->ReplaceMachineInstrInMaps(MI, *Branch);
+
+  MI.eraseFromParent();
+}
+
 MachineBasicBlock::iterator
 SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {
@@ -600,6 +619,10 @@
     emitLoop(MI);
     break;
 
+  case AMDGPU::SI_WATERFALL_LOOP:
+    emitWaterfallLoop(MI);
+    break;
+
   case AMDGPU::SI_END_CF:
     SplitBB = emitEndCf(MI);
     break;
@@ -843,6 +866,7 @@
 
       case AMDGPU::SI_ELSE:
       case AMDGPU::SI_IF_BREAK:
+      case AMDGPU::SI_WATERFALL_LOOP:
       case AMDGPU::SI_LOOP:
       case AMDGPU::SI_END_CF:
         // Only build worklist if SI_IF instructions must be processed first.
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -171,6 +171,7 @@
         if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
             MI.getOpcode() == AMDGPU::SI_IF ||
             MI.getOpcode() == AMDGPU::SI_ELSE ||
+            MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP ||
             MI.getOpcode() == AMDGPU::SI_LOOP) {
           Divergent = true;
           break;
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
@@ -30,7 +30,7 @@
 # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
 # W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
 # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
-# W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
+# W64: SI_WATERFALL_LOOP %bb.1, implicit $exec
 # W64-LABEL: bb.2:
 # W64: $exec = S_MOV_B64 [[SAVEEXEC]]
 
@@ -55,7 +55,7 @@
 # W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
 # TODO: S_XOR_B32_term should be `implicit-def $scc`
 # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
-# W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
+# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec
 # W32-LABEL: bb.2:
 # W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]]
 ---
@@ -103,7 +103,7 @@
 # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
 # W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
 # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
-# W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
+# W64: SI_WATERFALL_LOOP %bb.1, implicit $exec
 # W64-LABEL: bb.2:
 # W64: $exec = S_MOV_B64 [[SAVEEXEC]]
 
@@ -128,7 +128,7 @@
 # W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
 # TODO: S_XOR_B32_term should be `implicit-def $scc`
 # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
-# W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
+# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec
 # W32-LABEL: bb.2:
 # W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]]
 ---
@@ -176,7 +176,7 @@
 # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
 # W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
 # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
-# W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
+# W64: SI_WATERFALL_LOOP %bb.1, implicit $exec
 # W64-LABEL: bb.2:
 # W64: $exec = S_MOV_B64 [[SAVEEXEC]]
 
@@ -201,7 +201,7 @@
 # W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
 # TODO: S_XOR_B32_term should be `implicit-def $scc`
 # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
-# W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
+# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec
 # W32-LABEL: bb.2:
 # W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]]
 ---
@@ -286,7 +286,7 @@
 # W64-NO-ADDR64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
 # W64-NO-ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
 # W64-NO-ADDR64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
-# W64-NO-ADDR64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
+# W64-NO-ADDR64: SI_WATERFALL_LOOP %bb.1, implicit $exec
 # W64-NO-ADDR64-LABEL: bb.2:
 # W64-NO-ADDR64: $exec = S_MOV_B64 [[SAVEEXEC]]
 
@@ -309,7 +309,7 @@
 # W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
 # TODO: S_XOR_B32_term should be `implicit-def $scc`
 # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
-# W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
+# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec
 # W32-LABEL: bb.2:
 # W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]]
 