diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2142,6 +2142,12 @@
     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
     break;
   }
+  case AMDGPU::ENTER_PSEUDO_WM:
+  case AMDGPU::EXIT_PSEUDO_WM: {
+    // These do nothing.
+    MI.eraseFromParent();
+    break;
+  }
   case AMDGPU::SI_RETURN: {
     const MachineFunction *MF = MBB.getParent();
     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -188,6 +188,21 @@
   let mayStore = 0;
 }
 
+// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
+def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
+  let Uses = [EXEC];
+  let Defs = [EXEC];
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+}
+
+def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+}
+
 // Pseudo instructions used for @llvm.fptrunc.round upward
 // and @llvm.fptrunc.round downward.
 // These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -163,15 +163,19 @@
 
   unsigned Opc = MI.getOpcode();
 
-  if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) {
+  if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM ||
+      Opc == AMDGPU::ENTER_PSEUDO_WM) {
     dbgs() << "Entering ";
   } else {
-    assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM);
+    assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM ||
+           Opc == AMDGPU::EXIT_PSEUDO_WM);
     dbgs() << "Exiting ";
   }
 
   if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) {
     dbgs() << "Strict WWM ";
+  } else if (Opc == AMDGPU::ENTER_PSEUDO_WM || Opc == AMDGPU::EXIT_PSEUDO_WM) {
+    dbgs() << "Pseudo WWM/WQM ";
   } else {
     assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM);
     dbgs() << "Strict WQM ";
@@ -214,14 +218,16 @@
         RegsAssigned |= processDef(MI.getOperand(0));
 
       if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
-          MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
+          MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM ||
+          MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) {
         LLVM_DEBUG(printWWMInfo(MI));
         InWWM = true;
         continue;
       }
 
       if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
-          MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
+          MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM ||
+          MI.getOpcode() == AMDGPU::EXIT_PSEUDO_WM) {
         LLVM_DEBUG(printWWMInfo(MI));
         InWWM = false;
       }
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -215,6 +215,8 @@
   MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
                             bool IsWQM);
   MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
+  void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
+                             MachineInstr *Exit);
 
   void lowerBlock(MachineBasicBlock &MBB);
   void processBlock(MachineBasicBlock &MBB, bool IsEntry);
@@ -1040,6 +1042,31 @@
   return NewTerm;
 }
 
+// Convert a strict mode transition to a pseudo transition.
+// This still pre-allocates registers to prevent clobbering,
+// but avoids any EXEC mask changes.
+void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
+                                            MachineInstr *Entry,
+                                            MachineInstr *Exit) {
+  assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
+  assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
+
+  Register SaveOrig = Entry->getOperand(0).getReg();
+
+  MachineInstr *NewEntry =
+      BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
+  MachineInstr *NewExit =
+      BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
+
+  LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
+  Exit->eraseFromParent();
+
+  LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
+  Entry->eraseFromParent();
+
+  LIS->removeInterval(SaveOrig);
+}
+
 // Replace (or supplement) instructions accessing live mask.
 // This can only happen once all the live mask registers have been created
 // and the execute state (WQM/StrictWWM/Exact) of instructions is known.
@@ -1056,9 +1083,12 @@
 
   SmallVector<MachineInstr *, 4> SplitPoints;
   char State = BI.InitialState;
+  MachineInstr *StrictEntry = nullptr;
 
   for (MachineInstr &MI : llvm::make_early_inc_range(
            llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
+    char PreviousState = State;
+
     if (StateTransition.count(&MI))
       State = StateTransition[&MI];
 
@@ -1071,6 +1101,20 @@
     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
       SplitPoint = lowerKillF32(MBB, MI);
       break;
+    case AMDGPU::ENTER_STRICT_WQM:
+      StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
+      break;
+    case AMDGPU::EXIT_STRICT_WQM:
+      if (State == StateWQM && StrictEntry) {
+        // Transition WQM -> StrictWQM -> WQM detected.
+        lowerPseudoStrictMode(MBB, StrictEntry, &MI);
+      }
+      StrictEntry = nullptr;
+      break;
+    case AMDGPU::ENTER_STRICT_WWM:
+    case AMDGPU::EXIT_STRICT_WWM:
+      StrictEntry = nullptr;
+      break;
     default:
       break;
     }
@@ -1213,7 +1257,12 @@
              .addImm(-1);
   }
   LIS->InsertMachineInstrInMaps(*MI);
-  StateTransition[MI] = StateStrictWWM;
+  StateTransition[MI] = StrictStateNeeded;
+
+  // Mark block as needing lower so it will be checked for unnecessary transitions.
+  auto BII = Blocks.find(&MBB);
+  if (BII != Blocks.end())
+    BII->second.NeedsLowering = true;
 }
 
 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -2820,24 +2820,18 @@
 ; GFX9-W64:       ; %bb.0: ; %main_body
 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
-; GFX9-W64-NEXT:    s_mov_b64 s[14:15], exec
-; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-W64-NEXT:    s_mov_b64 exec, s[14:15]
 ; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB46_2
 ; GFX9-W64-NEXT:  ; %bb.1: ; %IF
-; GFX9-W64-NEXT:    s_mov_b64 s[16:17], exec
-; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
-; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
 ; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v0, v0
@@ -2850,24 +2844,18 @@
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
-; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s13
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
 ; GFX10-W32-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB46_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %IF
-; GFX10-W32-NEXT:    s_mov_b32 s14, exec_lo
-; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
-; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
 ; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v0, v0
@@ -3150,10 +3138,8 @@
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 idxen
-; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
+; GFX9-W64-NEXT:    s_nop 0
 ; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
-; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
@@ -3194,11 +3180,9 @@
 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
 ; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT:    s_clause 0x1
 ; GFX10-W32-NEXT:    buffer_load_dword v0, v3, s[16:19], 0 idxen
-; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
-; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
-; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)