# Patch summary (descriptive text only; everything before the first "Index:"
# header is not part of any hunk).
#
# Files touched:
#   * llvm/lib/Target/AMDGPU/SIFrameLowering.cpp         (modified, 6 hunks)
#   * llvm/test/CodeGen/AMDGPU/frame-lowering-exec.ll    (new file, 89 lines)
#
# SIFrameLowering.cpp:
#   * Adds a file-local helper, isWave32(const GCNSubtarget &), that returns
#     true only when ST.getGeneration() >= AMDGPUSubtarget::GFX10 and
#     ST.hasFeature(AMDGPU::FeatureWavefrontSize32) both hold.
#   * buildScratchExecCopy: the scratch register for the saved exec is now
#     looked up in TRI.getWaveMaskRegClass() when isWave32(ST) is true and in
#     AMDGPU::SReg_64RegClass otherwise (it was always
#     TRI.getWaveMaskRegClass() before). The S_XOR/S_OR_SAVEEXEC_B32 versus
#     _B64 opcode choice is switched from ST.isWave32() to the helper.
#   * The four places around StoreWWMRegisters / RestoreWWMRegisters that pick
#     S_MOV_B32 + EXEC_LO versus S_MOV_B64 + EXEC are switched from
#     ST.isWave32() to the helper.
#
# frame-lowering-exec.ll:
#   * Runs llc for gfx90a (check prefix GFX9) and gfx1010 (check prefix GFX10).
#   * f0 calls f1; the checks for f0 expect s_xor_saveexec_b64 / s_mov_b64 exec
#     on GFX9 and s_xor_saveexec_b32 / s_mov_b32 exec_lo on GFX10 around the
#     spill and reload of v4.
#
# NOTE(review): the definition of ST.isWave32() is not visible in this patch;
#   presumably the new helper is the stricter predicate -- confirm, and
#   consider a comment on the helper saying why the two differ.
# NOTE(review): the non-wave32 path now uses SReg_64RegClass instead of the
#   class returned by getWaveMaskRegClass() -- confirm this is intended.
# NOTE(review): line breaks and leading indentation of the C++ hunks below
#   were reconstructed following LLVM formatting conventions -- verify against
#   the target file (or apply with whitespace-insensitive matching).
Index: llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -865,6 +865,11 @@
   llvm_unreachable("Invalid TargetStackID::Value");
 }
 
+static bool isWave32(const GCNSubtarget &ST) {
+  return (ST.getGeneration() >= AMDGPUSubtarget::GFX10) &&
+         ST.hasFeature(AMDGPU::FeatureWavefrontSize32);
+}
+
 // Activate only the inactive lanes when \p EnableInactiveLanes is true.
 // Otherwise, activate all lanes. It returns the saved exec.
 static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
@@ -882,18 +887,19 @@
 
   initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
 
-  ScratchExecCopy = findScratchNonCalleeSaveRegister(
-      MRI, LiveRegs, *TRI.getWaveMaskRegClass());
+  const TargetRegisterClass *TRC =
+      isWave32(ST) ? TRI.getWaveMaskRegClass() : &AMDGPU::SReg_64RegClass;
+  ScratchExecCopy = findScratchNonCalleeSaveRegister(MRI, LiveRegs, *TRC);
   if (!ScratchExecCopy)
     report_fatal_error("failed to find free scratch register");
 
   LiveRegs.addReg(ScratchExecCopy);
 
   const unsigned SaveExecOpc =
-      ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
-                                           : AMDGPU::S_OR_SAVEEXEC_B32)
-                    : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
-                                           : AMDGPU::S_OR_SAVEEXEC_B64);
+      isWave32(ST) ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
+                                          : AMDGPU::S_OR_SAVEEXEC_B32)
+                   : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
+                                          : AMDGPU::S_OR_SAVEEXEC_B64);
   auto SaveExec =
       BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
   SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
@@ -934,8 +940,8 @@
   StoreWWMRegisters(WWMScratchRegs);
   if (!WWMCalleeSavedRegs.empty()) {
     if (ScratchExecCopy) {
-      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-      MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+      unsigned MovOpc = isWave32(ST) ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      MCRegister Exec = isWave32(ST) ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
       BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
     } else {
       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
@@ -947,8 +953,8 @@
   StoreWWMRegisters(WWMCalleeSavedRegs);
   if (ScratchExecCopy) {
     // FIXME: Split block and make terminator.
-    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    unsigned ExecMov = isWave32(ST) ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    MCRegister Exec = isWave32(ST) ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
         .addReg(ScratchExecCopy, RegState::Kill);
     LiveRegs.addReg(ScratchExecCopy);
@@ -1039,8 +1045,8 @@
   RestoreWWMRegisters(WWMScratchRegs);
   if (!WWMCalleeSavedRegs.empty()) {
     if (ScratchExecCopy) {
-      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-      MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+      unsigned MovOpc = isWave32(ST) ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      MCRegister Exec = isWave32(ST) ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
       BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
     } else {
       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
@@ -1052,8 +1058,8 @@
   RestoreWWMRegisters(WWMCalleeSavedRegs);
   if (ScratchExecCopy) {
     // FIXME: Split block and make terminator.
-    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    unsigned ExecMov = isWave32(ST) ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    MCRegister Exec = isWave32(ST) ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
         .addReg(ScratchExecCopy, RegState::Kill);
   }
Index: llvm/test/CodeGen/AMDGPU/frame-lowering-exec.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/frame-lowering-exec.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+
+define <2 x i64> @f1() #0 {
+; GFX9-LABEL: f1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+  ret <2 x i64> zeroinitializer
+}
+
+define void @f0() {
+; GFX9-LABEL: f0:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s18, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[16:17]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[16:17]
+; GFX9-NEXT: s_add_u32 s16, s16, f1@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s17, s17, f1@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT: v_writelane_b32 v4, s30, 0
+; GFX9-NEXT: v_writelane_b32 v4, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: v_readlane_b32 s31, v4, 1
+; GFX9-NEXT: v_readlane_b32 s30, v4, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s18
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f0:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_mov_b32 s18, s33
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
+; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s16
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_getpc_b64 s[16:17]
+; GFX10-NEXT: s_add_u32 s16, s16, f1@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s17, s17, f1@gotpcrel32@hi+12
+; GFX10-NEXT: v_writelane_b32 v4, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX10-NEXT: v_writelane_b32 v4, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s31, v4, 1
+; GFX10-NEXT: v_readlane_b32 s30, v4, 0
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0xfe00
+; GFX10-NEXT: s_mov_b32 s33, s18
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+  %i = call <2 x i64> @f1()
+  ret void
+}
+
+attributes #0 = { noinline optnone }