Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h =================================================================== --- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -68,8 +68,9 @@ MCOperand decodeOperand_VReg_128(unsigned Val) const; MCOperand decodeOperand_SReg_32(unsigned Val) const; - MCOperand decodeOperand_SReg_32_XM0(unsigned Val) const; + MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const; MCOperand decodeOperand_SReg_64(unsigned Val) const; + MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const; MCOperand decodeOperand_SReg_128(unsigned Val) const; MCOperand decodeOperand_SReg_256(unsigned Val) const; MCOperand decodeOperand_SReg_512(unsigned Val) const; Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp =================================================================== --- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -81,8 +81,9 @@ DECODE_OPERAND(VReg_128) DECODE_OPERAND(SReg_32) -DECODE_OPERAND(SReg_32_XM0) +DECODE_OPERAND(SReg_32_XM0_XEXEC) DECODE_OPERAND(SReg_64) +DECODE_OPERAND(SReg_64_XEXEC) DECODE_OPERAND(SReg_128) DECODE_OPERAND(SReg_256) DECODE_OPERAND(SReg_512) @@ -277,13 +278,17 @@ return decodeSrcOp(OPW32, Val); } -MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0(unsigned Val) const { - // SReg_32_XM0 is SReg_32 without M0 +MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0_XEXEC( + unsigned Val) const { + // SReg_32_XM0 is SReg_32 without M0 or EXEC_LO/EXEC_HI return decodeOperand_SReg_32(Val); } MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const { - // see decodeOperand_SReg_32 comment + return decodeSrcOp(OPW64, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_SReg_64_XEXEC(unsigned Val) const { return decodeSrcOp(OPW64, Val); } Index: lib/Target/AMDGPU/SIRegisterInfo.td 
=================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -256,20 +256,25 @@ // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. -def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, - (add SGPR_32, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI, +def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, + (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> { let AllocationPriority = 1; } +def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, + (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { + let AllocationPriority = 1; +} + // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, - (add SReg_32_XM0, M0_CLASS)> { + (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI)> { let AllocationPriority = 1; - let isAllocatable = 0; } def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> { + let CopyCost = 1; let AllocationPriority = 2; } @@ -277,8 +282,15 @@ let isAllocatable = 0; } +def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, + (add SGPR_64, VCC, FLAT_SCR)> { + let CopyCost = 1; + let AllocationPriority = 2; +} + def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, - (add SGPR_64, VCC, EXEC, FLAT_SCR, TTMP_64, TBA, TMA)> { + (add SReg_64_XEXEC, EXEC, TTMP_64, TBA, TMA)> { + let CopyCost = 1; let AllocationPriority = 2; } Index: lib/Target/AMDGPU/SMInstructions.td =================================================================== --- lib/Target/AMDGPU/SMInstructions.td +++ lib/Target/AMDGPU/SMInstructions.td @@ -126,7 +126,7 @@ } class SM_Time_Pseudo : SM_Pseudo< - opName, (outs SReg_64:$sdst), (ins), + opName, (outs SReg_64_XEXEC:$sdst), (ins), " $sdst", [(set i64:$sdst, (node))]> { 
let hasSideEffects = 1; // FIXME: mayStore = ? is a workaround for tablegen bug for different @@ -155,18 +155,23 @@ // We are using the SReg_32_XM0 and not the SReg_32 register class for 32-bit // SMRD instructions, because the SReg_32_XM0 register class does not include M0 // and writing to M0 from an SMRD instruction will hang the GPU. -defm S_LOAD_DWORD : SM_Pseudo_Loads <"s_load_dword", SReg_64, SReg_32_XM0>; -defm S_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_load_dwordx2", SReg_64, SReg_64>; + +// XXX - SMEM instructions do not allow exec for data operand, but +// does sdst for SMRD on SI/CI? +defm S_LOAD_DWORD : SM_Pseudo_Loads <"s_load_dword", SReg_64, SReg_32_XM0_XEXEC>; +defm S_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_load_dwordx2", SReg_64, SReg_64_XEXEC>; defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_load_dwordx4", SReg_64, SReg_128>; defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <"s_load_dwordx8", SReg_64, SReg_256>; defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <"s_load_dwordx16", SReg_64, SReg_512>; defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads < - "s_buffer_load_dword", SReg_128, SReg_32_XM0 + "s_buffer_load_dword", SReg_128, SReg_32_XM0_XEXEC >; +// FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on +// SI/CI, but disallowed for SMEM on VI.
defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads < - "s_buffer_load_dwordx2", SReg_128, SReg_64 + "s_buffer_load_dwordx2", SReg_128, SReg_64_XEXEC >; defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads < @@ -181,16 +186,16 @@ "s_buffer_load_dwordx16", SReg_128, SReg_512 >; -defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0>; -defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64>; +defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0_XEXEC>; +defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64_XEXEC>; defm S_STORE_DWORDX4 : SM_Pseudo_Stores <"s_store_dwordx4", SReg_64, SReg_128>; defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores < - "s_buffer_store_dword", SReg_128, SReg_32_XM0 + "s_buffer_store_dword", SReg_128, SReg_32_XM0_XEXEC >; defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores < - "s_buffer_store_dwordx2", SReg_128, SReg_64 + "s_buffer_store_dwordx2", SReg_128, SReg_64_XEXEC >; defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores < Index: test/CodeGen/AMDGPU/llvm.dbg.value.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -2,7 +2,11 @@ ; CHECK-LABEL: {{^}}test_debug_value: ; CHECK: s_load_dwordx2 s[4:5] -; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR4_SGPR5 + +; FIXME: Why is the SGPR4_SGPR5 reference being removed from DBG_VALUE? 
+; CHECK: ; kill: %SGPR4_SGPR5 %SGPR4_SGPR5 +; CHECK-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- undef + ; CHECK: buffer_store_dword ; CHECK: s_endpgm define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { Index: test/CodeGen/AMDGPU/spill-m0.ll =================================================================== --- test/CodeGen/AMDGPU/spill-m0.ll +++ test/CodeGen/AMDGPU/spill-m0.ll @@ -60,21 +60,22 @@ @lds = internal addrspace(3) global [64 x float] undef -; GCN-LABEL: {{^}}spill_m0_lds: +; m0 is killed, so it isn't necessary during the entry block spill to preserve it +; GCN-LABEL: {{^}}spill_kill_m0_lds: ; GCN: s_mov_b32 m0, s6 ; GCN: v_interp_mov_f32 -; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM-NOT: s_m0 ; TOSMEM: s_mov_b32 m0, s7 ; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill -; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM-NOT: m0 -; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM-NOT: m0 ; TOSMEM: s_add_u32 m0, s7, 0x100 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill ; TOSMEM: s_add_u32 m0, s7, 0x200 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill -; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM-NOT: m0 ; TOSMEM: s_mov_b64 exec, ; TOSMEM: s_cbranch_execz @@ -88,35 +89,89 @@ ; GCN-NOT: v_readlane_b32 m0 ; GCN-NOT: s_buffer_store_dword m0 ; GCN-NOT: s_buffer_load_dword m0 -define amdgpu_ps void @spill_m0_lds(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) #0 { +define amdgpu_ps void @spill_kill_m0_lds(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3) #0 { main_body: - %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) - %cmp = fcmp ueq float 0.0, %4 + %tmp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %arg3) + %cmp = fcmp ueq float 0.000000e+00, %tmp br i1 %cmp, 
label %if, label %else -if: +if: ; preds = %main_body %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0 %lds_data = load float, float addrspace(3)* %lds_ptr br label %endif -else: - %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) +else: ; preds = %main_body + %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %arg3) + br label %endif + +endif: ; preds = %else, %if + %export = phi float [ %lds_data, %if ], [ %interp, %else ] + %tmp4 = call i32 @llvm.SI.packf16(float %export, float %export) + %tmp5 = bitcast i32 %tmp4 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp5, float %tmp5, float %tmp5, float %tmp5) + ret void +} + +; Force save and restore of m0 during SMEM spill +; GCN-LABEL: {{^}}m0_unavailable_spill: + +; GCN: ; def m0, 1 + +; GCN: s_mov_b32 m0, s2 +; GCN: v_interp_mov_f32 + +; GCN: ; clobber m0 + +; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM: s_mov_b32 m0, s3 +; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill +; TOSMEM: s_add_u32 m0, s3, 0x100 +; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill +; TOSMEM: s_mov_b32 m0, vcc_hi + +; TOSMEM: s_mov_b64 exec, +; TOSMEM: s_cbranch_execz +; TOSMEM: s_branch + +; TOSMEM: BB{{[0-9]+_[0-9]+}}: +; TOSMEM-NEXT: s_mov_b32 m0, s3 +; TOSMEM-NEXT: s_buffer_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Reload +; TOSMEM-NEXT: s_add_u32 m0, s3, 0x100 + +; FIXME: Could delay this wait +; TOSMEM-NEXT: s_waitcnt lgkmcnt(0) +; TOSMEM-NEXT: s_buffer_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Reload + + +; GCN-NOT: v_readlane_b32 m0 +; GCN-NOT: s_buffer_store_dword m0 +; GCN-NOT: s_buffer_load_dword m0 +define void @m0_unavailable_spill(i32 %arg3) #0 { +main_body: + %m0 = call i32 asm sideeffect "; def $0, 1", "={M0}"() #0 + %tmp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %arg3) + call void asm 
sideeffect "; clobber $0", "~{M0}"() #0 + %cmp = fcmp ueq float 0.000000e+00, %tmp + br i1 %cmp, label %if, label %else + +if: ; preds = %main_body + store volatile i32 8, i32 addrspace(1)* undef + br label %endif + +else: ; preds = %main_body + store volatile i32 11, i32 addrspace(1)* undef br label %endif endif: - %export = phi float [%lds_data, %if], [%interp, %else] - %5 = call i32 @llvm.SI.packf16(float %export, float %export) - %6 = bitcast i32 %5 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6) ret void } ; GCN-LABEL: {{^}}restore_m0_lds: ; TOSMEM: s_cmp_eq_u32 -; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM-NOT: m0 ; TOSMEM: s_mov_b32 m0, s3 ; TOSMEM: s_buffer_store_dword s4, s[84:87], m0 ; 4-byte Folded Spill -; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM-NOT: m0 ; TOSMEM: s_cbranch_scc1 ; TOSMEM: s_mov_b32 m0, -1 @@ -132,11 +187,12 @@ ; TOSMEM: ds_write_b64 -; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM-NOT: m0 ; TOSMEM: s_add_u32 m0, s3, 0x200 ; TOSMEM: s_buffer_load_dword s0, s[84:87], m0 ; 4-byte Folded Reload -; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM-NOT: m0 ; TOSMEM: s_waitcnt lgkmcnt(0) +; TOSMEM-NOT: m0 ; TOSMEM: s_mov_b32 m0, s0 ; TOSMEM: ; use m0 Index: test/MC/AMDGPU/smem-err.s =================================================================== --- /dev/null +++ test/MC/AMDGPU/smem-err.s @@ -0,0 +1,55 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefix=NOVI %s + +s_memtime exec +// NOVI: :11: error: invalid operand for instruction + +s_memrealtime exec +// NOVI: :15: error: invalid operand for instruction + +s_store_dword m0, s[2:3], 0x0 +// NOVI: :15: error: invalid operand for instruction + +s_store_dword exec_lo, s[2:3], 0x0 +// NOVI: :15: error: invalid operand for instruction + +s_store_dword exec_hi, s[2:3], 0x0 +// NOVI: :15: error: invalid operand for instruction + +s_store_dwordx2 exec, s[2:3], 0x0 +// NOVI: :17: error: invalid operand for instruction + 
+s_buffer_store_dword m0, s[0:3], 0x0 +// NOVI: :22: error: invalid operand for instruction + +s_buffer_store_dword exec_lo, s[0:3], 0x0 +// NOVI: :22: error: invalid operand for instruction + +s_buffer_store_dword exec_hi, s[0:3], 0x0 +// NOVI: :22: error: invalid operand for instruction + +s_buffer_store_dwordx2 exec, s[0:3], 0x0 +// NOVI: :24: error: invalid operand for instruction + +s_load_dword m0, s[0:1], s4 +// NOVI: :14: error: invalid operand for instruction + +s_load_dword exec_lo, s[0:1], s4 +// NOVI: :14: error: invalid operand for instruction + +s_load_dword exec_hi, s[0:1], s4 +// NOVI: :14: error: invalid operand for instruction + +s_load_dwordx2 exec, s[0:1], s4 +// NOVI: :16: error: invalid operand for instruction + +s_buffer_load_dword m0, s[0:3], s4 +// NOVI: :21: error: invalid operand for instruction + +s_buffer_load_dword exec_lo, s[0:3], s4 +// NOVI: :21: error: invalid operand for instruction + +s_buffer_load_dword exec_hi, s[0:3], s4 +// NOVI: :21: error: invalid operand for instruction + +s_buffer_load_dwordx2 exec, s[0:3], s4 +// NOVI: :23: error: invalid operand for instruction