Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -124,7 +124,9 @@ // SGPR 32-bit registers def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add (sequence "SGPR%u", 0, 103))>; + (add (sequence "SGPR%u", 0, 103))> { + let AllocationPriority = 1; +} // SGPR 64-bit registers def SGPR_64Regs : RegisterTuples<[sub0, sub1], @@ -189,7 +191,9 @@ // VGPR 32-bit registers def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add (sequence "VGPR%u", 0, 255))>; + (add (sequence "VGPR%u", 0, 255))> { + let AllocationPriority = 1; +} // VGPR 64-bit registers def VGPR_64 : RegisterTuples<[sub0, sub1], @@ -253,16 +257,22 @@ def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI) ->; +> { + let AllocationPriority = 1; +} // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SGPR_32, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI) ->; +> { + let AllocationPriority = 1; +} -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>; +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> { + let AllocationPriority = 2; +} def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> { let isAllocatable = 0; @@ -270,35 +280,44 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, (add SGPR_64, VCC, EXEC, FLAT_SCR, TTMP_64, TBA, TMA) ->; +> { + let AllocationPriority = 2; +} // Requires 2 s_mov_b64 to copy let CopyCost = 2 in { -def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)>; +def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> { + let AllocationPriority = 4; +} def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> { let isAllocatable = 0; } -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)>; +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> { + let AllocationPriority = 4; +} } // End CopyCost = 2 def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> { // Requires 4 s_mov_b64 to copy let CopyCost = 4; + let AllocationPriority = 5; } def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> { // Requires 8 s_mov_b64 to copy let CopyCost = 8; + let AllocationPriority = 6; } // Register class for all vector registers (VGPRs + Interploation Registers) def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> { // Requires 2 v_mov_b32 to copy let CopyCost = 2; + let AllocationPriority = 2; } def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> { @@ -306,19 +325,23 @@ // Requires 3 v_mov_b32 to copy let CopyCost = 3; + let AllocationPriority = 3; } def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> { // Requires 4 v_mov_b32 to copy let CopyCost = 4; + let AllocationPriority = 4; } def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> { let CopyCost = 8; + let AllocationPriority = 5; } def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> { let CopyCost = 16; + let AllocationPriority = 6; } def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { Index: test/CodeGen/AMDGPU/amdgcn.private-memory.ll =================================================================== --- test/CodeGen/AMDGPU/amdgcn.private-memory.ll +++ test/CodeGen/AMDGPU/amdgcn.private-memory.ll @@ -1,9 +1,9 @@ -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC +; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE %s +; RUN: llc -mattr=+promote-alloca,-flat-for-global -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE -check-prefix=HSA %s +; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA %s +; RUN: llc -mattr=-promote-alloca,-flat-for-global -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA -check-prefix=HSA %s +; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE %s +; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -11,9 +11,10 @@ ; Make sure we don't overwrite workitem information with private memory -; FUNC-LABEL: {{^}}work_item_info: - -; SI-NOT: v_mov_b32_e{{(32|64)}} v0 +; GCN-LABEL: {{^}}work_item_info: +; GCN-NOT: v0 +; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}} +; GCN: buffer_store_dword [[RESULT]] define void @work_item_info(i32 addrspace(1)* %out, i32 %in) { entry: %0 = alloca [2 x i32] Index: test/CodeGen/AMDGPU/half.ll =================================================================== --- test/CodeGen/AMDGPU/half.ll +++ test/CodeGen/AMDGPU/half.ll @@ -398,9 +398,8 @@ ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN-NOT: v_cvt_f32_f16 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 ; GCN: v_cvt_f64_f32_e32 ; GCN: v_cvt_f64_f32_e32 Index: test/CodeGen/AMDGPU/load.ll =================================================================== --- test/CodeGen/AMDGPU/load.ll +++ test/CodeGen/AMDGPU/load.ll @@ -725,7 +725,7 @@ ; an immediate. ; FUNC-LABEL: {{^}}load_i32_local_const_ptr: ; SI: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0 -; SI: ds_read_b32 v0, v[[ZERO]] offset:4 +; SI: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4 ; R600: LDS_READ_RET define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { entry: Index: test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll =================================================================== --- test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -6,12 +6,13 @@ ; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32 ; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add: -; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} -; GCN-DAG: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] +; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} ; GCN-NOT: v_mov_b32 -; GCN-NEXT: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] +; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] +; GCN-NOT: v_mov_b32 +; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] ; GCN-NOT: v_mov_b32 ; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]] Index: test/CodeGen/AMDGPU/salu-to-valu.ll =================================================================== --- test/CodeGen/AMDGPU/salu-to-valu.ll +++ test/CodeGen/AMDGPU/salu-to-valu.ll @@ -51,14 +51,16 @@ } ; Test moving an SMRD instruction to the VALU +; FIXME: movs can be moved before nop to reduce count ; GCN-LABEL: {{^}}smrd_valu: ; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0 ; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} ; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} -; SI: s_mov_b32 -; SI: s_nop 2 +; SI: s_nop 3 ; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]] +; SI: s_mov_b32 + ; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8 ; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]] ; GCN-NOHSA: buffer_store_dword [[V_OUT]] Index: test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll +++ test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI --check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI --check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s ; FUNC-LABEL: {{^}}cluster_arg_loads: ; FIXME: Due to changes in the load clustering heuristics. We no longer @@ -9,9 +9,9 @@ ; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9 ; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 -; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 +; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { store i32 %x, i32 addrspace(1)* %out0, align 4 store i32 %y, i32 addrspace(1)* %out1, align 4 Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll =================================================================== --- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -12,16 +12,16 @@ ; GCN-LABEL: {{^}}main: -; GCN-DAG: s_mov_b32 s6, s12 +; GCN-DAG: s_mov_b32 s13, s12 ; GCN-DAG: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 ; GCN-DAG: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 ; GCN-DAG: s_mov_b32 s18, -1 ; SI-DAG: s_mov_b32 s19, 0x88f000 ; VI-DAG: s_mov_b32 s19, 0x880000 -; s6 is offset system SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s6 offset:{{[0-9]+}} ; 16-byte Folded Spill -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[16:19], s6 offset:{{[0-9]+}} ; 16-byte Folded Reload +; s13 is offset system SGPR +; GCN: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Spill +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Reload ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1024 Index: test/CodeGen/AMDGPU/waitcnt-flat.ll =================================================================== --- test/CodeGen/AMDGPU/waitcnt-flat.ll +++ test/CodeGen/AMDGPU/waitcnt-flat.ll @@ -6,9 +6,9 @@ ; for the original bug. ; GCN: {{^}}test: -; GCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]] -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}] +; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]] +; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0) +; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}] define void @test(i32 addrspace(1)* %out, i32 %in) { store volatile i32 0, i32 addrspace(1)* %out %val = load volatile i32, i32 addrspace(1)* %out Index: test/CodeGen/AMDGPU/wqm.ll =================================================================== --- test/CodeGen/AMDGPU/wqm.ll +++ test/CodeGen/AMDGPU/wqm.ll @@ -310,16 +310,16 @@ ; ... but only if WQM is necessary. ; -;CHECK-LABEL: {{^}}test_kill_1: -;CHECK-NEXT: ; %main_body -;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK-NEXT: s_wqm_b64 exec, exec -;CHECK: image_sample -;CHECK: s_and_b64 exec, exec, [[ORIG]] -;SI: buffer_store_dword -;VI: flat_store_dword -;CHECK-NOT: wqm -;CHECK: v_cmpx_ +; CHECK-LABEL: {{^}}test_kill_1: +; CHECK-NEXT: ; %main_body +; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; CHECK: s_wqm_b64 exec, exec +; CHECK: image_sample +; CHECK: s_and_b64 exec, exec, [[ORIG]] +; SI: buffer_store_dword +; VI: flat_store_dword +; CHECK-NOT: wqm +; CHECK: v_cmpx_ define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) { main_body: %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)