Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -391,7 +391,10 @@ case AMDGPU::FLAT_SCR: case AMDGPU::FLAT_SCR_LO: case AMDGPU::FLAT_SCR_HI: - FlatUsed = true; + // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat + // instructions aren't used to access the scratch buffer. + if (MFI->flatUsesScratch()) + FlatUsed = true; continue; case AMDGPU::TBA: Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -97,6 +97,7 @@ bool KernargSegmentPtr : 1; bool DispatchID : 1; bool FlatScratchInit : 1; + bool FlatUsesScratch : 1; bool GridWorkgroupCountX : 1; bool GridWorkgroupCountY : 1; bool GridWorkgroupCountZ : 1; @@ -207,6 +208,10 @@ return FlatScratchInit; } + bool flatUsesScratch() const { + return FlatUsesScratch; + } + bool hasGridWorkgroupCountX() const { return GridWorkgroupCountX; } Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -67,6 +67,7 @@ KernargSegmentPtr(false), DispatchID(false), FlatScratchInit(false), + FlatUsesScratch(false), GridWorkgroupCountX(false), GridWorkgroupCountY(false), GridWorkgroupCountZ(false), @@ -135,6 +136,8 @@ ST.isAmdHsaOS()) FlatScratchInit = true; + FlatUsesScratch = ST.getGeneration() >= SISubtarget::SEA_ISLANDS && + ST.isAmdHsaOS(); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F); WavesPerEU = ST.getWavesPerEU(*F); } Index: lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.h +++ lib/Target/AMDGPU/SIRegisterInfo.h @@ -22,6 +22,7 @@ class SISubtarget; class MachineRegisterInfo; +class SIMachineFunctionInfo; class SIRegisterInfo final : public AMDGPURegisterInfo { private: @@ -196,7 +197,8 @@ unsigned getNumAddressableSGPRs(const SISubtarget &ST) const; /// \returns Number of reserved SGPRs supported by the subtarget. - unsigned getNumReservedSGPRs(const SISubtarget &ST) const; + unsigned getNumReservedSGPRs(const SISubtarget &ST, + const SIMachineFunctionInfo &MFI) const; /// \returns Minimum number of SGPRs that meets given number of waves per /// execution unit requirement for given subtarget. Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1114,9 +1114,16 @@ return 104; } -unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST) const { +unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST, + const SIMachineFunctionInfo &MFI) const { + // TODO: FLAT_SCRATCH isn't reserved on CI. + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && + MFI.flatUsesScratch()) + return 6; // FLAT_SCRATCH, XNACK, VCC (in that order) + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return 6; // VCC, FLAT_SCRATCH, XNACK. + return 4; // XNACK, VCC (in that order) + return 2; // VCC. } @@ -1188,7 +1195,7 @@ F, "amdgpu-num-sgpr", MaxNumSGPRs); // Make sure requested value does not violate subtarget's specifications. - if (Requested && (Requested <= getNumReservedSGPRs(ST))) + if (Requested && (Requested <= getNumReservedSGPRs(ST, MFI))) Requested = 0; // If more SGPRs are required to support the input user/system SGPRs, @@ -1217,7 +1224,8 @@ if (ST.hasSGPRInitBug()) MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; - return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST), MaxNumAddressableSGPRs); + return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST, MFI), + MaxNumAddressableSGPRs); } unsigned SIRegisterInfo::getNumDebuggerReservedVGPRs( Index: test/CodeGen/AMDGPU/exceed-max-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/exceed-max-sgprs.ll +++ test/CodeGen/AMDGPU/exceed-max-sgprs.ll @@ -38,7 +38,7 @@ ret void } -; ERROR: error: scalar registers limit of 104 exceeded (108) in use_too_many_sgprs_bonaire_flat_scr +; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire_flat_scr define void @use_too_many_sgprs_bonaire_flat_scr() #1 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () Index: test/CodeGen/AMDGPU/flat-scratch-reg.ll =================================================================== --- test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -1,11 +1,11 @@ ; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=NOXNACK -check-prefix=CI -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=NOXNACK -check-prefix=VI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=XNACK -check-prefix=VI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefix=XNACK -check-prefix=VI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=XNACK -check-prefix=VI-XNACK -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefix=XNACK -check-prefix=VI-XNACK -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=NOXNACK -check-prefix=HSA-NOXNACK -check-prefix=HSA -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=NOXNACK -check-prefix=HSA-NOXNACK -check-prefix=HSA -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=XNACK -check-prefix=HSA-XNACK -check-prefix=HSA -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=NOXNACK -check-prefix=HSA-NOXNACK -check-prefix=HSA -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI -check-prefix=NOXNACK -check-prefix=HSA-NOXNACK -check-prefix=HSA -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI -check-prefix=XNACK -check-prefix=HSA-XNACK -check-prefix=HSA -check-prefix=GCN %s ; GCN-LABEL: {{^}}no_vcc_no_flat: ; HSA-NOXNACK: is_xnack_enabled = 0 @@ -35,8 +35,11 @@ ; HSA-NOXNACK: is_xnack_enabled = 0 ; HSA-XNACK: is_xnack_enabled = 1 -; CI: ; NumSgprs: 12 -; VI: ; NumSgprs: 14 +; CI: ; NumSgprs: 8 +; VI: ; NumSgprs: 8 +; VI-XNACK: ; NumSgprs: 12 +; HSA-CI: ; NumSgprs: 12 +; HSA-VI: ; NumSgprs: 14 define void @no_vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"() @@ -47,8 +50,10 @@ ; HSA-NOXNACK: is_xnack_enabled = 0 ; HSA-XNACK: is_xnack_enabled = 1 -; CI: ; NumSgprs: 12 -; VI: ; NumSgprs: 14 +; CI: ; NumSgprs: 10 +; VI: ; NumSgprs: 10 +; HSA-CI: ; NumSgprs: 12 +; HSA-VI: ; NumSgprs: 14 define void @vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"() Index: test/CodeGen/AMDGPU/spill-m0.ll =================================================================== --- test/CodeGen/AMDGPU/spill-m0.ll +++ test/CodeGen/AMDGPU/spill-m0.ll @@ -7,7 +7,8 @@ ; XXX - Why does it like to use vcc? ; GCN-LABEL: {{^}}spill_m0: -; TOSMEM: s_mov_b32 s84, SCRATCH_RSRC_DWORD0 +; TOSMEM: s_mov_b32 s[[LO:[0-9]+]], SCRATCH_RSRC_DWORD0 +; TOSMEM: s_mov_b32 s[[HI:[0-9]+]], 0xe80000 ; GCN-DAG: s_cmp_lg_u32 @@ -22,7 +23,7 @@ ; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0 ; TOSMEM: s_mov_b32 m0, s3{{$}} ; TOSMEM-NOT: [[M0_COPY]] -; TOSMEM: s_buffer_store_dword [[M0_COPY]], s[84:87], m0 ; 4-byte Folded Spill +; TOSMEM: s_buffer_store_dword [[M0_COPY]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Spill ; TOSMEM: s_waitcnt lgkmcnt(0) ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]] @@ -37,7 +38,7 @@ ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]] ; TOSMEM: s_mov_b32 m0, s3{{$}} -; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s[84:87], m0 ; 4-byte Folded Reload +; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Reload ; TOSMEM-NOT: [[M0_RESTORE]] ; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]] @@ -115,7 +116,7 @@ ; TOSMEM: s_cmp_eq_u32 ; TOSMEM: s_mov_b32 vcc_hi, m0 ; TOSMEM: s_mov_b32 m0, s3 -; TOSMEM: s_buffer_store_dword s4, s[84:87], m0 ; 4-byte Folded Spill +; TOSMEM: s_buffer_store_dword s4, s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Spill ; TOSMEM: s_mov_b32 m0, vcc_hi ; TOSMEM: s_cbranch_scc1 @@ -123,10 +124,10 @@ ; TOSMEM: s_mov_b32 vcc_hi, m0 ; TOSMEM: s_mov_b32 m0, s3 -; TOSMEM: s_buffer_load_dword s4, s[84:87], m0 ; 4-byte Folded Reload +; TOSMEM: s_buffer_load_dword s4, s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Reload ; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM: s_waitcnt lgkmcnt(0) -; TOSMEM: s_buffer_load_dword s5, s[84:87], m0 ; 4-byte Folded Reload +; TOSMEM: s_buffer_load_dword s5, s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Reload ; TOSMEM: s_mov_b32 m0, vcc_hi ; TOSMEM: s_waitcnt lgkmcnt(0) @@ -134,7 +135,7 @@ ; TOSMEM: s_mov_b32 vcc_hi, m0 ; TOSMEM: s_add_u32 m0, s3, 0x200 -; TOSMEM: s_buffer_load_dword s0, s[84:87], m0 ; 4-byte Folded Reload +; TOSMEM: s_buffer_load_dword s0, s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Reload ; TOSMEM: s_mov_b32 m0, vcc_hi ; TOSMEM: s_waitcnt lgkmcnt(0) ; TOSMEM: s_mov_b32 m0, s0