Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -186,14 +186,6 @@ def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; - -def FeatureEnableHugeScratchBuffer : SubtargetFeature< - "huge-scratch-buffer", - "EnableHugeScratchBuffer", - "true", - "Enable scratch buffer sizes greater than 128 GB" ->; - def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", "EnableVGPRSpilling", "true", Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -91,7 +91,6 @@ bool FeatureDisable; int LDSBankCount; unsigned IsaVersion; - bool EnableHugeScratchBuffer; bool EnableSIScheduler; std::unique_ptr FrameLowering; @@ -283,10 +282,6 @@ return false; } - bool enableHugeScratchBuffer() const { - return EnableHugeScratchBuffer; - } - bool enableSIScheduler() const { return EnableSIScheduler; } Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -82,7 +82,7 @@ LocalMemorySize(0), MaxPrivateElementSize(0), EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), - IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false), + IsaVersion(ISAVersion0_0_0), EnableSIScheduler(false), FrameLowering(nullptr), InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -1175,25 +1175,35 @@ FrameIndexSDNode *FINode = cast(Op); unsigned 
FrameIndex = FINode->getIndex(); - // A FrameIndex node represents a 32-bit offset into scratch memory. If - // the high bit of a frame index offset were to be set, this would mean - // that it represented an offset of ~2GB * 64 = ~128GB from the start of the - // scratch buffer, with 64 being the number of threads per wave. + // A FrameIndex node represents a 32-bit offset into scratch memory. If the + // high bit of a frame index offset were to be set, this would mean that it + // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch + // buffer, with 64 being the number of threads per wave. // - // If we know the machine uses less than 128GB of scratch, then we can - // amrk the high bit of the FrameIndex node as known zero, - // which is important, because it means in most situations we can - // prove that values derived from FrameIndex nodes are non-negative. - // This enables us to take advantage of more addressing modes when - // accessing scratch buffers, since for scratch reads/writes, the register - // offset must always be positive. + // The maximum private allocation for the entire GPU is 4G, and we are + // concerned with the largest the index could ever be for an individual + // workitem. This will occur with the minimum dispatch size. If a program + // requires more, the dispatch size will be reduced. + // + // With this limit, we can mark the high bit of the FrameIndex node as known + // zero, which is important, because it means in most situations we can prove + // that values derived from FrameIndex nodes are non-negative. This enables us + // to take advantage of more addressing modes when accessing scratch buffers, + // since for scratch reads/writes, the register offset must always be + // positive. 
- SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); - if (Subtarget->enableHugeScratchBuffer()) - return TFI; + uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024; + + // XXX - It is unclear if partial dispatch works. Assume it works at half wave + // granularity. It is probably a full wave. + uint64_t MinGranularity = 32; + unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity); + EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits); + + SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI, - DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31))); + DAG.getValueType(ExtVT)); } bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { Index: test/CodeGen/AMDGPU/private-element-size.ll =================================================================== --- test/CodeGen/AMDGPU/private-element-size.ll +++ test/CodeGen/AMDGPU/private-element-size.ll @@ -33,9 +33,9 @@ ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -99,10 +99,14 @@ ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:56{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen 
offset:60{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}} define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: test/CodeGen/AMDGPU/scratch-buffer.ll =================================================================== --- test/CodeGen/AMDGPU/scratch-buffer.ll +++ test/CodeGen/AMDGPU/scratch-buffer.ll @@ -1,7 +1,5 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s +; 
RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s ; When a frame index offset is more than 12-bits, make sure we don't store ; it in mubuf's offset field. @@ -102,8 +100,7 @@ } ; GCN-LABEL: @pos_vaddr_offse -; DEFAULT-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16 -; HUGE-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16 define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) { entry: %array = alloca [8192 x i32]