Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -458,13 +458,6 @@ def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; -def FeatureEnableHugePrivateBuffer : SubtargetFeature< - "huge-private-buffer", - "EnableHugePrivateBuffer", - "true", - "Enable private/scratch buffer sizes greater than 128 GB" ->; - def FeatureDumpCode : SubtargetFeature <"DumpCode", "DumpCode", "true", Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -299,7 +299,6 @@ bool TrapHandler; // Used as options. - bool EnableHugePrivateBuffer; bool EnableLoadStoreOpt; bool EnableUnsafeDSOffsetFolding; bool EnableSIScheduler; @@ -377,6 +376,9 @@ SITargetLowering TLInfo; SIFrameLowering FrameLowering; + // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword. + static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); + public: GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM); @@ -436,6 +438,11 @@ return Log2_32(WavefrontSize); } + /// Return the number of high bits known to be zero for a frame index. + unsigned getKnownHighZeroBitsForFrameIndex() const { + return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); + } + int getLDSBankCount() const { return LDSBankCount; } @@ -526,10 +533,6 @@ return isAmdHsaOS() ? 
TrapHandlerAbiHsa : TrapHandlerAbiNone; } - bool enableHugePrivateBuffer() const { - return EnableHugePrivateBuffer; - } - bool unsafeDSOffsetFoldingEnabled() const { return EnableUnsafeDSOffsetFolding; } Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -190,7 +190,6 @@ EnableCuMode(false), TrapHandler(false), - EnableHugePrivateBuffer(false), EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), EnableSIScheduler(false), Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -93,12 +93,6 @@ cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false)); -static cl::opt AssumeFrameIndexHighZeroBits( - "amdgpu-frame-index-zero-bits", - cl::desc("High bits of frame index assumed to be zero"), - cl::init(5), - cl::ReallyHidden); - static cl::opt DisableLoopAlignment( "amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), @@ -2059,13 +2053,14 @@ Reg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); - if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) { + if (Arg.Flags.isSRet()) { // The return object should be reasonably addressable. // FIXME: This helps when the return is a real sret. If it is a // automatically inserted sret (i.e. CanLowerReturn returns false), an // extra copy is inserted in SelectionDAGBuilder which obscures this. 
- unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits; + unsigned NumBits + = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex(); Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits))); } @@ -9970,14 +9965,10 @@ TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts, DAG, Depth); - if (getSubtarget()->enableHugePrivateBuffer()) - return; - - // Technically it may be possible to have a dispatch with a single workitem - // that uses the full private memory size, but that's not really useful. We - // can't use vaddr in MUBUF instructions if we don't know the address + // Set the high bits to zero based on the maximum allowed scratch size per + // wave. We can't use vaddr in MUBUF instructions if we don't know the address // calculation won't overflow, so assume the sign bit is never set. - Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits); + Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex()); } unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { Index: test/CodeGen/AMDGPU/frame-index-elimination.ll =================================================================== --- test/CodeGen/AMDGPU/frame-index-elimination.ll +++ test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -60,7 +60,7 @@ ; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6 ; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] -; GCN-NEXT: v_mul_lo_u32 v0, v0, 9 +; GCN-NEXT: v_mul_u32_u24_e32 v0, 9, v0 ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @func_other_fi_user_i32() #0 { @@ -172,7 +172,7 @@ ; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6 ; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], s6, [[SCALED]] -; GCN: v_mul_lo_u32 [[VZ]], [[VZ]], 9 +; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]] ; GCN: ds_write_b32 v0, [[VZ]] define void @func_other_fi_user_non_inline_imm_offset_i32() #0 { %alloca0 = alloca [128 x i32], align 4, addrspace(5) @@ -196,7 +196,7 @@ ; GFX9-DAG: 
v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[DIFF]] ; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[OFFSET]], [[SCALED]] -; GCN: v_mul_lo_u32 [[VZ]], [[VZ]], 9 +; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]] ; GCN: ds_write_b32 v0, [[VZ]] define void @func_other_fi_user_non_inline_imm_offset_i32_vcc_live() #0 { %alloca0 = alloca [128 x i32], align 4, addrspace(5) Index: test/CodeGen/AMDGPU/function-returns.ll =================================================================== --- test/CodeGen/AMDGPU/function-returns.ll +++ test/CodeGen/AMDGPU/function-returns.ll @@ -570,4 +570,24 @@ ret { <3 x float>, i32 } %insert.4 } +; GCN-LABEL: {{^}}void_func_sret_max_known_zero_bits: +; GCN: v_lshrrev_b32_e32 [[LSHR16:v[0-9]+]], 16, v0 +; GCN: ds_write_b32 {{v[0-9]+}}, [[LSHR16]] + +; GCN: v_mov_b32_e32 [[HIGH_BITS:v[0-9]+]], 0 +; GCN: ds_write_b32 {{v[0-9]+}}, [[HIGH_BITS]] +; GCN-NEXT: ds_write_b32 {{v[0-9]+}}, [[HIGH_BITS]] +define void @void_func_sret_max_known_zero_bits(i8 addrspace(5)* sret %arg0) #0 { + %arg0.int = ptrtoint i8 addrspace(5)* %arg0 to i32 + + %lshr0 = lshr i32 %arg0.int, 16 + %lshr1 = lshr i32 %arg0.int, 17 + %lshr2 = lshr i32 %arg0.int, 18 + + store volatile i32 %lshr0, i32 addrspace(3)* undef + store volatile i32 %lshr1, i32 addrspace(3)* undef + store volatile i32 %lshr2, i32 addrspace(3)* undef + ret void +} + attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/huge-private-buffer.ll =================================================================== --- test/CodeGen/AMDGPU/huge-private-buffer.ll +++ test/CodeGen/AMDGPU/huge-private-buffer.ll @@ -1,31 +1,42 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; GCN-LABEL: {{^}}scratch_buffer_known_high_bit_small: +; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16: +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 +; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xfffc, [[FI]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] +define amdgpu_kernel 
void @scratch_buffer_known_high_masklo16() #0 { + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + %toint = ptrtoint i32 addrspace(5)* %alloca to i32 + %masked = and i32 %toint, 65535 + store volatile i32 %masked, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17: ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 ; GCN-NOT: [[FI]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]] -define amdgpu_kernel void @scratch_buffer_known_high_bit_small() #0 { +define amdgpu_kernel void @scratch_buffer_known_high_masklo17() #0 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca %toint = ptrtoint i32 addrspace(5)* %alloca to i32 - %masked = and i32 %toint, 2147483647 + %masked = and i32 %toint, 131071 store volatile i32 %masked, i32 addrspace(1)* undef ret void } -; GCN-LABEL: {{^}}scratch_buffer_known_high_bit_huge: +; GCN-LABEL: {{^}}scratch_buffer_known_high_mask18: ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 -; GCN-DAG: buffer_store_dword -; GCN-DAG: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x7ffffffc, [[FI]] -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] -define amdgpu_kernel void @scratch_buffer_known_high_bit_huge() #1 { +; GCN-NOT: [[FI]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]] +define amdgpu_kernel void @scratch_buffer_known_high_mask18() #0 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca %toint = ptrtoint i32 addrspace(5)* %alloca to i32 - %masked = and i32 %toint, 2147483647 + %masked = and i32 %toint, 262143 store volatile i32 %masked, i32 addrspace(1)* undef ret void } attributes #0 = { nounwind } -attributes #1 = { nounwind "target-features"="+huge-private-buffer" }