Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -346,6 +346,13 @@
 def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
 def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
 
+def FeatureEnableHugePrivateBuffer : SubtargetFeature<
+  "huge-private-buffer",
+  "EnableHugePrivateBuffer",
+  "true",
+  "Enable private/scratch buffer sizes greater than 128 GB"
+>;
+
 def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
   "EnableVGPRSpilling",
   "true",
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1129,6 +1129,13 @@
                                           MVT::i32));
 }
 
+static bool ptrAddWillNotOverflow(SelectionDAG &DAG, SDValue Addr) {
+  return Addr.getOpcode() == ISD::OR ||
+         (Addr.getOpcode() == ISD::ADD &&
+          (Addr->getFlags().hasNoUnsignedWrap() ||
+           DAG.SignBitIsZero(Addr.getOperand(0))));
+}
+
 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                  SDValue Addr, SDValue &Rsrc,
                                                  SDValue &VAddr, SDValue &SOffset,
@@ -1169,7 +1176,7 @@
 
     // Offsets in vaddr must be positive.
     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
-    if (isLegalMUBUFImmOffset(C1)) {
+    if (isLegalMUBUFImmOffset(C1) && ptrAddWillNotOverflow(*CurDAG, Addr)) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
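As a rough illustration of what the new ptrAddWillNotOverflow guard accepts (a hedged sketch; the kernel names are made up, and the pattern mirrors the scratch-buffer.ll update at the end of this patch): when the offset addition comes from an inbounds GEP, SelectionDAG tags the add as no-unsigned-wrap, so the constant can still be folded into the MUBUF immediate offset field. A plain GEP carries no such guarantee, so the offset must now be materialized with an explicit v_add_i32 into vaddr. ISD::OR is presumably accepted unconditionally because the DAG only forms it in place of an add when the operand bits are disjoint, which cannot carry into the high bits.

; Folds: the inbounds GEP lets the DAG mark the address add as nuw,
; so the +16 byte offset may still go in the immediate field.
define amdgpu_kernel void @fold_ok(i32 %offset) {
  %array = alloca [8192 x i32]
  %idx = add i32 %offset, 4
  %ptr = getelementptr inbounds [8192 x i32], [8192 x i32]* %array, i32 0, i32 %idx
  store i32 0, i32* %ptr
  ret void
}

; Does not fold: a plain GEP gives no overflow guarantee, so the
; +16 bytes is added to vaddr with v_add_i32 before the store.
define amdgpu_kernel void @no_fold(i32 %offset) {
  %array = alloca [8192 x i32]
  %idx = add i32 %offset, 4
  %ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 %idx
  store i32 0, i32* %ptr
  ret void
}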
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -130,6 +130,7 @@
   bool DebuggerEmitPrologue;
 
   // Used as options.
+  bool EnableHugePrivateBuffer;
   bool EnableVGPRSpilling;
   bool EnablePromoteAlloca;
   bool EnableLoadStoreOpt;
@@ -339,6 +340,10 @@
     return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
   }
 
+  bool enableHugePrivateBuffer() const {
+    return EnableHugePrivateBuffer;
+  }
+
   bool isPromoteAllocaEnabled() const {
     return EnablePromoteAlloca;
   }
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -121,6 +121,7 @@
     DebuggerReserveRegs(false),
     DebuggerEmitPrologue(false),
 
+    EnableHugePrivateBuffer(false),
     EnableVGPRSpilling(false),
     EnablePromoteAlloca(false),
     EnableLoadStoreOpt(false),
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -128,6 +128,7 @@
   bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
   bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
 
+  SDValue lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
   unsigned isCFIntrinsic(const SDNode *Intr) const;
 
   void createDebuggerPrologueStackObjects(MachineFunction &MF) const;
@@ -277,6 +278,12 @@
                    SDValue V) const;
 
   void finalizeLowering(MachineFunction &MF) const override;
+
+  void computeKnownBitsForFrameIndex(const SDValue Op,
+                                     KnownBits &Known,
+                                     const APInt &DemandedElts,
+                                     const SelectionDAG &DAG,
+                                     unsigned Depth = 0) const override;
 };
 
 } // End namespace llvm
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -167,6 +167,7 @@
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+  //setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
   setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
 
   setOperationAction(ISD::SELECT, MVT::i1, Promote);
@@ -3216,7 +3217,8 @@
     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::FP_ROUND:
     return lowerFP_ROUND(Op, DAG);
-
+  case ISD::FrameIndex:
+    return lowerFrameIndex(Op, DAG);
   case ISD::TRAP:
   case ISD::DEBUGTRAP:
     return lowerTRAP(Op, DAG);
@@ -3278,6 +3280,43 @@
   }
 }
 
+SDValue SITargetLowering::lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
+  unsigned FrameIndex = FINode->getIndex();
+
+  // A FrameIndex node represents a 32-bit offset into scratch memory. If the
+  // high bit of a frame index offset were to be set, this would mean that it
+  // represented an offset of ~2 GB * 64 = ~128 GB from the start of the
+  // scratch buffer, with 64 being the number of threads per wave.
+  //
+  // The maximum private allocation for the entire GPU is 4 GB, and we are
+  // concerned with the largest the index could ever be for an individual
+  // workitem. This will occur with the minimum dispatch size. If a program
+  // requires more, the dispatch size will be reduced.
+  //
+  // With this limit, we can mark the high bit of the FrameIndex node as known
+  // zero, which is important, because it means in most situations we can
+  // prove that values derived from FrameIndex nodes are non-negative. This
+  // enables us to take advantage of more addressing modes when accessing
+  // scratch buffers, since for scratch reads/writes, the register offset must
+  // always be positive.
+
+  uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
+
+  // XXX - It is unclear if partial dispatch works. Assume it works at half
+  // wave granularity. It is probably a full wave.
+  uint64_t MinGranularity = 32;
+
+  unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
+  EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
+
+  SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
+                     DAG.getValueType(ExtVT));
+}
+
 /// \brief Helper function for LowerBRCOND
 static SDNode *findUser(SDValue Value, unsigned Opcode) {
@@ -6884,3 +6923,21 @@
 
   TargetLoweringBase::finalizeLowering(MF);
 }
+
+void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
+                                                     KnownBits &Known,
+                                                     const APInt &DemandedElts,
+                                                     const SelectionDAG &DAG,
+                                                     unsigned Depth) const {
+  TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
+                                                DAG, Depth);
+
+  if (getSubtarget()->enableHugePrivateBuffer())
+    return;
+
+  // Technically it may be possible to have a dispatch with a single workitem
+  // that uses the full private memory size, but that's not really useful. We
+  // can't use vaddr in MUBUF instructions if we don't know the address
+  // calculation won't overflow, so assume the sign bit is never set.
+  Known.Zero.setHighBits(1);
+}
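To make the numbers in lowerFrameIndex concrete: MaxGPUAlloc / MinGranularity = 2^32 / 2^5 = 2^27, so KnownBits = Log2_64(2^27) = 27 and the frame index is wrapped in an AssertZext to i27, i.e. the top five bits of the 32-bit offset, including the sign bit, are asserted zero. The computeKnownBitsForFrameIndex override is more conservative and publishes only the sign bit (Known.Zero.setHighBits(1)), which is all that SelectMUBUFScratchOffen's SignBitIsZero query needs; with +huge-private-buffer enabled, the early return drops even that assumption.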
Index: test/CodeGen/AMDGPU/amdgpu.private-memory.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -1,12 +1,12 @@
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s
 ; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
 
-; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=HSAOPT -check-prefix=OPT %s
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=NOHSAOPT -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -enable-var-scope -check-prefix=HSAOPT -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -enable-var-scope -check-prefix=NOHSAOPT -check-prefix=OPT %s
 
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
@@ -391,7 +391,8 @@
 ; FUNC-LABEL: ptrtoint:
 ; SI-NOT: ds_write
 ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
-; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ;
+; SI: v_add_i32_e32 [[ADD_OFFSET:v[0-9]+]], vcc, 5,
+; SI: buffer_load_dword v{{[0-9]+}}, [[ADD_OFFSET]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
 define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32]
   %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
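The updated ptrtoint checks above show the expected cost of the change: judging by the test's name and the odd byte offset, the +5 is added to the raw integer produced by ptrtoint, so no no-wrap information survives to the address computation and the constant can no longer be folded into the MUBUF immediate field; it is instead materialized with v_add_i32_e32 before the load. (The second CHECK line previously re-bound [[ADD_OFFSET:v[0-9]+]] instead of matching the earlier capture; it is corrected to [[ADD_OFFSET]] here so the two lines actually check the same register.)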
Index: test/CodeGen/AMDGPU/huge-private-buffer.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/huge-private-buffer.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}scratch_buffer_known_high_bit_small:
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN-NOT: [[FI]]
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
+define amdgpu_kernel void @scratch_buffer_known_high_bit_small() #0 {
+  %alloca = alloca i32, align 4
+  store volatile i32 0, i32* %alloca
+  %toint = ptrtoint i32* %alloca to i32
+  %masked = and i32 %toint, 2147483647
+  store volatile i32 %masked, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}scratch_buffer_known_high_bit_huge:
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN-DAG: buffer_store_dword
+; GCN-DAG: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x7ffffffc, [[FI]]
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
+define amdgpu_kernel void @scratch_buffer_known_high_bit_huge() #1 {
+  %alloca = alloca i32, align 4
+  store volatile i32 0, i32* %alloca
+  %toint = ptrtoint i32* %alloca to i32
+  %masked = and i32 %toint, 2147483647
+  store volatile i32 %masked, i32 addrspace(1)* undef
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "target-features"="+huge-private-buffer" }
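The new test opts in per function through the "target-features" attribute, which keeps both behaviors (mask folded away by default, mask preserved with the feature) covered in one file. For reference, the feature can also be enabled for a whole compilation with the same -mattr mechanism used in the RUN lines elsewhere in this patch, e.g. a hypothetical invocation of the form:

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=+huge-private-buffer -verify-machineinstrs < %s | FileCheck %s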
Index: test/CodeGen/AMDGPU/scratch-buffer.ll
===================================================================
--- test/CodeGen/AMDGPU/scratch-buffer.ll
+++ test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; When a frame index offset is more than 12-bits, make sure we don't store
 ; it in mubuf's offset field.
@@ -86,8 +86,20 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}neg_vaddr_offset:
+; GCN-LABEL: {{^}}neg_vaddr_offset_inbounds:
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16{{$}}
+define amdgpu_kernel void @neg_vaddr_offset_inbounds(i32 %offset) {
+entry:
+  %array = alloca [8192 x i32]
+  %ptr_offset = add i32 %offset, 4
+  %ptr = getelementptr inbounds [8192 x i32], [8192 x i32]* %array, i32 0, i32 %ptr_offset
+  store i32 0, i32* %ptr
+  ret void
+}
+
+; GCN-LABEL: {{^}}neg_vaddr_offset:
+; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 16, v{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}}
 define amdgpu_kernel void @neg_vaddr_offset(i32 %offset) {
 entry:
   %array = alloca [8192 x i32]