Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -346,6 +346,13 @@ def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; +def FeatureEnableHugePrivateBuffer : SubtargetFeature< + "huge-private-buffer", + "EnableHugePrivateBuffer", + "true", + "Enable private/scratch buffer sizes greater than 128 GB" +>; + def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", "EnableVGPRSpilling", "true", Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1160,8 +1160,13 @@ SDValue N1 = Addr.getOperand(1); // Offsets in vaddr must be positive. + // + // The total computation of vaddr + soffset + offset must not overflow. + // If vaddr is negative, even if offset is 0 the sgpr offset add will end up + // overflowing. ConstantSDNode *C1 = cast(N1); - if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) { + if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) && + CurDAG->SignBitIsZero(N0)) { std::tie(VAddr, SOffset) = foldFrameIndex(N0); ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); return true; Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -130,6 +130,7 @@ bool DebuggerEmitPrologue; // Used as options. + bool EnableHugePrivateBuffer; bool EnableVGPRSpilling; bool EnablePromoteAlloca; bool EnableLoadStoreOpt; @@ -351,6 +352,10 @@ return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; } + bool enableHugePrivateBuffer() const { + return EnableHugePrivateBuffer; + } + bool isPromoteAllocaEnabled() const { return EnablePromoteAlloca; } Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -121,6 +121,7 @@ DebuggerReserveRegs(false), DebuggerEmitPrologue(false), + EnableHugePrivateBuffer(false), EnableVGPRSpilling(false), EnablePromoteAlloca(false), EnableLoadStoreOpt(false), Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -277,6 +277,12 @@ SDValue V) const; void finalizeLowering(MachineFunction &MF) const override; + + void computeKnownBitsForFrameIndex(const SDValue Op, + KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; }; } // End namespace llvm Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -94,6 +94,12 @@ cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false)); +static cl::opt AssumeFrameIndexHighZeroBits( + "amdgpu-frame-index-zero-bits", + cl::desc("High bits of frame index assumed to be zero"), + cl::init(5), + cl::ReallyHidden); + static unsigned findFirstFreeSGPR(CCState &CCInfo) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { @@ -1600,6 +1606,17 @@ Reg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); + if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) { + // The return object should be reasonably addressable. + + // FIXME: This helps when the return is a real sret. If it is a + // automatically inserted sret (i.e. CanLowerReturn returns false), an + // extra copy is inserted in SelectionDAGBuilder which obscures this. + unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits; + Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, + DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits))); + } + // If this is an 8 or 16-bit value, it is really passed promoted // to 32 bits. Insert an assert[sz]ext to capture this, then // truncate to the right size. @@ -3216,7 +3233,6 @@ return lowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::FP_ROUND: return lowerFP_ROUND(Op, DAG); - case ISD::TRAP: case ISD::DEBUGTRAP: return lowerTRAP(Op, DAG); @@ -6997,3 +7013,21 @@ TargetLoweringBase::finalizeLowering(MF); } + +void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, + KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth) const { + TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts, + DAG, Depth); + + if (getSubtarget()->enableHugePrivateBuffer()) + return; + + // Technically it may be possible to have a dispatch with a single workitem + // that uses the full private memory size, but that's not really useful. We + // can't use vaddr in MUBUF instructions if we don't know the address + // calculation won't overflow, so assume the sign bit is never set. + Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits); +} Index: test/CodeGen/AMDGPU/amdgpu.private-memory.ll =================================================================== --- test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -1,12 +1,12 @@ -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s ; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s -; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=HSAOPT -check-prefix=OPT %s -; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=NOHSAOPT -check-prefix=OPT %s +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -enable-var-scope -check-prefix=HSAOPT -check-prefix=OPT %s +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -enable-var-scope -check-prefix=NOHSAOPT -check-prefix=OPT %s ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC @@ -391,7 +391,8 @@ ; FUNC-LABEL: ptrtoint: ; SI-NOT: ds_write ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ; +; SI: v_add_i32_e32 [[ADD_OFFSET:v[0-9]+]], vcc, 5, +; SI: buffer_load_dword v{{[0-9]+}}, [[ADD_OFFSET:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32] %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a Index: test/CodeGen/AMDGPU/array-ptr-calc-i32.ll =================================================================== --- test/CodeGen/AMDGPU/array-ptr-calc-i32.ll +++ test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -30,14 +30,14 @@ %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo) %a_ptr = getelementptr inbounds i32, i32 addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr inbounds i32, i32 addrspace(1)* %inB, i32 %tid - %a = load i32, i32 addrspace(1)* %a_ptr - %b = load i32, i32 addrspace(1)* %b_ptr + %a = load i32, i32 addrspace(1)* %a_ptr, !range !0 + %b = load i32, i32 addrspace(1)* %b_ptr, !range !0 %result = add i32 %a, %b %alloca_ptr = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 1, i32 %b store i32 %result, i32* %alloca_ptr, align 4 ; Dummy call call void @llvm.amdgcn.s.barrier() - %reload = load i32, i32* %alloca_ptr, align 4 + %reload = load i32, i32* %alloca_ptr, align 4, !range !0 %out_ptr = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid store i32 %reload, i32 addrspace(1)* %out_ptr, align 4 ret void @@ -46,3 +46,5 @@ attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind convergent } + +!0 = !{i32 0, i32 65536 } Index: test/CodeGen/AMDGPU/frame-index-elimination.ll =================================================================== --- test/CodeGen/AMDGPU/frame-index-elimination.ll +++ test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -101,6 +101,9 @@ ret void } +; FIXME: Should be able to see that this can use vaddr, but the +; FrameIndex is hidden behind a CopyFromReg in the second block. + ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block: ; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s5, s4 ; GCN: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 @@ -108,7 +111,7 @@ ; GCN: s_and_saveexec_b64 ; GCN: v_add_i32_e32 v0, vcc, 4, [[ADD]] -; GCN: buffer_load_dword v1, v1, s[0:3], s4 offen offset:4 +; GCN: buffer_load_dword v1, v0, s[0:3], s4 offen{{$}} ; GCN: ds_write_b32 define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 }* byval %arg0, i32 %arg2) #0 { %cmp = icmp eq i32 %arg2, 0 @@ -195,4 +198,23 @@ ret void } +; GCN-LABEL: {{^}}alloca_ptr_nonentry_block: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s5 offset:12 +define void @alloca_ptr_nonentry_block(i32 %arg0) #0 { + %alloca0 = alloca { i8, i32 }, align 4 + %cmp = icmp eq i32 %arg0, 0 + br i1 %cmp, label %bb, label %ret + +bb: + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %alloca0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %alloca0, i32 0, i32 1 + %load1 = load volatile i32, i32* %gep1 + store volatile i32* %gep1, i32* addrspace(3)* undef + br label %ret + +ret: + ret void +} + attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/function-returns.ll =================================================================== --- test/CodeGen/AMDGPU/function-returns.ll +++ test/CodeGen/AMDGPU/function-returns.ll @@ -385,40 +385,116 @@ ret void } +; FIXME: Should be able to fold offsets in all of these. Call lowering +; introduces an extra CopyToReg/CopyFromReg obscuring the AssertZext +; inserted. Not using it introduces the spills. + ; GCN-LABEL: {{^}}v33i32_func_void: +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill +; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill +; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill + ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_4:v[0-9]+]], vcc, 4, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_4]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_8:v[0-9]+]], vcc, 8, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_8]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_12:v[0-9]+]], vcc, 12, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_12]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_16:v[0-9]+]], vcc, 16, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_16]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_20:v[0-9]+]], vcc, 20, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_20]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_24:v[0-9]+]], vcc, 24, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_24]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_28:v[0-9]+]], vcc, 28, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_28]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_32:v[0-9]+]], vcc, 32, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_32]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_36:v[0-9]+]], vcc, 36, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_36]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_40:v[0-9]+]], vcc, 40, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_40]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_44:v[0-9]+]], vcc, 44, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_44]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_48:v[0-9]+]], vcc, 48, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_48]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_52:v[0-9]+]], vcc, 52, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_52]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_56:v[0-9]+]], vcc, 56, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_56]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_60:v[0-9]+]], vcc, 60, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_60]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_64:v[0-9]+]], vcc, 64, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_64]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_68:v[0-9]+]], vcc, 0x44, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_68]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_72:v[0-9]+]], vcc, 0x48, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_72]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_76:v[0-9]+]], vcc, 0x4c, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_76]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_80:v[0-9]+]], vcc, 0x50, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_80]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_84:v[0-9]+]], vcc, 0x54, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_84]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_88:v[0-9]+]], vcc, 0x58, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_88]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_92:v[0-9]+]], vcc, 0x5c, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_92]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_96:v[0-9]+]], vcc, 0x60, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_96]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_100:v[0-9]+]], vcc, 0x64, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_100]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_104:v[0-9]+]], vcc, 0x68, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_104]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_108:v[0-9]+]], vcc, 0x6c, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_108]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_112:v[0-9]+]], vcc, 0x70, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_112]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_116:v[0-9]+]], vcc, 0x74, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_116]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_120:v[0-9]+]], vcc, 0x78, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_120]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_124:v[0-9]+]], vcc, 0x7c, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_124]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_128:v[0-9]+]], vcc, 0x80, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_128]], s[0:3], s4 offen{{$}} + +; GCN: buffer_load_dword v34 +; GCN: buffer_load_dword v33 +; GCN: buffer_load_dword v32 ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <33 x i32> @v33i32_func_void() #0 { @@ -428,39 +504,111 @@ } ; GCN-LABEL: {{^}}struct_v32i32_i32_func_void: +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill +; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill +; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill + ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_4:v[0-9]+]], vcc, 4, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_4]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_8:v[0-9]+]], vcc, 8, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_8]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_12:v[0-9]+]], vcc, 12, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_12]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_16:v[0-9]+]], vcc, 16, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_16]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_20:v[0-9]+]], vcc, 20, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_20]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_24:v[0-9]+]], vcc, 24, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_24]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_28:v[0-9]+]], vcc, 28, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_28]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_32:v[0-9]+]], vcc, 32, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_32]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_36:v[0-9]+]], vcc, 36, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_36]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_40:v[0-9]+]], vcc, 40, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_40]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_44:v[0-9]+]], vcc, 44, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_44]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_48:v[0-9]+]], vcc, 48, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_48]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_52:v[0-9]+]], vcc, 52, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_52]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_56:v[0-9]+]], vcc, 56, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_56]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_60:v[0-9]+]], vcc, 60, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_60]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_64:v[0-9]+]], vcc, 64, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_64]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_68:v[0-9]+]], vcc, 0x44, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_68]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_72:v[0-9]+]], vcc, 0x48, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_72]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_76:v[0-9]+]], vcc, 0x4c, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_76]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_80:v[0-9]+]], vcc, 0x50, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_80]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_84:v[0-9]+]], vcc, 0x54, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_84]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_88:v[0-9]+]], vcc, 0x58, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_88]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_92:v[0-9]+]], vcc, 0x5c, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_92]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_96:v[0-9]+]], vcc, 0x60, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_96]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_100:v[0-9]+]], vcc, 0x64, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_100]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_104:v[0-9]+]], vcc, 0x68, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_104]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_108:v[0-9]+]], vcc, 0x6c, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_108]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_112:v[0-9]+]], vcc, 0x70, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_112]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_116:v[0-9]+]], vcc, 0x74, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_116]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_120:v[0-9]+]], vcc, 0x78, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_120]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_124:v[0-9]+]], vcc, 0x7c, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_124]], s[0:3], s4 offen{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_128:v[0-9]+]], vcc, 0x80, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_128]], s[0:3], s4 offen{{$}} + +; GCN: buffer_load_dword v34 +; GCN: buffer_load_dword v33 +; GCN: buffer_load_dword v32 ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { @@ -470,39 +618,20 @@ } ; GCN-LABEL: {{^}}struct_i32_v32i32_func_void: +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill +; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill + ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:132{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:136{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:140{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:144{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:148{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:152{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:156{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:160{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:164{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:168{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:172{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:176{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:180{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:184{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:188{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:192{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:196{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:200{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:204{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:208{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:212{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:216{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:220{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:224{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:228{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:232{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:236{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:240{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:244{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:248{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:252{{$}} + +; GCN-DAG: v_add_i32_e32 [[ADD_128:v[0-9]+]], vcc, 0x80, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_128]], s[0:3], s4 offen{{$}} + + +; GCN-DAG: v_add_i32_e32 [[ADD_256:v[0-9]+]], vcc, 0xfc, v0 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_256]], s[0:3], s4 offen{{$}} + +; GCN: buffer_load_dword v33 +; GCN: buffer_load_dword v32 ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { Index: test/CodeGen/AMDGPU/huge-private-buffer.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/huge-private-buffer.ll @@ -0,0 +1,31 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s + +; GCN-LABEL: {{^}}scratch_buffer_known_high_bit_small: +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 +; GCN-NOT: [[FI]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]] +define amdgpu_kernel void @scratch_buffer_known_high_bit_small() #0 { + %alloca = alloca i32, align 4 + store volatile i32 0, i32* %alloca + %toint = ptrtoint i32* %alloca to i32 + %masked = and i32 %toint, 2147483647 + store volatile i32 %masked, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}scratch_buffer_known_high_bit_huge: +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 +; GCN-DAG: buffer_store_dword +; GCN-DAG: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x7ffffffc, [[FI]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] +define amdgpu_kernel void @scratch_buffer_known_high_bit_huge() #1 { + %alloca = alloca i32, align 4 + store volatile i32 0, i32* %alloca + %toint = ptrtoint i32* %alloca to i32 + %masked = and i32 %toint, 2147483647 + store volatile i32 %masked, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "target-features"="+huge-private-buffer" } Index: test/CodeGen/AMDGPU/load-hi16.ll =================================================================== --- test/CodeGen/AMDGPU/load-hi16.ll +++ test/CodeGen/AMDGPU/load-hi16.ll @@ -295,16 +295,16 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen offset:4094{{$}} +; GFX9: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}} ; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 -; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}} -define void @load_private_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 { +; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +define void @load_private_hi_v2i16_reglo_vreg(i16* byval %in, i16 %reg) #0 { entry: - %gep = getelementptr inbounds i16, i16* %in, i64 2047 + %gep = getelementptr inbounds i16, i16* %in, i64 2045 %load = load i16, i16* %gep %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 @@ -314,16 +314,16 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen offset:4094{{$}} +; GFX9: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}} ; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 -; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}} -define void @load_private_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 { +; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +define void @load_private_hi_v2f16_reglo_vreg(half* byval %in, half %reg) #0 { entry: - %gep = getelementptr inbounds half, half* %in, i64 2047 + %gep = getelementptr inbounds half, half* %in, i64 2045 %load = load half, half* %gep %build0 = insertelement <2 x half> undef, half %reg, i32 0 %build1 = insertelement <2 x half> %build0, half %load, i32 1 @@ -333,14 +333,14 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s4 offset:4094{{$}} -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9: buffer_load_short_d16_hi v0, off, s[0:3], s4 offset:4094{{$}} +; GFX9: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} -define void @load_private_hi_v2i16_reglo_vreg_nooff(i16* %in, i16 %reg) #0 { +define void @load_private_hi_v2i16_reglo_vreg_nooff(i16* byval %in, i16 %reg) #0 { entry: %load = load volatile i16, i16* inttoptr (i32 4094 to i16*) %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 @@ -369,16 +369,16 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_ubyte_d16_hi v1, v0, s[0:3], s4 offen offset:2047{{$}} +; GFX9: buffer_load_ubyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}} ; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 -; VI: buffer_load_ubyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}} -define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 { +; VI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8* byval %in, i16 %reg) #0 { entry: - %gep = getelementptr inbounds i8, i8* %in, i64 2047 + %gep = getelementptr inbounds i8, i8* %in, i64 4091 %load = load i8, i8* %gep %ext = zext i8 %load to i16 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 @@ -389,16 +389,16 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_sbyte_d16_hi v1, v0, s[0:3], s4 offen offset:2047{{$}} +; GFX9: buffer_load_sbyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}} ; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 -; VI: buffer_load_sbyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}} -define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 { +; VI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8* byval %in, i16 %reg) #0 { entry: - %gep = getelementptr inbounds i8, i8* %in, i64 2047 + %gep = getelementptr inbounds i8, i8* %in, i64 4091 %load = load i8, i8* %gep %ext = sext i8 %load to i16 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 @@ -641,13 +641,12 @@ ; FIXME: Is there a cost to using the extload over not? ; GCN-LABEL: {{^}}load_private_v2i16_split: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], s4 offen{{$}} +; GFX9: buffer_load_ushort v0, off, s[0:3], s5 offset:4{{$}} ; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen offset:2 +; GFX9-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:6 ; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_setpc_b64 -define <2 x i16> @load_private_v2i16_split(i16* %in) #0 { +define <2 x i16> @load_private_v2i16_split(i16* byval %in) #0 { entry: %gep = getelementptr inbounds i16, i16* %in, i32 1 %load0 = load volatile i16, i16* %in Index: test/CodeGen/AMDGPU/load-lo16.ll =================================================================== --- test/CodeGen/AMDGPU/load-lo16.ll +++ test/CodeGen/AMDGPU/load-lo16.ll @@ -340,17 +340,17 @@ ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:4094{{$}} +; GFX9: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094{{$}} ; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 -; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}} -define void @load_private_lo_v2i16_reglo_vreg(i16* %in, i32 %reg) #0 { +; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +define void @load_private_lo_v2i16_reglo_vreg(i16* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %gep = getelementptr inbounds i16, i16* %in, i64 2047 + %gep = getelementptr inbounds i16, i16* %in, i64 2045 %load = load i16, i16* %gep %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -359,7 +359,7 @@ ; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_ushort v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen offset:4094{{$}} +; GFX9: buffer_load_ushort v1, off, s[0:3], s5 offset:4094{{$}} ; GFX9-NEXT: s_waitcnt ; GFX9: v_and_b32 ; GFX9: v_lshl_or_b32 @@ -368,10 +368,10 @@ ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 -; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}} -define void @load_private_lo_v2i16_reghi_vreg(i16* %in, i16 %reg) #0 { +; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +define void @load_private_lo_v2i16_reghi_vreg(i16* byval %in, i16 %reg) #0 { entry: - %gep = getelementptr inbounds i16, i16* %in, i64 2047 + %gep = getelementptr inbounds i16, i16* %in, i64 2045 %load = load i16, i16* %gep %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0 @@ -381,17 +381,17 @@ ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:4094{{$}} +; GFX9: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094{{$}} ; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 -; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}} -define void @load_private_lo_v2f16_reglo_vreg(half* %in, i32 %reg) #0 { +; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +define void @load_private_lo_v2f16_reglo_vreg(half* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> - %gep = getelementptr inbounds half, half* %in, i64 2047 + %gep = getelementptr inbounds half, half* %in, i64 2045 %load = load half, half* %gep %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0 store <2 x half> %build1, <2 x half> addrspace(1)* undef @@ -454,17 +454,17 @@ ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_ubyte_d16 v1, v0, s[0:3], s4 offen offset:2047{{$}} +; GFX9: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095{{$}} ; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 -; VI: buffer_load_ubyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}} -define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 { +; VI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %gep = getelementptr inbounds i8, i8* %in, i64 2047 + %gep = getelementptr inbounds i8, i8* %in, i64 4091 %load = load i8, i8* %gep %ext = zext i8 %load to i16 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0 @@ -474,17 +474,17 @@ ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_load_sbyte_d16 v1, v0, s[0:3], s4 offen offset:2047{{$}} +; GFX9: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095{{$}} ; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 -; VI: buffer_load_sbyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}} -define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 { +; VI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %gep = getelementptr inbounds i8, i8* %in, i64 2047 + %gep = getelementptr inbounds i8, i8* %in, i64 4091 %load = load i8, i8* %gep %ext = sext i8 %load to i16 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0 Index: test/CodeGen/AMDGPU/scratch-buffer.ll =================================================================== --- test/CodeGen/AMDGPU/scratch-buffer.ll +++ test/CodeGen/AMDGPU/scratch-buffer.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; When a frame index offset is more than 12-bits, make sure we don't store ; it in mubuf's offset field. @@ -86,8 +86,21 @@ ret void } +; GCN-LABEL: {{^}}neg_vaddr_offset_inbounds: +; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 16, v{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}} +define amdgpu_kernel void @neg_vaddr_offset_inbounds(i32 %offset) { +entry: + %array = alloca [8192 x i32] + %ptr_offset = add i32 %offset, 4 + %ptr = getelementptr inbounds [8192 x i32], [8192 x i32]* %array, i32 0, i32 %ptr_offset + store i32 0, i32* %ptr + ret void +} + ; GCN-LABEL: {{^}}neg_vaddr_offset: -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16{{$}} +; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 16, v{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}} define amdgpu_kernel void @neg_vaddr_offset(i32 %offset) { entry: %array = alloca [8192 x i32] Index: test/CodeGen/AMDGPU/store-hi16.ll =================================================================== --- test/CodeGen/AMDGPU/store-hi16.ll +++ test/CodeGen/AMDGPU/store-hi16.ll @@ -440,18 +440,18 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_max_offset: ; GCN: s_waitcnt -; GFX9-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen offset:4094{{$}} +; GFX9: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}} -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: buffer_store_short v1, v0, s[0:3], s4 offen offset:4094{{$}} +; VI: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], s5 offset:4094{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 -define void @store_private_hi_v2i16_max_offset(i16* %out, i32 %arg) #0 { +define void @store_private_hi_v2i16_max_offset(i16* byval %out, i32 %arg) #0 { entry: %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 - %gep = getelementptr inbounds i16, i16* %out, i64 2047 + %gep = getelementptr inbounds i16, i16* %out, i64 2045 store i16 %hi, i16* %gep ret void }