Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -21,19 +21,8 @@
 static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
                               const MachineFrameInfo *FrameInfo) {
-  if (!FuncInfo->hasSpilledSGPRs())
-    return false;
-
-  if (FuncInfo->hasSpilledVGPRs())
-    return false;
-
-  for (int I = FrameInfo->getObjectIndexBegin(),
-         E = FrameInfo->getObjectIndexEnd(); I != E; ++I) {
-    if (!FrameInfo->isSpillSlotObjectIndex(I))
-      return false;
-  }
-
-  return true;
+  return FuncInfo->hasSpilledSGPRs() &&
+    (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects());
 }
 
 static ArrayRef<MCPhysReg> getAllSGPR128() {
@@ -67,6 +56,8 @@
     static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
   const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineBasicBlock::iterator I = MBB.begin();
 
   // We need to insert initialization of the scratch resource descriptor.
   unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
@@ -84,6 +75,44 @@
       MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
   }
 
+  if (MFI->hasFlatScratchInit() /*&& MFI->hasFlatInstructions()*/) {
+    // We don't need this if we only have spills since there is no user facing
+    // scratch.
+
+    // TODO: If we know we don't have flat instructions earlier, we can omit
+    // this from the input registers.
+    //
+    // TODO: We only need to know if we access scratch space through a flat
+    // pointer. Because we only detect if flat instructions are used at all,
+    // this will be used more often than necessary on VI.
+
+    DebugLoc DL;
+
+    unsigned FlatScratchInitReg
+      = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
+
+    MRI.addLiveIn(FlatScratchInitReg);
+    MBB.addLiveIn(FlatScratchInitReg);
+
+    // Copy the size in bytes.
+    unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::FLAT_SCR_LO)
+      .addReg(FlatScrInitHi, RegState::Kill);
+
+    unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+
+    // Add wave offset in bytes to private base offset.
+    // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
+      .addReg(FlatScrInitLo)
+      .addReg(ScratchWaveOffsetReg);
+
+    // Convert offset to 256-byte units.
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
+      .addReg(FlatScrInitLo, RegState::Kill)
+      .addImm(8);
+  }
+
   // If we reserved the original input registers, we don't need to copy to the
   // reserved registers.
   if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
@@ -96,7 +125,6 @@
 
   // We added live-ins during argument lowering, but since they were not used
   // they were deleted. We're adding the uses now, so add them back.
-  MachineRegisterInfo &MRI = MF.getRegInfo();
   MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
   MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
 
@@ -160,7 +188,6 @@
   assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
 
   const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
-  MachineBasicBlock::iterator I = MBB.begin();
   DebugLoc DL;
 
   if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
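The prologue block added above is the functional core of the patch: flat_scratch is now derived from the preloaded FLAT_SCRATCH_INIT SGPR pair plus the per-wave scratch offset, rather than from compile-time constants. The following is a minimal standalone C++ model of the value the three emitted instructions compute, assuming the sub0/sub1 roles described in the comments above (base offset in sub0, per-thread scratch size in bytes in sub1); it is an illustration written for this note, not code from the patch.

#include <cassert>
#include <cstdint>

// Models the new prologue sequence:
//   s_mov_b32  flat_scratch_lo, <sub1>        ; per-thread size in bytes
//   s_add_u32  <sub0>, <sub0>, <wave offset>  ; byte offset of this wave
//   s_lshr_b32 flat_scratch_hi, <sub0>, 8     ; convert to 256-byte units
struct FlatScratch {
  uint32_t Lo; // per-thread scratch size in bytes
  uint32_t Hi; // scratch base offset in 256-byte units
};

FlatScratch computeFlatScratch(uint32_t BaseOffset,  // FLAT_SCRATCH_INIT sub0
                               uint32_t SizeInBytes, // FLAT_SCRATCH_INIT sub1
                               uint32_t WaveOffset) {
  FlatScratch FS;
  FS.Lo = SizeInBytes;
  FS.Hi = (BaseOffset + WaveOffset) >> 8;
  return FS;
}

int main() {
  // 1 KiB per-thread scratch, wave starting 512 bytes into the segment:
  FlatScratch FS = computeFlatScratch(0, 1024, 512);
  assert(FS.Lo == 1024 && FS.Hi == 2); // 512 bytes = 2 * 256-byte units
  return 0;
}

This corresponds to the instruction sequence the updated private-memory.ll test checks further below: s_mov_b32 flat_scratch_lo, s7; s_add_u32 s6, s6, s9; s_lshr_b32 flat_scratch_hi, s6, 8.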
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -695,6 +695,12 @@
     CCInfo.AllocateReg(InputPtrReg);
   }
 
+  if (Info->hasFlatScratchInit()) {
+    unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
+    CCInfo.AllocateReg(FlatScratchInitReg);
+  }
+
   AnalyzeFormalArguments(CCInfo, Splits);
 
   SmallVector<SDValue, 16> Chains;
@@ -822,8 +828,11 @@
 
   // Now that we've figured out where the scratch register inputs are, see if
   // should reserve the arguments and use them directly.
-
   bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
+  // Record that we know we have non-spill stack objects so we don't need to
+  // check all stack objects later.
+  if (HasStackObjects)
+    Info->setHasNonSpillStackObjects(true);
 
   if (ST.isAmdHsaOS()) {
     // TODO: Assume we will spill without optimizations.
Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -564,43 +564,11 @@
               AMDGPU::EXEC).addReg(AMDGPU::EXEC);
   }
 
-  // FIXME: This seems inappropriate to do here.
   if (NeedFlat && MFI->IsKernel) {
-    // Insert the prologue initializing the SGPRs pointing to the scratch space
-    // for flat accesses.
-    const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
-
     // TODO: What to use with function calls?
-
-    // FIXME: This is reporting stack size that is used in a scratch buffer
-    // rather than registers as well.
-    uint64_t StackSizeBytes = FrameInfo->getStackSize();
-
-    int IndirectBegin
-      = static_cast<const AMDGPUInstrInfo *>(TII)->getIndirectIndexBegin(MF);
-    // Convert register index to 256-byte unit.
-    uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
-
-    assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
-           "Stack limits should be smaller than 16-bits");
-
-    // Initialize the flat scratch register pair.
-    // TODO: Can we use one s_mov_b64 here?
-
-    // Offset is in units of 256-bytes.
-    MachineBasicBlock &MBB = MF.front();
-    DebugLoc NoDL;
-    MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
-    const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);
-
-    assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));
-
-    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
-      .addImm(StackOffset);
-
-    // Documentation says size is "per-thread scratch size in bytes"
-    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
-      .addImm(StackSizeBytes);
+    // We will need to initialize the flat scratch register pair.
+    if (NeedFlat)
+      MFI->setHasFlatInstructions(true);
   }
 
   return true;
 }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -73,6 +73,8 @@
 private:
   bool HasSpilledSGPRs;
   bool HasSpilledVGPRs;
+  bool HasNonSpillStackObjects;
+  bool HasFlatInstructions;
 
   // Feature bits required for inputs passed in user SGPRs.
   bool PrivateSegmentBuffer : 1;
@@ -129,6 +131,7 @@
   unsigned addDispatchPtr(const SIRegisterInfo &TRI);
   unsigned addQueuePtr(const SIRegisterInfo &TRI);
   unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
+  unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
 
   // Add system SGPRs.
   unsigned addWorkGroupIDX() {
@@ -277,6 +280,22 @@
     HasSpilledVGPRs = Spill;
   }
 
+  bool hasNonSpillStackObjects() const {
+    return HasNonSpillStackObjects;
+  }
+
+  void setHasNonSpillStackObjects(bool StackObject = true) {
+    HasNonSpillStackObjects = StackObject;
+  }
+
+  bool hasFlatInstructions() const {
+    return HasFlatInstructions;
+  }
+
+  void setHasFlatInstructions(bool UseFlat = true) {
+    HasFlatInstructions = UseFlat;
+  }
+
   unsigned getPSInputAddr() const {
     return PSInputAddr;
   }
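As a usage note: addFlatScratchInit follows the same pattern as the existing addDispatchPtr/addQueuePtr/addKernargSegmentPtr helpers, reserving an aligned SGPR pair and bumping NumUserSGPRs by 2. The fragment below restates the SIISelLowering.cpp hunk from earlier in this patch to show how the new input is wired up during argument lowering; it is the patch's own code repeated for reference, not an addition.

if (Info->hasFlatScratchInit()) {
  unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
  CCInfo.AllocateReg(FlatScratchInitReg);
}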
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -54,6 +54,8 @@
     NumSystemSGPRs(0),
     HasSpilledSGPRs(false),
     HasSpilledVGPRs(false),
+    HasNonSpillStackObjects(false),
+    HasFlatInstructions(false),
     PrivateSegmentBuffer(false),
     DispatchPtr(false),
     QueuePtr(false),
@@ -93,6 +95,11 @@
   if (F->hasFnAttribute("amdgpu-work-item-id-z"))
     WorkItemIDZ = true;
 
+  // X, XY, and XYZ are the only supported combinations, so make sure Y is
+  // enabled if Z is.
+  if (WorkItemIDZ)
+    WorkItemIDY = true;
+
   bool MaySpill = ST.isVGPRSpillingEnabled(this);
   bool HasStackObjects = FrameInfo->hasStackObjects();
 
@@ -107,10 +114,12 @@
       DispatchPtr = true;
   }
 
-  // X, XY, and XYZ are the only supported combinations, so make sure Y is
-  // enabled if Z is.
-  if (WorkItemIDZ)
-    WorkItemIDY = true;
+  // We don't need to worry about accessing spills with flat instructions.
+  // TODO: On VI where we must use flat for global, we should be able to omit
+  // this if it is never used for generic access.
+  if (HasStackObjects && ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&
+      ST.isAmdHsaOS())
+    FlatScratchInit = true;
 }
 
 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -142,6 +151,13 @@
   return KernargSegmentPtrUserSGPR;
 }
 
+unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
+  FlatScratchInitUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return FlatScratchInitUserSGPR;
+}
+
 SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
   MachineFunction *MF,
   unsigned FrameIndex,
Index: lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.h
+++ lib/Target/AMDGPU/SIRegisterInfo.h
@@ -117,10 +117,12 @@
   enum PreloadedValue {
     // SGPRS:
-    PRIVATE_SEGMENT_BUFFER = 0,
+    PRIVATE_SEGMENT_BUFFER =  0,
     DISPATCH_PTR        =  1,
     QUEUE_PTR           =  2,
     KERNARG_SEGMENT_PTR =  3,
+    DISPATCH_ID         =  4,
+    FLAT_SCRATCH_INIT   =  5,
     WORKGROUP_ID_X      = 10,
     WORKGROUP_ID_Y      = 11,
     WORKGROUP_ID_Z      = 12,
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -624,6 +624,11 @@
   case SIRegisterInfo::KERNARG_SEGMENT_PTR:
     assert(MFI->hasKernargSegmentPtr());
     return MFI->KernargSegmentPtrUserSGPR;
+  case SIRegisterInfo::DISPATCH_ID:
+    llvm_unreachable("unimplemented");
+  case SIRegisterInfo::FLAT_SCRATCH_INIT:
+    assert(MFI->hasFlatScratchInit());
+    return MFI->FlatScratchInitUserSGPR;
   case SIRegisterInfo::DISPATCH_PTR:
     assert(MFI->hasDispatchPtr());
     return MFI->DispatchPtrUserSGPR;
Index: test/CodeGen/AMDGPU/cgp-addressing-modes.ll
===================================================================
--- test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -190,8 +190,6 @@
 }
 
 ; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
-; VI-DAG: s_movk_i32 flat_scratch_lo, 0x0
-; VI-DAG: s_movk_i32 flat_scratch_hi, 0x0
 ; GCN: s_and_saveexec_b64
 ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
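Two of the test updates that follow exercise the new FlatScratchInit predicate from opposite sides: ci-use-flat-for-global.ll (next) has no stack objects, so the input stays disabled; large-alloca-compute.ll (further below) has a large alloca, so it is enabled. Here is a standalone restatement of the condition, runnable in isolation; the Generation enumerators are illustrative placeholders written for this note, not LLVM's actual enum.

#include <cassert>

enum Generation { SOUTHERN_ISLANDS, SEA_ISLANDS, VOLCANIC_ISLANDS };

// Condition added in SIMachineFunctionInfo.cpp above: request the flat
// scratch init input only for HSA targets with flat support (CI and later)
// that actually have stack objects.
bool needsFlatScratchInit(bool HasStackObjects, Generation Gen,
                          bool IsAmdHsaOS) {
  return HasStackObjects && Gen >= SEA_ISLANDS && IsAmdHsaOS;
}

int main() {
  assert(!needsFlatScratchInit(false, SEA_ISLANDS, true)); // ci-use-flat-for-global.ll
  assert(needsFlatScratchInit(true, SEA_ISLANDS, true));   // large-alloca-compute.ll
  assert(!needsFlatScratchInit(true, SEA_ISLANDS, false)); // non-HSA: unchanged
  return 0;
}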
Index: test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
===================================================================
--- test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
+++ test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
@@ -1,11 +1,22 @@
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefix=HSA-DEFAULT %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck -check-prefix=HSA-NODEFAULT %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri | FileCheck -check-prefix=NOHSA-DEFAULT %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -mattr=+flat-for-global | FileCheck -check-prefix=NOHSA-NODEFAULT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=NOHSA-DEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=NOHSA-NODEFAULT -check-prefix=ALL %s
+
+; There are no stack objects even though flat is used by default, so
+; flat_scratch_init should be disabled.
+
+; ALL-LABEL: {{^}}test:
+; HSA: .amd_kernel_code_t
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: .end_amd_kernel_code_t
+
+; ALL-NOT: flat_scr
 
 ; HSA-DEFAULT: flat_store_dword
 ; HSA-NODEFAULT: buffer_store_dword
+
 ; NOHSA-DEFAULT: buffer_store_dword
 ; NOHSA-NODEFAULT: flat_store_dword
 define void @test(i32 addrspace(1)* %out) {
Index: test/CodeGen/AMDGPU/global_atomics.ll
===================================================================
--- test/CodeGen/AMDGPU/global_atomics.ll
+++ test/CodeGen/AMDGPU/global_atomics.ll
@@ -24,8 +24,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset:
 ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
@@ -38,8 +36,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
 ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -71,8 +67,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_add_i32_addr64:
 ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -83,8 +77,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64:
 ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -117,8 +109,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset:
 ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -130,8 +120,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
 ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -163,8 +151,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_and_i32_addr64:
 ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -175,8 +161,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64:
 ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -209,8 +193,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset:
 ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -222,8 +204,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
 ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -255,8 +235,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64:
 ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -267,8 +245,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64:
 ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -301,8 +277,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset:
 ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -314,8 +288,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
 ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -347,8 +319,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_max_i32_addr64:
 ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -359,8 +329,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64:
 ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -393,8 +361,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset:
 ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -406,8 +372,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
 ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -439,8 +403,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64:
 ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -451,8 +413,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64:
 ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -485,8 +445,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset:
 ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -498,8 +456,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
 ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -531,8 +487,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_min_i32_addr64:
 ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -543,8 +497,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64:
 ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -577,8 +529,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset:
 ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -590,8 +540,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
 ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -623,8 +571,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64:
 ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -635,8 +581,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64:
 ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -669,8 +613,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset:
 ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -682,8 +624,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
 ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -715,8 +655,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_or_i32_addr64:
 ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -727,8 +665,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64:
 ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -771,8 +707,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
 ; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -804,8 +738,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64:
 ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -816,8 +748,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64:
 ; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -850,8 +780,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset:
 ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -863,8 +791,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
 ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -896,8 +822,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64:
 ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -908,8 +832,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64:
 ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
Index: test/CodeGen/AMDGPU/hsa.ll
===================================================================
--- test/CodeGen/AMDGPU/hsa.ll
+++ test/CodeGen/AMDGPU/hsa.ll
@@ -28,7 +28,7 @@
 
 ; ELF: Symbol {
 ; ELF: Name: simple
-; ELF: Size: 296
+; ELF: Size: 288
 ; ELF: Type: AMDGPU_HSA_KERNEL (0xA)
 ; ELF: }
Index: test/CodeGen/AMDGPU/large-alloca-compute.ll
===================================================================
--- test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -17,7 +17,7 @@
 
 ; GCNHSA: .amd_kernel_code_t
 ; GCNHSA: compute_pgm_rsrc2_scratch_en = 1
-; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6
+; GCNHSA: compute_pgm_rsrc2_user_sgpr = 8
 ; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1
 ; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0
 ; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0
@@ -29,7 +29,7 @@
 ; GCNHSA: enable_sgpr_queue_ptr = 0
 ; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1
 ; GCNHSA: enable_sgpr_dispatch_id = 0
-; GCNHSA: enable_sgpr_flat_scratch_init = 0
+; GCNHSA: enable_sgpr_flat_scratch_init = 1
 ; GCNHSA: enable_sgpr_private_segment_size = 0
 ; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0
 ; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0
@@ -39,8 +39,8 @@
 
 ; GCNHSA: .end_amd_kernel_code_t
 
-; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
-; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
+; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
+; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
 
 ; Scratch size = alloca size + emergency stack slot
 ; ALL: ; ScratchSize: 32772
Index: test/CodeGen/AMDGPU/private-memory.ll
===================================================================
--- test/CodeGen/AMDGPU/private-memory.ll
+++ test/CodeGen/AMDGPU/private-memory.ll
@@ -41,6 +41,10 @@
 ; HSA-ALLOCA: workitem_private_segment_byte_size = 24
 ; HSA-ALLOCA: .end_amd_kernel_code_t
+
+; HSA-ALLOCA: s_mov_b32 flat_scratch_lo, s7
+; HSA-ALLOCA: s_add_u32 s6, s6, s9
+; HSA-ALLOCA: s_lshr_b32 flat_scratch_hi, s6, 8
 
 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
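The user_sgpr = 6 to 8 and s7 to s9 updates in large-alloca-compute.ll above follow mechanically from SGPR assignment order once the two flat-scratch-init registers join the user SGPRs. Below is a small standalone sketch of that arithmetic; it is an illustration based on the kernel-code flags checked in that test (private segment buffer, kernarg segment pointer, flat scratch init, tgid_x enabled), not LLVM code.

#include <cstdio>

int main() {
  // User SGPRs enabled in large-alloca-compute.ll's kernel code header:
  unsigned NumUserSGPRs = 0;
  NumUserSGPRs += 4; // enable_sgpr_private_segment_buffer = 1 (128-bit resource)
  NumUserSGPRs += 2; // enable_sgpr_kernarg_segment_ptr = 1 (64-bit pointer)
  NumUserSGPRs += 2; // enable_sgpr_flat_scratch_init = 1 (new with this patch)

  // System SGPRs follow the user SGPRs: tgid_x (enabled), then the scratch
  // wave offset used by the buffer_store/buffer_load checks.
  unsigned TgidX = NumUserSGPRs;          // s8
  unsigned ScratchWaveOffset = TgidX + 1; // s9; with 6 user SGPRs this was s7
  printf("user_sgpr = %u, scratch wave offset = s%u\n",
         NumUserSGPRs, ScratchWaveOffset);
  return 0;
}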
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
===================================================================
--- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; XUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA %s
-; XUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
 
 ; This ends up using all 256 registers and requires register
 ; scavenging which will fail to find an unused register.
@@ -11,18 +11,20 @@
 
 ; FIXME: The same register is initialized to 0 for every spill.
 
-declare i32 @llvm.r600.read.tgid.x() #1
-declare i32 @llvm.r600.read.tgid.y() #1
-declare i32 @llvm.r600.read.tgid.z() #1
-
 ; GCN-LABEL: {{^}}spill_vgpr_compute:
-; GCN: s_mov_b32 s16, s3
-; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN-NEXT: s_mov_b32 s14, -1
-; SI-NEXT: s_mov_b32 s15, 0x80f000
-; VI-NEXT: s_mov_b32 s15, 0x800000
+
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: workitem_private_segment_byte_size = 1024
+
+; GCN-NOT: flat_scr
+
+; GCNMESA: s_mov_b32 s16, s3
+; GCNMESA: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCNMESA-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCNMESA-NEXT: s_mov_b32 s14, -1
+; SIMESA-NEXT: s_mov_b32 s15, 0x80f000
+; VIMESA-NEXT: s_mov_b32 s15, 0x800000
 
 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill