diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -413,6 +413,7 @@ const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); + MachineFrameInfo &FrameInfo = MF.getFrameInfo(); assert(MFI->isEntryFunction()); @@ -490,7 +491,7 @@ Register SPReg = MFI->getStackPtrOffsetReg(); assert(SPReg != AMDGPU::SP_REG); BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) - .addImm(MF.getFrameInfo().getStackSize() * getScratchScaleFactor(ST)); + .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST)); } if (hasFP(MF)) { @@ -499,13 +500,18 @@ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); } - if ((MFI->hasFlatScratchInit() || ScratchRsrcReg) && + bool NeedsFlatScratchInit = + MFI->hasFlatScratchInit() && + (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || + (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); + + if ((NeedsFlatScratchInit || ScratchRsrcReg) && !ST.flatScratchIsArchitected()) { MRI.addLiveIn(PreloadedScratchWaveOffsetReg); MBB.addLiveIn(PreloadedScratchWaveOffsetReg); } - if (MFI->hasFlatScratchInit()) { + if (NeedsFlatScratchInit) { emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -178,14 +178,14 @@ if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr")) KernargSegmentPtr = true; + // TODO: This could be refined a lot. The attribute is a poor way of + // detecting calls or stack objects that may require it before argument + // lowering. if (ST.hasFlatAddressSpace() && isEntryFunction() && (isAmdHsaOrMesa || ST.enableFlatScratch()) && + (HasCalls || HasStackObjects || ST.enableFlatScratch()) && !ST.flatScratchIsArchitected()) { - // TODO: This could be refined a lot. The attribute is a poor way of - // detecting calls or stack objects that may require it before argument - // lowering. - if (HasCalls || HasStackObjects || ST.enableFlatScratch()) - FlatScratchInit = true; + FlatScratchInit = true; } Attribute A = F.getFnAttribute("amdgpu-git-ptr-high"); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll @@ -8,17 +8,15 @@ ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: s_add_u32 s0, s0, s9 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_movk_i32 s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-NEXT: s_and_b32 s4, s4, -16 -; GFX9-NEXT: s_movk_i32 s32, 0x400 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s4, s32, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_mov_b32 s33, 0 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -26,16 +24,12 @@ ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s6, s6, s9 -; GFX10-NEXT: s_movk_i32 s32, 0x200 -; GFX10-NEXT: s_mov_b32 s33, 0 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-NEXT: s_add_u32 s0, s0, s9 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s33, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX10-NEXT: s_and_b32 s4, s4, -16 @@ -110,17 +104,15 @@ ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: s_add_u32 s0, s0, s9 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_movk_i32 s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-NEXT: s_and_b32 s4, s4, -16 -; GFX9-NEXT: s_movk_i32 s32, 0x400 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s4, s32, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_mov_b32 s33, 0 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -128,16 +120,12 @@ ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s6, s6, s9 -; GFX10-NEXT: s_movk_i32 s32, 0x200 -; GFX10-NEXT: s_mov_b32 s33, 0 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-NEXT: s_add_u32 s0, s0, s9 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s33, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX10-NEXT: s_and_b32 s4, s4, -16 @@ -212,18 +200,16 @@ ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: s_add_u32 s0, s0, s9 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-NEXT: s_and_b32 s4, s4, -16 -; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s4, s32, s4 ; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff800 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_mov_b32 s33, 0 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -231,16 +217,12 @@ ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s6, s6, s9 -; GFX10-NEXT: s_movk_i32 s32, 0x400 -; GFX10-NEXT: s_mov_b32 s33, 0 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-NEXT: s_add_u32 s0, s0, s9 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_movk_i32 s32, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s33, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX10-NEXT: s_and_b32 s4, s4, -16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll @@ -28,8 +28,6 @@ ; TODO: Could optimize out in this case ; GCN-LABEL: {{^}}stack_object_in_kernel_no_calls: -; RW-FLAT: s_add_u32 flat_scratch_lo, s4, s7 -; RW-FLAT: s_addc_u32 flat_scratch_hi, s5, 0 ; RO-FLAT-NOT: flat_scratch ; RW-FLAT: buffer_store_dword ; RO-FLAT: scratch_store_dword @@ -41,7 +39,8 @@ ; RW-FLAT-NOT: .amdhsa_enable_private_segment ; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset ; RO-FLAT: .amdhsa_enable_private_segment 1 -; GCN-NOT: .amdhsa_reserve_flat_scratch +; RW-FLAT: .amdhsa_reserve_flat_scratch 0 +; RO-FLAT-NOT: .amdhsa_reserve_flat_scratch ; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1 ; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -13,15 +13,13 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { ; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_load_dword s6, s[4:5], 0x8 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_movk_i32 s32, 0x400 +; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_cbranch_scc1 BB0_3 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_load_dword s6, s[4:5], 0xc @@ -86,15 +84,13 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) { ; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_load_dword s6, s[4:5], 0x8 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_movk_i32 s32, 0x1000 +; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_cbranch_scc1 BB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/call-constant.ll b/llvm/test/CodeGen/AMDGPU/call-constant.ll --- a/llvm/test/CodeGen/AMDGPU/call-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/call-constant.ll @@ -1,12 +1,7 @@ ; RUN: llc -global-isel=0 -amdgpu-fixed-function-abi=0 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=GCN,SDAG %s ; RUN: llc -global-isel=1 -amdgpu-fixed-function-abi=1 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=GCN,GISEL %s -; FIXME: Emitting unnecessary flat_scratch setup - ; GCN-LABEL: {{^}}test_call_undef: -; SDAG: s_mov_b32 flat_scratch_lo, s13 -; SDAG: s_add_i32 s12, s12, s17 -; SDAG: s_lshr_b32 ; GCN: s_endpgm define amdgpu_kernel void @test_call_undef() #0 { %val = call i32 undef(i32 1) @@ -26,10 +21,6 @@ } ; GCN-LABEL: {{^}}test_call_null: -; SDAG: s_mov_b32 flat_scratch_lo, s13 -; SDAG: s_add_i32 s12, s12, s17 -; SDAG: s_lshr_b32 - ; GISEL: s_swappc_b64 s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: s_endpgm define amdgpu_kernel void @test_call_null() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -22,20 +22,15 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_stack: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_i32 s4, s4, s7 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_mov_b32_e32 v0, 0 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_kern_stack: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 @@ -45,10 +40,6 @@ ; ; GFX1010-LABEL: test_kern_stack: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_add_u32 s4, s4, s7 -; GFX1010-NEXT: s_addc_u32 s5, s5, 0 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_add_u32 s0, s0, s7 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 @@ -188,21 +179,16 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 { ; GFX803-LABEL: test_force_fp_kern_stack: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_i32 s4, s4, s7 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_mov_b32 s33, 0 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_mov_b32_e32 v0, 0 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_force_fp_kern_stack: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_mov_b32 s33, 0 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 @@ -213,13 +199,9 @@ ; ; GFX1010-LABEL: test_force_fp_kern_stack: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_add_u32 s4, s4, s7 -; GFX1010-NEXT: s_mov_b32 s33, 0 -; GFX1010-NEXT: s_addc_u32 s5, s5, 0 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_mov_b32 s33, 0 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 @@ -344,14 +326,11 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; GFX803-LABEL: test_sgpr_offset_kernel: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_i32 s4, s4, s7 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_mov_b32 s4, 0x40000 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; GFX803-NEXT: ;;#ASMSTART ; GFX803-NEXT: ;;#ASMEND @@ -364,18 +343,16 @@ ; ; GFX900-LABEL: test_sgpr_offset_kernel: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_mov_b32 s6, 0x40000 -; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b32 s4, 0x40000 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s6, 0x40000 -; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b32 s4, 0x40000 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -383,21 +360,17 @@ ; ; GFX1010-LABEL: test_sgpr_offset_kernel: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_add_u32 s4, s4, s7 -; GFX1010-NEXT: s_addc_u32 s5, s5, 0 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX1010-NEXT: s_add_u32 s0, s0, s7 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_mov_b32 s6, 0x20000 +; GFX1010-NEXT: s_mov_b32 s4, 0x20000 ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc ; GFX1010-NEXT: s_waitcnt vmcnt(0) -; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill +; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; GFX1010-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-NEXT: s_mov_b32 s6, 0x20000 +; GFX1010-NEXT: s_mov_b32 s4, 0x20000 ; GFX1010-NEXT: ;;#ASMSTART ; GFX1010-NEXT: ;;#ASMEND -; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload +; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; GFX1010-NEXT: s_waitcnt vmcnt(0) ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -357,8 +357,6 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 { ; GFX900-LABEL: vload2_private: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX900-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s9 @@ -420,10 +418,6 @@ ; ; GFX10_DEFAULT-LABEL: vload2_private: ; GFX10_DEFAULT: ; %bb.0: ; %entry -; GFX10_DEFAULT-NEXT: s_add_u32 s6, s6, s9 -; GFX10_DEFAULT-NEXT: s_addc_u32 s7, s7, 0 -; GFX10_DEFAULT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10_DEFAULT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; GFX10_DEFAULT-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v2, 0 ; GFX10_DEFAULT-NEXT: s_add_u32 s0, s0, s9 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -181,7 +181,6 @@ ; GCN-NEXT: v_mov_b32_e32 v0, vcc_lo ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s2, exec_lo ; GCN-NEXT: s_mov_b32 exec_lo, 3 ; GCN-NEXT: s_mov_b32 s3, 0 @@ -198,6 +197,7 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_mov_b32 exec_lo, s2 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: global_store_dword v1, v0, s[0:1] ; GCN-NEXT: s_endpgm call void asm sideeffect "", "~{s[0:7]}" () @@ -254,10 +254,6 @@ define amdgpu_kernel void @kernel_no_calls_no_stack() { ; GCN-LABEL: kernel_no_calls_no_stack: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 s0, s0, s3 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 -; GCN-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GCN-NEXT: s_endpgm ret void } diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll --- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -3,7 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck --check-prefixes=GCN,GFX9,ALL %s ; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=4 < %s -mattr=-flat-for-global | FileCheck --check-prefixes=GCNHSA,ALL %s ; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=4 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=4 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,GFX10HSA,ALL %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=4 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s ; FIXME: align on alloca seems to be ignored for private_segment_alignment @@ -18,12 +18,6 @@ ; VI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe80000 ; GFX9-DAG: s_mov_b32 s{{[0-9]+}}, 0xe00000 - -; GFX10HSA: s_add_u32 [[FLAT_SCR_LO:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} -; GFX10HSA-DAG: s_addc_u32 [[FLAT_SCR_HI:s[0-9]+]], s{{[0-9]+}}, 0 -; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), [[FLAT_SCR_LO]] -; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), [[FLAT_SCR_HI]] - ; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen ; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -20,8 +20,6 @@ ; MUBUF-LABEL: local_stack_offset_uses_sp: ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; MUBUF-NEXT: s_add_u32 s0, s0, s9 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 @@ -204,8 +202,6 @@ ; MUBUF-LABEL: local_stack_offset_uses_sp_flat: ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; MUBUF-NEXT: s_add_u32 s0, s0, s9 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0x4000 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -25,10 +25,6 @@ ; ; GCN-SCRATCH-LABEL: vector_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5 -; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c @@ -108,10 +104,6 @@ ; ; GCN-SCRATCH-LABEL: scalar_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5 -; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GCN-SCRATCH-NEXT: s_clause 0x1 ; GCN-SCRATCH-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x2c @@ -277,10 +269,6 @@ ; ; GCN-SCRATCH-LABEL: vector_clause_indirect: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5 -; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -15,8 +15,6 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { ; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; MUBUF-NEXT: s_add_u32 s0, s0, s9 ; MUBUF-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 @@ -119,8 +117,6 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) { ; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; MUBUF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 ; MUBUF-NEXT: s_add_u32 s0, s0, s9 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -37,8 +37,8 @@ %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 ; 0x40000 / 64 = 4096 (for wave64) - ; MUBUF: s_mov_b32 s6, 0x40000 - ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill + ; MUBUF: s_mov_b32 s4, 0x40000 + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill ; FLATSCR: s_movk_i32 s2, 0x1000 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s2 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr @@ -149,9 +149,9 @@ %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* ; 0x3ff00 / 64 = 4092 (for wave64) - ; MUBUF: s_mov_b32 s6, 0x3ff00 - ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill - ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill + ; MUBUF: s_mov_b32 s4, 0x3ff00 + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xffc ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], [[SOFF]] ; 8-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll @@ -6,12 +6,9 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; VI-LABEL: max_alignment_128: ; VI: ; %bb.0: -; VI-NEXT: s_add_i32 s4, s4, s7 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 9 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm @@ -37,6 +34,7 @@ ; VI-NEXT: .amdhsa_next_free_vgpr 1 ; VI-NEXT: .amdhsa_next_free_sgpr 8 ; VI-NEXT: .amdhsa_reserve_vcc 0 +; VI-NEXT: .amdhsa_reserve_flat_scratch 0 ; VI-NEXT: .amdhsa_float_round_mode_32 0 ; VI-NEXT: .amdhsa_float_round_mode_16_64 0 ; VI-NEXT: .amdhsa_float_denorm_mode_32 3 @@ -55,8 +53,6 @@ ; ; GFX9-LABEL: max_alignment_128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 9 @@ -85,6 +81,7 @@ ; GFX9-NEXT: .amdhsa_next_free_vgpr 1 ; GFX9-NEXT: .amdhsa_next_free_sgpr 8 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 +; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0 ; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1 ; GFX9-NEXT: .amdhsa_float_round_mode_32 0 ; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0 @@ -110,12 +107,9 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; VI-LABEL: stackrealign_attr: ; VI: ; %bb.0: -; VI-NEXT: s_add_i32 s4, s4, s7 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 9 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm @@ -141,6 +135,7 @@ ; VI-NEXT: .amdhsa_next_free_vgpr 1 ; VI-NEXT: .amdhsa_next_free_sgpr 8 ; VI-NEXT: .amdhsa_reserve_vcc 0 +; VI-NEXT: .amdhsa_reserve_flat_scratch 0 ; VI-NEXT: .amdhsa_float_round_mode_32 0 ; VI-NEXT: .amdhsa_float_round_mode_16_64 0 ; VI-NEXT: .amdhsa_float_denorm_mode_32 3 @@ -159,8 +154,6 @@ ; ; GFX9-LABEL: stackrealign_attr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 9 @@ -189,6 +182,7 @@ ; GFX9-NEXT: .amdhsa_next_free_vgpr 1 ; GFX9-NEXT: .amdhsa_next_free_sgpr 8 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 +; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0 ; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1 ; GFX9-NEXT: .amdhsa_float_round_mode_32 0 ; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0 @@ -214,12 +208,9 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; VI-LABEL: alignstack_attr: ; VI: ; %bb.0: -; VI-NEXT: s_add_i32 s4, s4, s7 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 9 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm @@ -245,6 +236,7 @@ ; VI-NEXT: .amdhsa_next_free_vgpr 1 ; VI-NEXT: .amdhsa_next_free_sgpr 8 ; VI-NEXT: .amdhsa_reserve_vcc 0 +; VI-NEXT: .amdhsa_reserve_flat_scratch 0 ; VI-NEXT: .amdhsa_float_round_mode_32 0 ; VI-NEXT: .amdhsa_float_round_mode_16_64 0 ; VI-NEXT: .amdhsa_float_denorm_mode_32 3 @@ -263,8 +255,6 @@ ; ; GFX9-LABEL: alignstack_attr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 9 @@ -293,6 +283,7 @@ ; GFX9-NEXT: .amdhsa_next_free_vgpr 1 ; GFX9-NEXT: .amdhsa_next_free_sgpr 8 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 +; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0 ; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1 ; GFX9-NEXT: .amdhsa_float_round_mode_32 0 ; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0