diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1890,19 +1890,30 @@ } // Match (32-bit SGPR base) + sext(imm offset) -bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N, - SDValue Addr, +bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, SDValue &SAddr, SDValue &Offset) const { if (Addr->isDivergent()) return false; - SAddr = Addr; + const MachinePointerInfo &PtrInfo = cast(Parent)->getPointerInfo(); + MachineFunction &MF = CurDAG->getMachineFunction(); + SDLoc DL(Addr); + int64_t COffsetVal = 0; - if (CurDAG->isBaseWithConstantOffset(Addr)) { + ConstantSDNode *CAddr = dyn_cast(Addr); + if (CAddr && isStackPtrRelative(PtrInfo)) { + COffsetVal = CAddr->getSExtValue(); + // In a call sequence, stores to the argument stack area are relative to the + // stack pointer. + const SIMachineFunctionInfo *Info = MF.getInfo(); + SAddr = CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32); + } else if (CurDAG->isBaseWithConstantOffset(Addr)) { COffsetVal = cast(Addr.getOperand(1))->getSExtValue(); SAddr = Addr.getOperand(0); + } else { + SAddr = Addr; } SAddr = SelectSAddrFI(CurDAG, SAddr); @@ -1917,14 +1928,15 @@ COffsetVal = SplitImmOffset; - SDLoc DL(N); SDValue AddOffset = - getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SAddr.getOpcode() == ISD::TargetFrameIndex + ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL) + : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32); SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32, SAddr, AddOffset), 0); } - Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16); + Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -160,14 +160,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 -; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 @@ -257,7 +256,6 @@ ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -266,7 +264,7 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s2 +; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -356,7 +354,6 @@ ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -365,7 +362,7 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s2 +; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -4184,7 +4181,6 @@ ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 @@ -4205,7 +4201,7 @@ ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(8) -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s2 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -4967,14 +4963,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 @@ -8351,16 +8346,10 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 18 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 20 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 16 -; GFX10-SCRATCH-NEXT: s_mov_b32 s20, 12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s21, 8 -; GFX10-SCRATCH-NEXT: s_mov_b32 s22, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s23, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 1 ; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s38, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s39, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s40, 4 @@ -8389,15 +8378,6 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s50 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s49 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s48 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s47 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s46 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s2 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s3 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s20 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v3, s21 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v4, s22 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v5, s23 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36 ; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37 ; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 @@ -8408,6 +8388,14 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43 ; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44 ; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s47 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s46 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s32 offset:20 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s32 offset:16 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s32 offset:12 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v3, s32 offset:8 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v4, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v5, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 17 @@ -8631,20 +8619,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 18 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 24 -; GFX10-SCRATCH-NEXT: s_mov_b32 s20, 20 -; GFX10-SCRATCH-NEXT: s_mov_b32 s21, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s22, 12 -; GFX10-SCRATCH-NEXT: s_mov_b32 s23, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s24, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 1 ; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s38, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s39, 3 -; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s40, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s41, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s42, 6 @@ -8657,10 +8637,13 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s49, 13 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 14 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 15 +; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: ; kill: killed $sgpr0_sgpr1 ; GFX10-SCRATCH-NEXT: ; kill: killed $sgpr0_sgpr1 +; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40 +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32_inreg@rel32@hi+12 @@ -8668,31 +8651,29 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 17 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 8 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s50 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s49 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s3 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s32 offset:24 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s51 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s48 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s47 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s46 -; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s20 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s21 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s22 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v3, s2 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v4, s23 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v5, s24 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36 ; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37 ; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 ; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39 ; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40 +; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 ; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 ; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43 ; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44 ; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s47 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s46 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s32 offset:20 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s32 offset:16 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s32 offset:12 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v3, s32 offset:8 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v4, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v5, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 17 @@ -8804,21 +8785,19 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 4 +; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v32, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: scratch_load_dword v33, off, s33 -; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v32, s2 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v32, s32 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s3 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -8977,21 +8956,18 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 15 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 12 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 14 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 12 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 8 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s0 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 4 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 13 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 12 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s32 offset:12 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s32 offset:8 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v3, s32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 @@ -9024,8 +9000,6 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 9 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 10 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 11 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3i32@rel32@hi+12 @@ -9218,32 +9192,24 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 14 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 28 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 24 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 11 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 12 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 20 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 16 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 13 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 10 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 12 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 9 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s0 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 8 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 8 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 9 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s32 offset:28 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s32 offset:24 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s32 offset:20 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 11 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 10 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 8 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s32 offset:16 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s32 offset:12 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s32 offset:8 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v3, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v4, s32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 @@ -9276,8 +9242,6 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 6 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 7 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5i32@rel32@hi+12 @@ -9466,32 +9430,24 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41700000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41600000 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 28 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 24 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41300000 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41500000 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41400000 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 20 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 16 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41500000 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41200000 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 12 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41100000 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s0 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 8 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41000000 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x41100000 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s32 offset:28 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s32 offset:24 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s32 offset:20 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41400000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41300000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41200000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0x41000000 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s32 offset:16 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s32 offset:12 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s32 offset:8 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v3, s32 offset:4 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v4, s32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 @@ -9524,8 +9480,6 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 0x40e00000 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5f32@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -66,16 +66,15 @@ ; FLATSCR-NEXT: s_cbranch_scc1 BB0_3 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1 ; FLATSCR-NEXT: s_mov_b32 s2, s32 -; FLATSCR-NEXT: s_movk_i32 s3, 0x1000 -; FLATSCR-NEXT: s_add_i32 s4, s2, s3 +; FLATSCR-NEXT: s_add_i32 s3, s2, 0x1000 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 +; FLATSCR-NEXT: s_add_u32 s2, s2, 0x1000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: s_add_u32 s2, s2, s3 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2 ; FLATSCR-NEXT: s_lshl_b32 s2, s6, 2 -; FLATSCR-NEXT: s_mov_b32 s32, s4 -; FLATSCR-NEXT: s_add_i32 s4, s4, s2 -; FLATSCR-NEXT: scratch_load_dword v2, off, s4 +; FLATSCR-NEXT: s_mov_b32 s32, s3 +; FLATSCR-NEXT: s_add_i32 s3, s3, s2 +; FLATSCR-NEXT: scratch_load_dword v2, off, s3 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v0, v2, v0 @@ -255,7 +254,7 @@ ; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4: ; FLATSCR: ; %bb.0: ; %entry ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_mov_b32 s5, s33 +; FLATSCR-NEXT: s_mov_b32 s4, s33 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_add_u32 s32, s32, 16 @@ -267,16 +266,15 @@ ; FLATSCR-NEXT: s_cbranch_execz BB2_3 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1 ; FLATSCR-NEXT: s_mov_b32 s2, s32 -; FLATSCR-NEXT: s_movk_i32 s3, 0x1000 -; FLATSCR-NEXT: s_add_i32 s4, s2, s3 +; FLATSCR-NEXT: s_add_i32 s3, s2, 0x1000 +; FLATSCR-NEXT: s_add_u32 s2, s2, 0x1000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 1 -; FLATSCR-NEXT: s_add_u32 s2, s2, s3 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2 -; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4 +; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s3 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5 -; FLATSCR-NEXT: s_mov_b32 s32, s4 +; FLATSCR-NEXT: s_mov_b32 s32, s3 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off @@ -286,7 +284,7 @@ ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_sub_u32 s32, s32, 16 -; FLATSCR-NEXT: s_mov_b32 s33, s5 +; FLATSCR-NEXT: s_mov_b32 s33, s4 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: