diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1497,11 +1497,6 @@ return false; } -static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { - auto PSV = PtrInfo.V.dyn_cast(); - return PSV && PSV->isStack(); -} - std::pair AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { SDLoc DL(N); @@ -1538,13 +1533,7 @@ AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits); VAddr = SDValue(MovHighBits, 0); - // In a call sequence, stores to the argument stack area are relative to the - // stack pointer. - const MachinePointerInfo &PtrInfo - = cast(Parent)->getPointerInfo(); - SOffset = isStackPtrRelative(PtrInfo) - ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32) - : CurDAG->getTargetConstant(0, DL, MVT::i32); + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16); return true; } @@ -1587,28 +1576,52 @@ return true; } +static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) { + if (Val.getOpcode() != ISD::CopyFromReg) + return false; + auto RC = + TRI.getPhysRegClass(cast(Val.getOperand(1))->getReg()); + return RC && TRI.isSGPRClass(RC); +} + bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset) const { - ConstantSDNode *CAddr = dyn_cast(Addr); - if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) - return false; - - SDLoc DL(Addr); + const SIRegisterInfo *TRI = + static_cast(Subtarget->getRegisterInfo()); MachineFunction &MF = CurDAG->getMachineFunction(); const SIMachineFunctionInfo *Info = MF.getInfo(); + SDLoc DL(Addr); - SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + // CopyFromReg + if (IsCopyFromSGPR(*TRI, Addr)) { + SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + SOffset = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return true; + } - const MachinePointerInfo &PtrInfo = cast(Parent)->getPointerInfo(); + ConstantSDNode *CAddr; + if (Addr.getOpcode() == ISD::ADD) { + // Add (CopyFromReg ) + CAddr = dyn_cast(Addr.getOperand(1)); + if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) + return false; + if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0))) + return false; - // FIXME: Get from MachinePointerInfo? We should only be using the frame - // offset if we know this is in a call sequence. - SOffset = isStackPtrRelative(PtrInfo) - ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32) - : CurDAG->getTargetConstant(0, DL, MVT::i32); + SOffset = Addr.getOperand(0); + } else if ((CAddr = dyn_cast(Addr)) && + SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) { + // + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); + } else { + return false; + } + + SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); return true; @@ -1890,19 +1903,21 @@ } // Match (32-bit SGPR base) + sext(imm offset) -bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N, - SDValue Addr, +bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, SDValue &SAddr, SDValue &Offset) const { if (Addr->isDivergent()) return false; - SAddr = Addr; + SDLoc DL(Addr); + int64_t COffsetVal = 0; if (CurDAG->isBaseWithConstantOffset(Addr)) { COffsetVal = cast(Addr.getOperand(1))->getSExtValue(); SAddr = Addr.getOperand(0); + } else { + SAddr = Addr; } SAddr = SelectSAddrFI(CurDAG, SAddr); @@ -1917,14 +1932,15 @@ COffsetVal = SplitImmOffset; - SDLoc DL(N); SDValue AddOffset = - getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SAddr.getOpcode() == ISD::TargetFrameIndex + ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL) + : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32); SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32, SAddr, AddOffset), 0); } - Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16); + Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4156,8 +4156,13 @@ int64_t Offset) const { MachineFunction &MF = DAG.getMachineFunction(); MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset); + const SIMachineFunctionInfo *Info = MF.getInfo(); SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32); + // Stores to the argument stack area are relative to the stack pointer. + SDValue SP = + DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32); + Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr); SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4), MachineMemOperand::MODereferenceable); return Store; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3692,11 +3692,6 @@ }}; } -static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { - auto PSV = PtrInfo.V.dyn_cast(); - return PSV && PSV->isStack(); -} - InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); @@ -3818,18 +3813,13 @@ const MachineFunction *MF = MBB->getParent(); const SIMachineFunctionInfo *Info = MF->getInfo(); - const MachineMemOperand *MMO = *MI->memoperands_begin(); - const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); return {{ [=](MachineInstrBuilder &MIB) { // rsrc MIB.addReg(Info->getScratchRSrcReg()); }, [=](MachineInstrBuilder &MIB) { // soffset - if (isStackPtrRelative(PtrInfo)) - MIB.addReg(Info->getStackPtrOffsetReg()); - else - MIB.addImm(0); + MIB.addImm(0); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset }}; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3123,7 +3123,10 @@ // locations, which are supposed to be immutable? Chain = addTokenForArgument(Chain, DAG, MFI, FI); } else { - DstAddr = PtrOff; + // Stores to the argument stack area are relative to the stack pointer. + SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(), + MVT::i32); + DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff); DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); Alignment = commonAlignment(Subtarget->getStackAlignment(), LocMemOffset); diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll @@ -494,11 +494,11 @@ ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; GCN-DAG: s_movk_i32 s32, 0x400 ; GCN: buffer_store_dword [[K]], off, s[0:3], 0 offset:4 -; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4 +; GCN: s_movk_i32 s32, 0x400 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -609,10 +609,10 @@ ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: ; VARABI: enable_vgpr_workitem_id = 0 ; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; VARABI: s_movk_i32 s32, 0x400{{$}} ; VARABI: buffer_store_dword [[K]], off, s[0:3], 0 offset:4 -; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4 +; VARABI: s_movk_i32 s32, 0x400{{$}} +; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} ; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]], @@ -656,8 +656,8 @@ ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: ; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} ; VARABI: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} +; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} ; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]], ; VARABI: s_swappc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -160,14 +160,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 -; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 @@ -257,7 +256,6 @@ ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -266,7 +264,7 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s2 +; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -356,7 +354,6 @@ ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -365,7 +362,7 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s2 +; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -4184,7 +4181,6 @@ ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 @@ -4205,7 +4201,7 @@ ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(8) -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s2 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -4967,14 +4963,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 @@ -8351,16 +8346,10 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 18 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 20 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 16 -; GFX10-SCRATCH-NEXT: s_mov_b32 s20, 12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s21, 8 -; GFX10-SCRATCH-NEXT: s_mov_b32 s22, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s23, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 1 ; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s38, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s39, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s40, 4 @@ -8385,19 +8374,12 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 17 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s51 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s50 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s49 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s48 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s47 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s46 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s2 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s3 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s20 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v3, s21 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v4, s22 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v5, s23 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49 ; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36 ; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37 ; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 @@ -8408,6 +8390,8 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43 ; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44 ; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s32 offset:16 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 17 @@ -8631,20 +8615,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 18 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 24 -; GFX10-SCRATCH-NEXT: s_mov_b32 s20, 20 -; GFX10-SCRATCH-NEXT: s_mov_b32 s21, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s22, 12 -; GFX10-SCRATCH-NEXT: s_mov_b32 s23, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s24, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 1 ; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s38, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s39, 3 -; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s40, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s41, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s42, 6 @@ -8657,42 +8633,39 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s49, 13 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 14 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 15 +; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: ; kill: killed $sgpr0_sgpr1 ; GFX10-SCRATCH-NEXT: ; kill: killed $sgpr0_sgpr1 +; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40 +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32_inreg@rel32@hi+12 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 17 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 8 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s50 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s49 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s3 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s51 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s48 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s47 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s46 -; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s20 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s21 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s22 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v3, s2 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v4, s23 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v5, s24 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49 ; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36 ; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37 ; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 ; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39 ; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40 +; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 ; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 ; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43 ; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44 ; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v6, s32 offset:24 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s32 offset:16 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 17 @@ -8804,21 +8777,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 4 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v32, off, s33 offset:4 -; GFX10-SCRATCH-NEXT: scratch_load_dword v33, off, s33 ; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 0 +; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v32, s2 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s3 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -8977,26 +8944,20 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 15 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 12 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 14 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 12 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 8 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s0 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 4 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s1 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 2 @@ -9024,8 +8985,6 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 9 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 10 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 11 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3i32@rel32@hi+12 @@ -9216,34 +9175,20 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 15 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 14 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 28 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 24 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 11 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 12 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 20 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 10 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 12 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 9 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s0 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 8 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 8 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 8 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 9 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 10 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 11 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s1 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 @@ -9276,8 +9221,6 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 6 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 7 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5i32@rel32@hi+12 @@ -9464,34 +9407,20 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41700000 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41600000 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 28 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 24 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41300000 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41500000 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41400000 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 20 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41200000 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 12 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41100000 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s0 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 8 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41000000 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v0, s0 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41400000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41500000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41600000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x41700000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0x41000000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x41100000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41200000 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41300000 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s0 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s1 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 @@ -9524,8 +9453,6 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 0x40e00000 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_u32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5f32@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -66,16 +66,15 @@ ; FLATSCR-NEXT: s_cbranch_scc1 BB0_3 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1 ; FLATSCR-NEXT: s_mov_b32 s2, s32 -; FLATSCR-NEXT: s_movk_i32 s3, 0x1000 -; FLATSCR-NEXT: s_add_i32 s4, s2, s3 +; FLATSCR-NEXT: s_add_i32 s3, s2, 0x1000 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 +; FLATSCR-NEXT: s_add_u32 s2, s2, 0x1000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: s_add_u32 s2, s2, s3 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2 ; FLATSCR-NEXT: s_lshl_b32 s2, s6, 2 -; FLATSCR-NEXT: s_mov_b32 s32, s4 -; FLATSCR-NEXT: s_add_i32 s4, s4, s2 -; FLATSCR-NEXT: scratch_load_dword v2, off, s4 +; FLATSCR-NEXT: s_mov_b32 s32, s3 +; FLATSCR-NEXT: s_add_i32 s3, s3, s2 +; FLATSCR-NEXT: scratch_load_dword v2, off, s3 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v0, v2, v0 @@ -255,7 +254,7 @@ ; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4: ; FLATSCR: ; %bb.0: ; %entry ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_mov_b32 s5, s33 +; FLATSCR-NEXT: s_mov_b32 s4, s33 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_add_u32 s32, s32, 16 @@ -267,16 +266,15 @@ ; FLATSCR-NEXT: s_cbranch_execz BB2_3 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1 ; FLATSCR-NEXT: s_mov_b32 s2, s32 -; FLATSCR-NEXT: s_movk_i32 s3, 0x1000 -; FLATSCR-NEXT: s_add_i32 s4, s2, s3 +; FLATSCR-NEXT: s_add_i32 s3, s2, 0x1000 +; FLATSCR-NEXT: s_add_u32 s2, s2, 0x1000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 1 -; FLATSCR-NEXT: s_add_u32 s2, s2, s3 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2 -; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4 +; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s3 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5 -; FLATSCR-NEXT: s_mov_b32 s32, s4 +; FLATSCR-NEXT: s_mov_b32 s32, s3 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off @@ -286,7 +284,7 @@ ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_sub_u32 s32, s32, 16 -; FLATSCR-NEXT: s_mov_b32 s33, s5 +; FLATSCR-NEXT: s_mov_b32 s33, s4 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: