Index: llvm/include/llvm/CodeGen/TargetFrameLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -139,10 +139,13 @@ /// int getOffsetOfLocalArea() const { return LocalAreaOffset; } - /// isFPCloseToIncomingSP - Return true if the frame pointer is close to - /// the incoming stack pointer, false if it is close to the post-prologue - /// stack pointer. - virtual bool isFPCloseToIncomingSP() const { return true; } + /// Control the placement of special register scavenging spill slots when + /// allocating a stack frame. + /// + /// If this returns true, the frame indexes used by the RegScavenger will be + /// allocated closest to the incoming stack pointer. + virtual bool allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const; /// assignCalleeSavedSpillSlots - Allows target to override spill slot /// assignment logic. If implemented, assignCalleeSavedSpillSlots() should Index: llvm/lib/CodeGen/PrologEpilogInserter.cpp =================================================================== --- llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -902,9 +902,7 @@ // incoming stack pointer if a frame pointer is required and is closer // to the incoming rather than the final stack pointer. 
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); - bool EarlyScavengingSlots = (TFI.hasFP(MF) && TFI.isFPCloseToIncomingSP() && - RegInfo->useFPForScavengingIndex(MF) && - !RegInfo->hasStackRealignment(MF)); + bool EarlyScavengingSlots = TFI.allocateScavengingFrameIndexesNearIncomingSP(MF); if (RS && EarlyScavengingSlots) { SmallVector<int, 2> SFIs; RS->getScavengingFrameIndices(SFIs); Index: llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp =================================================================== --- llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -136,6 +136,16 @@ return 0; } +bool TargetFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const { + if (!hasFP(MF)) + return false; + + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + return RegInfo->useFPForScavengingIndex(MF) && + !RegInfo->hasStackRealignment(MF); +} + bool TargetFrameLowering::isSafeForNoCSROpt(const Function &F) { if (!F.hasLocalLinkage() || F.hasAddressTaken() || !F.hasFnAttribute(Attribute::NoRecurse)) Index: llvm/lib/Target/AMDGPU/SIFrameLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -43,6 +43,9 @@ const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const override; + bool allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const override; + bool isSupportedStackID(TargetStackID::Value ID) const override; void processFunctionBeforeFrameFinalized( Index: llvm/lib/Target/AMDGPU/SIFrameLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1364,6 +1364,34 @@ return false; } +bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const { + + const 
GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + uint64_t EstStackSize = MFI.estimateStackSize(MF); + uint64_t MaxOffset = EstStackSize - 1; + + // We need the emergency stack slots to be allocated in range of the + // MUBUF/flat scratch immediate offset from the base register, so assign these + // first at the incoming SP position. + // + // TODO: We could try sorting the objects to find a hole in the first bytes + // rather than allocating as close as possible. This could save a lot of space + // on frames with alignment requirements. + if (ST.enableFlatScratch()) { + const SIInstrInfo *TII = ST.getInstrInfo(); + if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, + SIInstrFlags::FlatScratch)) + return false; + } else { + if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset)) + return false; + } + + return true; +} + MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, Index: llvm/lib/Target/Mips/MipsFrameLowering.h =================================================================== --- llvm/lib/Target/Mips/MipsFrameLowering.h +++ llvm/lib/Target/Mips/MipsFrameLowering.h @@ -34,7 +34,10 @@ bool hasBP(const MachineFunction &MF) const; - bool isFPCloseToIncomingSP() const override { return false; } + bool allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const override { + return false; + } bool enableShrinkWrapping(const MachineFunction &MF) const override { return true; Index: llvm/lib/Target/SystemZ/SystemZFrameLowering.h =================================================================== --- llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -29,7 +29,18 @@ create(const SystemZSubtarget &STI); // Override TargetFrameLowering. 
- bool isFPCloseToIncomingSP() const override { return false; } + bool allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const override { + // SystemZ wants normal register scavenging slots, as close to the stack or + // frame pointer as possible. + // The default implementation assumes an x86-like layout, where the frame + // pointer is at the opposite end of the frame from the stack pointer. + // This meant that when frame pointer elimination was disabled, + // the slots ended up being as close as possible to the incoming + // stack pointer, which is the opposite of what we want on SystemZ. + return false; + } + bool hasReservedCallFrame(const MachineFunction &MF) const override; MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, @@ -43,7 +54,6 @@ SystemZELFFrameLowering(); // Override TargetFrameLowering. - bool isFPCloseToIncomingSP() const override { return false; } bool assignCalleeSavedSpillSlots(MachineFunction &MF, const TargetRegisterInfo *TRI, Index: llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -465,9 +465,9 @@ ; GFX9-LABEL: store_load_vindex_large_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: scratch_load_dword v1, off, s32 glc +; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, vcc_hi ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 @@ -486,14 +486,14 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: 
s_add_i32 vcc_lo, s32, 0x4004 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v3, 15 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s32 glc dlc +; GFX10-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: scratch_store_dword v0, v3, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -568,10 +568,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_movk_i32 s0, 0x3e80 -; GFX9-NEXT: scratch_store_dword off, v0, s32 +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_add_i32 s0, s0, s32 +; GFX9-NEXT: s_add_i32 s0, s0, vcc_hi ; GFX9-NEXT: scratch_store_dword off, v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc @@ -585,8 +586,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_movk_i32 s0, 0x3e80 -; GFX10-NEXT: s_add_i32 s0, s0, s32 -; GFX10-NEXT: scratch_store_dword off, v0, s32 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX10-NEXT: s_add_i32 s0, s0, vcc_lo +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 Index: llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -55,8 +55,8 @@ ; MUBUF-NEXT: s_addk_i32 s32, 0x200 ; FLATSCR-NEXT: s_add_i32 s32, s32, 8 ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4{{$}} -; FLATSCR-NEXT: scratch_store_dword off, 
v0, s33 offset:4{{$}} +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33{{$}} +; FLATSCR-NEXT: scratch_store_dword off, v0, s33{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_addk_i32 s32, 0xfe00 ; FLATSCR-NEXT: s_add_i32 s32, s32, -8 @@ -242,8 +242,8 @@ ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; MUBUF-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR-DAG: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8 -; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33 offset:8 +; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 +; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33 offset:4 ; GCN: ;;#ASMSTART ; GCN-NEXT: ; clobber v41 @@ -270,8 +270,8 @@ ; GCN-LABEL: {{^}}last_lane_vgpr_for_fp_csr: ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 v0, s33, 63 ; GCN-COUNT-60: v_writelane_b32 v0 @@ -280,8 +280,8 @@ ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; GCN: v_writelane_b32 v0 -; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8 -; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:8 +; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:4 +; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:4 ; GCN: ;;#ASMSTART ; GCN: v_writelane_b32 v0 @@ -291,8 +291,8 @@ ; FLATSCR: s_add_i32 s32, s32, -16 ; 
GCN-NEXT: v_readlane_b32 s33, v0, 63 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:12 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -316,8 +316,8 @@ ; GCN-LABEL: {{^}}no_new_vgpr_for_fp_csr: ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-COUNT-61: v_writelane_b32 v0, ; FLATSCR: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 @@ -340,8 +340,8 @@ ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:12 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -369,14 +369,16 @@ ; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff ; MUBUF-NEXT: 
s_and_b32 s33, s33, 0xfff80000 ; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000 -; MUBUF-NEXT: s_add_i32 s32, s32, 0x100000 -; FLATSCR-NEXT: s_addk_i32 s32, 0x4000 +; MUBUF-NEXT: s_add_i32 s32, s32, 0x180000 +; FLATSCR-NEXT: s_addk_i32 s32, 0x6000 ; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; MUBUF-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s33 -; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s33 +; MUBUF-NEXT: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x2000{{$}} +; MUBUF-NEXT: buffer_store_dword [[ZERO]], [[OFFSET]], s[0:3], s33 offen{{$}} +; FLATSCR-NEXT: s_add_i32 vcc_hi, s33, 0x2000 +; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], vcc_hi ; GCN-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_add_i32 s32, s32, 0xfff00000 -; FLATSCR-NEXT: s_addk_i32 s32, 0xc000 +; MUBUF-NEXT: s_add_i32 s32, s32, 0xffe80000 +; FLATSCR-NEXT: s_addk_i32 s32, 0xa000 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_setpc_b64 define void @realign_stack_no_fp_elim() #1 { @@ -388,16 +390,16 @@ ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp: ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 v0, s33, 2 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: v_writelane_b32 v0, s30, 0 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; GCN: v_writelane_b32 v0, s31, 1 -; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 -; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4 +; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} +; FLATSCR: scratch_store_dword off, [[ZERO]], 
s33{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN: ;;#ASMSTART ; MUBUF: s_addk_i32 s32, 0x300 @@ -410,8 +412,8 @@ ; FLATSCR-NEXT: s_add_i32 s32, s32, -12 ; GCN-NEXT: v_readlane_b32 s33, v0, 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[4:5] @@ -434,8 +436,8 @@ ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 ; GCN-NEXT: s_mov_b32 s33, s32 @@ -456,8 +458,8 @@ ; FLATSCR-NEXT: s_add_i32 s32, s32, -12{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded 
Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -486,9 +488,9 @@ ; GCN-LABEL: {{^}}scratch_reg_needed_mubuf_offset: ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200 +; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100 ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill -; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1008 +; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1004 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 @@ -509,9 +511,9 @@ ; FLATSCR-NEXT: s_addk_i32 s32, 0xeff4{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200 +; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Reload -; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1008 +; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1004 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, [[SCRATCH_SGPR]] ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -567,10 +569,10 @@ ; With no free registers, we must spill the FP to memory. 
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory: ; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 -; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4 +; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 ; 4-byte Folded Spill ; FLATSCR: s_mov_b32 s0, s33 ; GCN: s_mov_b32 s33, s32 -; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4 +; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Reload ; FLATSCR: s_mov_b32 s33, s0 ; MUBUF: s_waitcnt vmcnt(0) ; MUBUF: v_readfirstlane_b32 s33, [[TMP_VGPR2]] @@ -669,14 +671,14 @@ ; scratch VGPR to hold the offset. ; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset ; MUBUF: s_or_saveexec_b64 s[4:5], -1 -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200 +; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100 ; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; MUBUF: v_mov_b32_e32 v0, s33 ; GCN-NOT: v_mov_b32_e32 v0, 0x100c -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40300 +; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200 ; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; FLATSCR: v_mov_b32_e32 v0, 0 -; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1004 +; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1000 ; FLATSCR: scratch_store_dword off, v0, [[SOFF]] define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval([4096 x i8]) align 4 %arg) #3 { %alloca = alloca i32, addrspace(5) Index: llvm/test/CodeGen/AMDGPU/flat-scratch.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -1485,7 +1485,7 @@ ; GFX9-LABEL: zero_init_large_offset_foo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: scratch_load_dword v0, off, s32 glc +; GFX9-NEXT: 
scratch_load_dword v0, off, s32 offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_mov_b32 s1, s0 @@ -1495,13 +1495,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1510,10 +1510,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc +; GFX10-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -1522,11 +1522,11 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; GFX10-NEXT: 
scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1534,7 +1534,7 @@ ; GFX9-PAL-LABEL: zero_init_large_offset_foo: ; GFX9-PAL: ; %bb.0: ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 glc +; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 @@ -1544,13 +1544,13 @@ ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4000 +; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi -; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4000 +; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 -; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4000 +; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 -; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4000 +; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] @@ -1559,10 +1559,10 @@ ; GFX1010-PAL: ; %bb.0: ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc +; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 -; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 +; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 ; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 @@ -1572,13 +1572,13 @@ ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 ; 
GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo ; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 +; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 ; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 +; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 ; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 +; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] @@ -1587,10 +1587,10 @@ ; GFX1030-PAL: ; %bb.0: ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc +; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 -; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 +; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 @@ -1599,11 +1599,11 @@ ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo -; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 +; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 -; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 +; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 +; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; 
GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] @@ -2015,9 +2015,9 @@ ; GFX9-LABEL: store_load_vindex_large_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: scratch_load_dword v1, off, s32 glc +; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 ; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 @@ -2034,12 +2034,12 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 ; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 -; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc +; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: scratch_store_dword v0, v1, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2050,9 +2050,9 @@ ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc +; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4000 +; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4004 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 @@ -2069,12 +2069,12 @@ ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 -; 
GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo ; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 ; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 ; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 -; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc +; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2218,9 +2218,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_movk_i32 s0, 0x3000 -; GFX9-NEXT: scratch_store_dword off, v0, s32 +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, s32 +; GFX9-NEXT: s_add_i32 s0, s0, vcc_hi ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2235,8 +2236,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_movk_i32 s0, 0x3800 -; GFX10-NEXT: s_add_i32 s0, s0, s32 -; GFX10-NEXT: scratch_store_dword off, v0, s32 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX10-NEXT: s_add_i32 s0, s0, vcc_lo +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2249,9 +2251,10 @@ ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 -; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 +; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s0, s32 +; GFX9-PAL-NEXT: s_add_i32 s0, s0, vcc_hi ; GFX9-PAL-NEXT: v_mov_b32_e32 
v0, 15 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2266,8 +2269,9 @@ ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 -; GFX10-PAL-NEXT: s_add_i32 s0, s0, s32 -; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX10-PAL-NEXT: s_add_i32 s0, s0, vcc_lo +; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 Index: llvm/test/CodeGen/AMDGPU/load-hi16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -806,15 +806,14 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset: ; GFX900-MUBUF: buffer_store_dword ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4058 ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR: scratch_store_dword ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v{{[0-9]+}}, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v{{[0-9]+}}, off, s32 offset:4058 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 { +define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, [10 x i32] addrspace(5)* %obj0) #0 { entry: - %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i16], align 2, addrspace(5) %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* store volatile i32 123, i32 addrspace(5)* %bc @@ -829,15 +828,14 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset: ; 
GFX900-MUBUF: buffer_store_dword ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4059 ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR: scratch_store_dword ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v{{[0-9]+}}, off, s32 offset:4059 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 { +define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg, [10 x i32] addrspace(5)* %obj0) #0 { entry: - %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i8], align 2, addrspace(5) %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* store volatile i32 123, i32 addrspace(5)* %bc @@ -853,15 +851,14 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset: ; GFX900-MUBUF: buffer_store_dword ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4059 ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR: scratch_store_dword ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v{{[0-9]+}}, off, s32 offset:4059 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 { +define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg, [10 x i32] addrspace(5)* %obj0) #0 { entry: - %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i8], align 2, addrspace(5) %bc = bitcast [10 x i32] 
addrspace(5)* %obj0 to i32 addrspace(5)* store volatile i32 123, i32 addrspace(5)* %bc Index: llvm/test/CodeGen/AMDGPU/load-lo16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -1913,9 +1913,10 @@ ; GFX900-MUBUF: ; %bb.0: ; %entry ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 glc +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 44 +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, v1, s[0:3], s32 offen offset:4054 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -1925,9 +1926,10 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 glc +; GFX906-NEXT: v_mov_b32_e32 v3, 44 +; GFX906-NEXT: buffer_load_ushort v1, v3, s[0:3], s32 offen offset:4054 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 @@ -1939,9 +1941,10 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 glc +; GFX803-NEXT: v_mov_b32_e32 v2, 44 +; GFX803-NEXT: 
buffer_load_ushort v1, v2, s[0:3], s32 offen offset:4054 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1953,9 +1956,10 @@ ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 offset:4 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094 glc +; GFX900-FLATSCR-NEXT: s_add_i32 vcc_hi, s32, 44 +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, vcc_hi offset:4054 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -1978,9 +1982,10 @@ ; GFX900-MUBUF: ; %bb.0: ; %entry ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 glc +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 44 +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, v1, s[0:3], s32 offen offset:4055 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -1990,9 +1995,10 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc +; GFX906-NEXT: v_mov_b32_e32 v3, 44 +; GFX906-NEXT: buffer_load_sbyte v1, v3, s[0:3], 
s32 offen offset:4055 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 @@ -2004,9 +2010,10 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc +; GFX803-NEXT: v_mov_b32_e32 v2, 44 +; GFX803-NEXT: buffer_load_sbyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2018,9 +2025,10 @@ ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 offset:4 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 glc +; GFX900-FLATSCR-NEXT: s_add_i32 vcc_hi, s32, 44 +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, vcc_hi offset:4055 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -2044,9 +2052,10 @@ ; GFX900-MUBUF: ; %bb.0: ; %entry ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 glc +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 44 +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, 
v1, s[0:3], s32 offen offset:4055 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -2056,9 +2065,10 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc +; GFX906-NEXT: v_mov_b32_e32 v3, 44 +; GFX906-NEXT: buffer_load_ubyte v1, v3, s[0:3], s32 offen offset:4055 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 @@ -2070,9 +2080,10 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc +; GFX803-NEXT: v_mov_b32_e32 v2, 44 +; GFX803-NEXT: buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 @@ -2085,9 +2096,10 @@ ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 offset:4 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 glc +; GFX900-FLATSCR-NEXT: s_add_i32 vcc_hi, s32, 44 +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, vcc_hi offset:4055 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword 
v[0:1], v0, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -2111,9 +2123,10 @@ ; GFX900-MUBUF: ; %bb.0: ; %entry ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 glc +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 44 +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, v1, s[0:3], s32 offen offset:4055 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -2123,9 +2136,10 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc +; GFX906-NEXT: v_mov_b32_e32 v2, 44 +; GFX906-NEXT: buffer_load_sbyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -2138,9 +2152,10 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc +; GFX803-NEXT: v_mov_b32_e32 v2, 44 +; GFX803-NEXT: buffer_load_sbyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD @@ -2152,9 +2167,10 @@ ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 offset:4 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 glc +; GFX900-FLATSCR-NEXT: s_add_i32 vcc_hi, s32, 44 +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, vcc_hi offset:4055 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -2179,9 +2195,10 @@ ; GFX900-MUBUF: ; %bb.0: ; %entry ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 glc +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 44 +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, v1, s[0:3], s32 offen offset:4055 glc ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -2191,9 +2208,10 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc +; GFX906-NEXT: v_mov_b32_e32 v2, 44 +; GFX906-NEXT: buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ 
-2206,9 +2224,10 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc +; GFX803-NEXT: v_mov_b32_e32 v2, 44 +; GFX803-NEXT: buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 @@ -2221,9 +2240,10 @@ ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 offset:4 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 glc +; GFX900-FLATSCR-NEXT: s_add_i32 vcc_hi, s32, 44 +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, vcc_hi offset:4055 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -112,12 +112,13 @@ ; MUBUF-NEXT: s_add_i32 s33, s32, 0x7ffc0 ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000 ; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33 -; MUBUF-NEXT: v_add_u32_e32 v3, 0x1000, v3 +; MUBUF-NEXT: v_add_u32_e32 v3, 0x3000, v3 ; MUBUF-NEXT: v_add_u32_e32 v2, 64, v3 ; MUBUF-NEXT: v_mov_b32_e32 v4, 0 +; MUBUF-NEXT: v_mov_b32_e32 v5, 0x2000 ; MUBUF-NEXT: s_mov_b32 s4, 0 -; MUBUF-NEXT: s_add_i32 s32, s32, 0x180000 -; MUBUF-NEXT: 
buffer_store_dword v4, off, s[0:3], s33 +; MUBUF-NEXT: s_add_i32 s32, s32, 0x200000 +; MUBUF-NEXT: buffer_store_dword v4, v5, s[0:3], s33 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: BB1_1: ; %loadstoreloop ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -129,7 +130,7 @@ ; MUBUF-NEXT: s_cbranch_scc1 BB1_1 ; MUBUF-NEXT: ; %bb.2: ; %split ; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33 -; MUBUF-NEXT: v_add_u32_e32 v3, 0x1000, v3 +; MUBUF-NEXT: v_add_u32_e32 v3, 0x3000, v3 ; MUBUF-NEXT: v_add_u32_e32 v3, 0x20d0, v3 ; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -139,7 +140,7 @@ ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_add_i32 s32, s32, 0xffe80000 +; MUBUF-NEXT: s_add_i32 s32, s32, 0xffe00000 ; MUBUF-NEXT: s_mov_b32 s33, s5 ; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 ; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc @@ -153,14 +154,15 @@ ; FLATSCR-NEXT: s_mov_b32 s2, s33 ; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff ; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000 +; FLATSCR-NEXT: s_add_i32 s32, s32, 0x8000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; FLATSCR-NEXT: s_add_i32 vcc_hi, s33, 0x2000 ; FLATSCR-NEXT: s_mov_b32 s0, 0 -; FLATSCR-NEXT: s_addk_i32 s32, 0x6000 -; FLATSCR-NEXT: scratch_store_dword off, v2, s33 +; FLATSCR-NEXT: scratch_store_dword off, v2, vcc_hi ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: BB1_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLATSCR-NEXT: s_add_i32 vcc_hi, s33, 0x1000 +; FLATSCR-NEXT: s_add_i32 vcc_hi, s33, 0x3000 ; FLATSCR-NEXT: s_add_i32 s1, s0, vcc_hi ; FLATSCR-NEXT: s_add_i32 s0, s0, 1 ; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120 @@ -169,14 +171,14 @@ ; FLATSCR-NEXT: s_cbranch_scc1 BB1_1 ; FLATSCR-NEXT: ; %bb.2: ; %split ; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 -; FLATSCR-NEXT: s_add_i32 s1, s33, 0x1000 +; FLATSCR-NEXT: s_add_i32 s1, s33, 
0x3000 ; FLATSCR-NEXT: s_add_i32 s0, s0, s1 ; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_add_i32 s0, s33, 0x1000 +; FLATSCR-NEXT: s_add_i32 s0, s33, 0x3000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_addk_i32 s32, 0xa000 +; FLATSCR-NEXT: s_addk_i32 s32, 0x8000 ; FLATSCR-NEXT: s_mov_b32 s33, s2 ; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc Index: llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -10,7 +10,7 @@ ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x200 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffe00 ; CHECK-NEXT: s_mov_b32 s33, s4 Index: llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir +++ llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir @@ -30,25 +30,29 @@ ; CHECK: liveins: $vgpr1, $vgpr2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc + ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, 
undef $vgpr2 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc - ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1572864, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc - ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33 + ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33 ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc + ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33 + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit 
$sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1572864, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; CHECK-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc + ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc @@ -85,16 +89,20 @@ ; CHECK-NEXT: $sgpr29 = frame-setup COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc - ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1572864, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def 
$sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc - ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr33 + ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33 ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc + ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr33 + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr31 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1572864, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr29 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, 
implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -130,14 +138,16 @@ ; CHECK-NEXT: $sgpr28 = frame-setup COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc - ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1572864, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK-NEXT: $sgpr29 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $sgpr29 = S_ADD_I32 killed $sgpr29, 8192, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr29 + ; CHECK-NEXT: $sgpr29 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $sgpr29 = S_ADD_I32 killed $sgpr29, 16384, implicit-def $scc ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr29 ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit 
$sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr31 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1572864, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr28 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -172,14 +182,16 @@ ; CHECK-NEXT: $sgpr28 = frame-setup COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc - ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1572864, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def 
$sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31 ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK-NEXT: $vcc_lo = S_MOV_B32 8192 + ; CHECK-NEXT: $vgpr0, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr0, 0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $vcc_lo = S_MOV_B32 16384 ; CHECK-NEXT: $vgpr2, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr2, 0, implicit $exec ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr31 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1572864, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr28 ; CHECK-NEXT: S_ENDPGM 0 S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def 
$sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31 Index: llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir +++ llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -26,22 +26,23 @@ ; MUBUF: liveins: $vgpr1, $vgpr2 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; MUBUF-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc + ; MUBUF-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 ; MUBUF-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; MUBUF-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc - ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1572864, implicit-def dead $scc + ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; MUBUF-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, 
$sgpr33, implicit $exec + ; MUBUF-NEXT: $vgpr0 = V_ADD_U32_e32 8192, killed $vgpr0, implicit $exec ; MUBUF-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; MUBUF-NEXT: $vgpr3 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec + ; MUBUF-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 - ; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1572864, implicit-def dead $scc + ; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; MUBUF-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 ; MUBUF-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; MUBUF-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc + ; MUBUF-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; MUBUF-NEXT: S_ENDPGM 0, implicit $vcc @@ -49,22 +50,24 @@ ; FLATSCR: liveins: $vgpr1, $vgpr2 ; FLATSCR-NEXT: {{ $}} ; FLATSCR-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 8196, implicit-def $scc + ; FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 16388, implicit-def $scc ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR 
killed $vgpr2, killed $sgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 ; FLATSCR-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; FLATSCR-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc - ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc + ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; FLATSCR-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr33, implicit $exec ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 8192, implicit-def $scc - ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr33, 
implicit $exec ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, -8192, implicit-def $scc - ; FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc + ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc + ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, -16384, implicit-def $scc + ; FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc ; FLATSCR-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 ; FLATSCR-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 8196, implicit-def $scc + ; FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 16388, implicit-def $scc ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; FLATSCR-NEXT: S_ENDPGM 0, implicit $vcc Index: llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir +++ llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir @@ -25,19 +25,25 @@ ; CHECK: liveins: $vgpr1, $vgpr2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 262400, implicit-def $scc + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 262080, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294705152, implicit-def dead $scc - ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 524288, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 786432, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 4096, implicit-def $scc + ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33 + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -4096, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, 
implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -524288, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -786432, implicit-def dead $scc ; CHECK-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 262400, implicit-def $scc + ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc Index: llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir 
=================================================================== --- llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir +++ llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -27,73 +27,73 @@ ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GFX8-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc + ; GFX8-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GFX8-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX8-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc - ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1572864, implicit-def dead $scc + ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; GFX8-NEXT: $sgpr7 = S_ADD_I32 $sgpr33, 524800, implicit-def $scc - ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr7, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) - ; GFX8-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $vcc_lo = S_MOV_B32 8192 + ; GFX8-NEXT: $vgpr0, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr0, 0, implicit $exec + ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; GFX8-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; GFX8-NEXT: $vcc_lo = S_MOV_B32 16384 ; GFX8-NEXT: $vgpr3, dead $vcc = 
V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec ; GFX8-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1572864, implicit-def dead $scc + ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; GFX8-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 ; GFX8-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GFX8-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc + ; GFX8-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc ; GFX8-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; GFX8-NEXT: $sgpr4 = S_ADD_I32 $sgpr33, 524800, implicit-def $scc - ; GFX8-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) + ; GFX8-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) ; GFX8-NEXT: S_ENDPGM 0, csr_amdgpu_allvgprs ; GFX9-LABEL: name: pei_scavenge_vgpr_spill ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GFX9-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc + ; GFX9-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GFX9-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 ; GFX9-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX9-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 
4294443008, implicit-def dead $scc - ; GFX9-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1572864, implicit-def dead $scc + ; GFX9-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX9-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; GFX9-NEXT: $sgpr7 = S_ADD_I32 $sgpr33, 524800, implicit-def $scc - ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr7, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; GFX9-NEXT: $vgpr0 = V_ADD_U32_e32 8192, killed $vgpr0, implicit $exec + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) ; GFX9-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; GFX9-NEXT: $vgpr3 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec + ; GFX9-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec ; GFX9-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec - ; GFX9-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1572864, implicit-def dead $scc + ; GFX9-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; GFX9-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 ; GFX9-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GFX9-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc + ; GFX9-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; GFX9-NEXT: $sgpr4 = S_ADD_I32 $sgpr33, 524800, implicit-def $scc - ; GFX9-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) + ; GFX9-NEXT: $vgpr3 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) ; GFX9-NEXT: S_ENDPGM 0, csr_amdgpu_allvgprs ; GFX9-FLATSCR-LABEL: name: pei_scavenge_vgpr_spill ; GFX9-FLATSCR: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GFX9-FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 8196, implicit-def $scc + ; GFX9-FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 16388, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, killed $sgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) ; GFX9-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GFX9-FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 ; GFX9-FLATSCR-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX9-FLATSCR-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc - ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc - ; GFX9-FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr33, implicit $exec + ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX9-FLATSCR-NEXT: $vcc_hi = S_ADD_I32 $sgpr33, 8192, implicit-def $scc + ; GFX9-FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vcc_hi, implicit $exec + ; GFX9-FLATSCR-NEXT: $vcc_hi = S_ADD_I32 $sgpr33, 16384, implicit-def $scc ; GFX9-FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 killed $vcc_hi, $vgpr1, implicit $exec - ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc + ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc ; GFX9-FLATSCR-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0 ; GFX9-FLATSCR-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; 
GFX9-FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 8196, implicit-def $scc + ; GFX9-FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 16388, implicit-def $scc ; GFX9-FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) ; GFX9-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GFX9-FLATSCR-NEXT: S_ENDPGM 0, csr_amdgpu_allvgprs Index: llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -78,10 +78,10 @@ ; 0x40000 / 64 = 4096 (for wave64) %a = load volatile i32, i32 addrspace(5)* %aptr - ; MUBUF: s_add_i32 s32, s32, 0x40000 + ; MUBUF: s_add_i32 s32, s32, 0x40100 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill - ; MUBUF: s_add_i32 s32, s32, 0xfffc0000 - ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1000 + ; MUBUF: s_add_i32 s32, s32, 0xfffbff00 + ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1004 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a) @@ -97,10 +97,10 @@ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 - ; MUBUF: s_add_i32 s32, s32, 0x40000 + ; MUBUF: s_add_i32 s32, s32, 0x40100 ; MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload - ; MUBUF: s_add_i32 s32, s32, 0xfffc0000 - ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1000 + ; MUBUF: s_add_i32 s32, s32, 0xfffbff00 + ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1004 ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload ; Force %a to spill with no free SGPRs @@ -173,14 +173,16 @@ ; GCN-LABEL: 
test_inst_offset_function define void @test_inst_offset_function() { entry: - ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in - ; the instruction offset field. - %alloca = alloca i8, i32 4092, align 4, addrspace(5) + ; Occupy enough bytes of scratch, so the offset of the spill of %a + ; just fits in the instruction offset field when the emergency stack + ; slot is added. It's hard to hit the actual limit since we're also + ; going to insert the emergency stack slot for large frames. + %alloca = alloca i8, i32 4088, align 4, addrspace(5) %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 - ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill - ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill + ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr ; Force %a to spill. 
@@ -202,9 +204,9 @@ %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 ; 0x40000 / 64 = 4096 (for wave64) - ; MUBUF: s_add_i32 s4, s32, 0x40000 + ; MUBUF: s_add_i32 s4, s32, 0x40100 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill - ; FLATSCR: s_add_i32 s0, s32, 0x1000 + ; FLATSCR: s_add_i32 s0, s32, 0x1004 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s0 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr @@ -220,16 +222,21 @@ ; GCN-LABEL: test_sgpr_offset_subregs_function define void @test_sgpr_offset_subregs_function() { entry: - ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a - ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in + ; We want to test the spill of the last subreg of %a is the highest + ; valid value for the immediate offset. We enable the emergency + ; stack slot for large frames, so it's hard to get the frame layout + ; exactly as we want to test it. + ; + ; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a + ; still fits below offset 4096 (4084 + 8 - 4 = 4088), and can be placed in ; the instruction offset field.
- %alloca = alloca i8, i32 4088, align 4, addrspace(5) + %alloca = alloca i8, i32 4084, align 4, addrspace(5) %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* + ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4084 ; 4-byte Folded Spill ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill - ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill - ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4088 ; 8-byte Folded Spill + ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4084 ; 8-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr @@ -249,14 +256,14 @@ ; GCN-LABEL: test_inst_offset_subregs_function define void @test_inst_offset_subregs_function() { entry: - ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a - ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live + ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a + ; does not fit below offset 4096 (4088 + 4 + 8 - 4 = 4096), and has to live ; in the SGPR offset.
- %alloca = alloca i8, i32 4092, align 4, addrspace(5) + %alloca = alloca i8, i32 4088, align 4, addrspace(5) %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* - ; 0x3ff00 / 64 = 4092 (for wave64) + ; 0x3ff00 / 64 = 4092 (for wave64) ; MUBUF: s_add_i32 s4, s32, 0x3ff00 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill Index: llvm/test/CodeGen/AMDGPU/store-hi16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/store-hi16.ll +++ llvm/test/CodeGen/AMDGPU/store-hi16.ll @@ -647,15 +647,14 @@ ; GCN: s_waitcnt ; GFX900-MUBUF: buffer_store_dword ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4058 ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR: scratch_store_dword ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4094 +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4058 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 { +define void @store_private_hi_v2i16_to_offset(i32 %arg, [10 x i32] addrspace(5)* %obj0) #0 { entry: - %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i16], align 2, addrspace(5) %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* store volatile i32 123, i32 addrspace(5)* %bc @@ -670,14 +669,13 @@ ; GCN: s_waitcnt ; GFX900-MUBUF: buffer_store_dword ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4059 ;
GFX900-FLATSCR: scratch_store_dword ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4095 +; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4059 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 { +define void @store_private_hi_v2i16_i8_to_offset(i32 %arg, [10 x i32] addrspace(5)* %obj0) #0 { entry: - %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i8], align 2, addrspace(5) %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* store volatile i32 123, i32 addrspace(5)* %bc