diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -75,6 +75,7 @@ ModulePass *createAMDGPUReplaceLDSUseWithPointerPass(); ModulePass *createAMDGPULowerModuleLDSPass(); FunctionPass *createSIModeRegisterPass(); +FunctionPass *createGCNPreRAOptimizationsPass(); struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> { AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {} @@ -348,6 +349,9 @@ void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; +void initializeGCNPreRAOptimizationsPass(PassRegistry &); +extern char &GCNPreRAOptimizationsID; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -208,6 +208,11 @@ cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), cl::Hidden); +static cl::opt<bool> EnablePreRAOptimizations( + "amdgpu-enable-pre-ra-optimizations", + cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), + cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); @@ -275,6 +280,7 @@ initializeAMDGPUSimplifyLibCallsPass(*PR); initializeAMDGPUPrintfRuntimeBindingPass(*PR); initializeGCNNSAReassignPass(*PR); + initializeGCNPreRAOptimizationsPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -1191,6 +1197,11 @@ if (OptExecMaskPreRA) insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); + if (EnablePreRAOptimizations.getNumOccurrences() + ? 
EnablePreRAOptimizations + : TM->getOptLevel() > CodeGenOpt::Less) + insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID); + // This is not an essential optimization and it has a noticeable impact on // compilation time, so we only enable it from O2. if (TM->getOptLevel() > CodeGenOpt::Less) diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -143,6 +143,7 @@ GCNILPSched.cpp GCNNSAReassign.cpp GCNDPPCombine.cpp + GCNPreRAOptimizations.cpp SIModeRegister.cpp LINK_COMPONENTS diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp new file --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -0,0 +1,162 @@ +//===-- GCNPreRAOptimizations.cpp -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass combines split register tuple initialization into a single pseudo: +/// +/// undef %0.sub1:sreg_64 = S_MOV_B32 1 +/// %0.sub0:sreg_64 = S_MOV_B32 2 +/// => +/// %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 0x200000001 +/// +/// This is to allow rematerialization of a value instead of spilling. It is +/// supposed to be done after register coalescer to allow it to do its job and +/// before actual register allocation to allow rematerialization. +/// +/// Right now the pass only handles 64 bit SGPRs with immediate initializers, +/// although the same shall be possible with other register classes and +/// instructions if necessary. 
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-pre-ra-optimizations" + +namespace { + +class GCNPreRAOptimizations : public MachineFunctionPass { +private: + const SIInstrInfo *TII; + MachineRegisterInfo *MRI; + LiveIntervals *LIS; + + bool processReg(Register Reg); + +public: + static char ID; + + GCNPreRAOptimizations() : MachineFunctionPass(ID) { + initializeGCNPreRAOptimizationsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "AMDGPU Pre-RA optimizations"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. 
+ +INITIALIZE_PASS_BEGIN(GCNPreRAOptimizations, DEBUG_TYPE, + "AMDGPU Pre-RA optimizations", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(GCNPreRAOptimizations, DEBUG_TYPE, "AMDGPU Pre-RA optimizations", + false, false) + +char GCNPreRAOptimizations::ID = 0; + +char &llvm::GCNPreRAOptimizationsID = GCNPreRAOptimizations::ID; + +FunctionPass *llvm::createGCNPreRAOptimizationsPass() { + return new GCNPreRAOptimizations(); +} + +bool GCNPreRAOptimizations::processReg(Register Reg) { + MachineInstr *Def0 = nullptr; + MachineInstr *Def1 = nullptr; + uint64_t Init = 0; + + for (MachineInstr &I : MRI->def_instructions(Reg)) { + if (I.getOpcode() != AMDGPU::S_MOV_B32 || I.getOperand(0).getReg() != Reg || + !I.getOperand(1).isImm() || I.getNumOperands() != 2) + return false; + + switch (I.getOperand(0).getSubReg()) { + default: + return false; + case AMDGPU::sub0: + if (Def0) + return false; + Def0 = &I; + Init |= I.getOperand(1).getImm() & 0xffffffff; + break; + case AMDGPU::sub1: + if (Def1) + return false; + Def1 = &I; + Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32; + break; + } + } + + if (!Def0 || !Def1 || Def0->getParent() != Def1->getParent()) + return false; + + LLVM_DEBUG(dbgs() << "Combining:\n " << *Def0 << " " << *Def1 + << " =>\n"); + + if (SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*Def1), + LIS->getInstructionIndex(*Def0))) + std::swap(Def0, Def1); + + LIS->RemoveMachineInstrFromMaps(*Def0); + LIS->RemoveMachineInstrFromMaps(*Def1); + auto NewI = BuildMI(*Def0->getParent(), *Def0, Def0->getDebugLoc(), + TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), Reg) + .addImm(Init); + + Def0->eraseFromParent(); + Def1->eraseFromParent(); + LIS->InsertMachineInstrInMaps(*NewI); + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + + LLVM_DEBUG(dbgs() << " " << *NewI); + + return true; +} + +bool GCNPreRAOptimizations::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + 
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + TII = ST.getInstrInfo(); + MRI = &MF.getRegInfo(); + LIS = &getAnalysis<LiveIntervals>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + bool Changed = false; + + for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) { + Register Reg = Register::index2VirtReg(I); + if (!LIS->hasInterval(Reg)) + continue; + const TargetRegisterClass *RC = MRI->getRegClass(Reg); + if (RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) + continue; + Changed |= processReg(Reg); + } + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1765,6 +1765,30 @@ expandMovDPP64(MI); break; } + case AMDGPU::S_MOV_B64_IMM_PSEUDO: { + const MachineOperand &SrcOp = MI.getOperand(1); + assert(!SrcOp.isFPImm()); + APInt Imm(64, SrcOp.getImm()); + if (Imm.isIntN(32) || isInlineConstant(Imm)) { + MI.setDesc(get(AMDGPU::S_MOV_B64)); + break; + } + + Register Dst = MI.getOperand(0).getReg(); + Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + + APInt Lo(32, Imm.getLoBits(32).getZExtValue()); + APInt Hi(32, Imm.getHiBits(32).getZExtValue()); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo) + .addImm(Lo.getSExtValue()) + .addReg(Dst, RegState::Implicit | RegState::Define); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi) + .addImm(Hi.getSExtValue()) + .addReg(Dst, RegState::Implicit | RegState::Define); + MI.eraseFromParent(); + break; + } case AMDGPU::V_SET_INACTIVE_B32: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -111,6 +111,18 @@ let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete. } +// 64-bit scalar move immediate instruction. This is used to avoid subregs +// initialization and allow rematerialization. +def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst), + (ins i64imm:$src0)> { + let isReMaterializable = 1; + let isAsCheapAsAMove = 1; + let isMoveImm = 1; + let SchedRW = [WriteSALU, Write64Bit]; + let Size = 16; // Needs maximum 2 s_mov_b32 instructions 8 byte long each. + let Uses = []; +} + // Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the // WQM pass processes it. def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -1097,11 +1097,11 @@ ; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: v_and_b32_e32 v3, s6, v3 +; SI-NEXT: s_movk_i32 s5, 0x80 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_movk_i32 s5, 0x80 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3] ; SI-NEXT: v_and_b32_e32 v1, 1, v0 @@ -1129,11 +1129,11 @@ ; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3 ; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; VI-NEXT: s_mov_b32 s4, 0 ; VI-NEXT: v_and_b32_e32 v3, s6, v3 +; VI-NEXT: s_movk_i32 s5, 0x80 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 -; 
VI-NEXT: s_mov_b32 s4, 0 -; VI-NEXT: s_movk_i32 s5, 0x80 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3] ; VI-NEXT: v_and_b32_e32 v1, 1, v0 @@ -1165,10 +1165,10 @@ ; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc ; SI-NEXT: v_and_b32_e32 v3, s4, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_movk_i32 s5, 0x80 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3] ; SI-NEXT: v_and_b32_e32 v1, 1, v0 @@ -1195,10 +1195,10 @@ ; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3 ; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc ; VI-NEXT: v_and_b32_e32 v3, s4, v3 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 ; VI-NEXT: s_mov_b32 s4, 0 ; VI-NEXT: s_movk_i32 s5, 0x80 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3] ; VI-NEXT: v_and_b32_e32 v1, 1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -2751,9 +2751,9 @@ ; GPRIDX-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GPRIDX-NEXT: s_load_dword s8, s[4:5], 0x8 ; GPRIDX-NEXT: s_mov_b32 s0, 0 +; GPRIDX-NEXT: s_mov_b32 s1, 0x40140000 ; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000 ; GPRIDX-NEXT: s_mov_b32 s2, s0 -; GPRIDX-NEXT: s_mov_b32 s1, 0x40140000 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) ; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1 ; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 @@ -2842,9 +2842,9 @@ ; MOVREL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; MOVREL-NEXT: s_load_dword s8, s[4:5], 0x8 ; MOVREL-NEXT: s_mov_b32 s0, 0 +; MOVREL-NEXT: s_mov_b32 s1, 0x40140000 ; 
MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_mov_b32 s2, s0 -; MOVREL-NEXT: s_mov_b32 s1, 0x40140000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 @@ -2935,9 +2935,9 @@ ; GFX10-NEXT: s_load_dword s8, s[4:5], 0x8 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_mov_b32 s3, 0x40140000 ; GFX10-NEXT: s_mov_b32 s5, 0x40080000 ; GFX10-NEXT: s_mov_b32 s4, s2 -; GFX10-NEXT: s_mov_b32 s3, 0x40140000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s8, 1 @@ -3837,21 +3837,21 @@ ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GPRIDX-NEXT: s_load_dword s6, s[4:5], 0x8 -; GPRIDX-NEXT: s_mov_b32 s0, 0 -; GPRIDX-NEXT: s_mov_b32 s1, 0x40080000 +; GPRIDX-NEXT: s_mov_b32 s2, 0 +; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000 ; GPRIDX-NEXT: v_mov_b32_e32 v2, 0 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) ; GPRIDX-NEXT: s_cmp_eq_u32 s6, 1 ; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 ; GPRIDX-NEXT: s_cmp_eq_u32 s6, 2 -; GPRIDX-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] +; GPRIDX-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GPRIDX-NEXT: s_cmp_eq_u32 s6, 3 -; GPRIDX-NEXT: s_cselect_b64 s[0:1], 4.0, s[0:1] -; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 -; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 -; GPRIDX-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GPRIDX-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] +; GPRIDX-NEXT: v_mov_b32_e32 v0, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v1, s3 +; GPRIDX-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_extract_v4f64_s_s_s: @@ -3924,21 +3924,21 @@ ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 
0x0 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; MOVREL-NEXT: s_load_dword s6, s[4:5], 0x8 -; MOVREL-NEXT: s_mov_b32 s0, 0 -; MOVREL-NEXT: s_mov_b32 s1, 0x40080000 +; MOVREL-NEXT: s_mov_b32 s2, 0 +; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; MOVREL-NEXT: v_mov_b32_e32 v2, s2 +; MOVREL-NEXT: v_mov_b32_e32 v3, s1 ; MOVREL-NEXT: s_cmp_eq_u32 s6, 1 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 ; MOVREL-NEXT: s_cmp_eq_u32 s6, 2 -; MOVREL-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] +; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; MOVREL-NEXT: s_cmp_eq_u32 s6, 3 -; MOVREL-NEXT: s_cselect_b64 s[0:1], 4.0, s[0:1] -; MOVREL-NEXT: v_mov_b32_e32 v0, s0 -; MOVREL-NEXT: v_mov_b32_e32 v1, s1 -; MOVREL-NEXT: v_mov_b32_e32 v3, s3 +; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] +; MOVREL-NEXT: v_mov_b32_e32 v0, s2 +; MOVREL-NEXT: v_mov_b32_e32 v1, s3 +; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; MOVREL-NEXT: s_endpgm ; @@ -4078,8 +4078,7 @@ ; MOVREL-LABEL: v_extract_v64i32_32: ; MOVREL: ; %bb.0: ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MOVREL-NEXT: s_movk_i32 s4, 0x80 -; MOVREL-NEXT: s_mov_b32 s5, 0 +; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80 ; MOVREL-NEXT: v_mov_b32_e32 v2, s4 ; MOVREL-NEXT: v_mov_b32_e32 v3, s5 ; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 @@ -4112,8 +4111,7 @@ ; MOVREL-LABEL: v_extract_v64i32_33: ; MOVREL: ; %bb.0: ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MOVREL-NEXT: s_movk_i32 s4, 0x80 -; MOVREL-NEXT: s_mov_b32 s5, 0 +; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80 ; MOVREL-NEXT: v_mov_b32_e32 v2, s4 ; MOVREL-NEXT: v_mov_b32_e32 v3, s5 ; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 @@ -4140,8 +4138,7 @@ ; GPRIDX-LABEL: v_extract_v64i32_37: ; GPRIDX: ; %bb.0: ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GPRIDX-NEXT: s_movk_i32 s4, 0x80 -; GPRIDX-NEXT: s_mov_b32 s5, 0 +; GPRIDX-NEXT: s_mov_b64 s[4:5], 0x80 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s4 ; 
GPRIDX-NEXT: v_mov_b32_e32 v3, s5 ; GPRIDX-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -4154,8 +4151,7 @@ ; MOVREL-LABEL: v_extract_v64i32_37: ; MOVREL: ; %bb.0: ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MOVREL-NEXT: s_movk_i32 s4, 0x80 -; MOVREL-NEXT: s_mov_b32 s5, 0 +; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80 ; MOVREL-NEXT: v_mov_b32_e32 v2, s4 ; MOVREL-NEXT: v_mov_b32_e32 v3, s5 ; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 @@ -4171,8 +4167,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s4, 0x80 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], 0x80 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll @@ -7,8 +7,8 @@ ; GFX6-LABEL: v_floor_f64_ieee: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -30,8 +30,8 @@ ; GFX6-LABEL: v_floor_f64_ieee_nnan: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] @@ -50,8 +50,8 @@ ; GFX6-LABEL: v_floor_f64_ieee_fneg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1] ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e64 
v[2:3], -v[0:1] ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -74,8 +74,8 @@ ; GFX6-LABEL: v_floor_f64_nonieee: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -97,8 +97,8 @@ ; GFX6-LABEL: v_floor_f64_nonieee_nnan: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] @@ -117,8 +117,8 @@ ; GFX6-LABEL: v_floor_f64_non_ieee_fneg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1] ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1] ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -141,8 +141,8 @@ ; GFX6-LABEL: v_floor_f64_fabs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e64 v[2:3], |v[0:1]| ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e64 v[2:3], |v[0:1]| ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -170,8 +170,8 @@ ; GFX6-LABEL: v_floor_f64_fneg_fabs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fract_f64_e64 v[2:3], -|v[0:1]| ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_fract_f64_e64 v[2:3], -|v[0:1]| ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], 
s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -194,8 +194,8 @@ define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) { ; GFX6-LABEL: s_floor_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_fract_f64_e32 v[0:1], s[2:3] ; GFX6-NEXT: s_mov_b32 s0, -1 +; GFX6-NEXT: v_fract_f64_e32 v[0:1], s[2:3] ; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1] ; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3] @@ -218,8 +218,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) { ; GFX6-LABEL: s_floor_f64_fneg: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_fract_f64_e64 v[0:1], -s[2:3] ; GFX6-NEXT: s_mov_b32 s0, -1 +; GFX6-NEXT: v_fract_f64_e64 v[0:1], -s[2:3] ; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1] ; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3] @@ -243,8 +243,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) { ; GFX6-LABEL: s_floor_f64_fabs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_fract_f64_e64 v[0:1], |s[2:3]| ; GFX6-NEXT: s_mov_b32 s0, -1 +; GFX6-NEXT: v_fract_f64_e64 v[0:1], |s[2:3]| ; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1] ; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3] @@ -268,8 +268,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) { ; GFX6-LABEL: s_floor_f64_fneg_fabs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_fract_f64_e64 v[0:1], -|s[2:3]| ; GFX6-NEXT: s_mov_b32 s0, -1 +; GFX6-NEXT: v_fract_f64_e64 v[0:1], -|s[2:3]| ; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1] ; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -4703,8 +4703,7 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: s_fshl_i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_movk_i32 s10, 0x7f -; GFX6-NEXT: s_mov_b32 s11, 0 +; GFX6-NEXT: s_mov_b64 s[10:11], 0x7f ; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] ; GFX6-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] ; GFX6-NEXT: s_sub_i32 s9, s12, 64 @@ -4751,8 +4750,7 @@ ; ; GFX8-LABEL: s_fshl_i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s10, 0x7f -; GFX8-NEXT: s_mov_b32 s11, 0 +; GFX8-NEXT: s_mov_b64 s[10:11], 0x7f ; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] ; GFX8-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] ; GFX8-NEXT: s_sub_i32 s9, s12, 64 @@ -4799,8 +4797,7 @@ ; ; GFX9-LABEL: s_fshl_i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s10, 0x7f -; GFX9-NEXT: s_mov_b32 s11, 0 +; GFX9-NEXT: s_mov_b64 s[10:11], 0x7f ; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] ; GFX9-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] ; GFX9-NEXT: s_sub_i32 s9, s12, 64 @@ -4847,8 +4844,7 @@ ; ; GFX10-LABEL: s_fshl_i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_movk_i32 s10, 0x7f -; GFX10-NEXT: s_mov_b32 s11, 0 +; GFX10-NEXT: s_mov_b64 s[10:11], 0x7f ; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] ; GFX10-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] ; GFX10-NEXT: s_sub_i32 s9, s12, 64 @@ -5321,8 +5317,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshl_i128_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s6, 0x7f -; GFX6-NEXT: s_mov_b32 s7, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; GFX6-NEXT: s_sub_i32 s5, s8, 64 @@ -5379,8 +5374,7 @@ ; ; GFX8-LABEL: v_fshl_i128_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s6, 0x7f -; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; GFX8-NEXT: s_sub_i32 s5, s8, 64 @@ -5437,8 +5431,7 @@ ; ; GFX9-LABEL: v_fshl_i128_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s6, 0x7f -; GFX9-NEXT: s_mov_b32 s7, 0 +; 
GFX9-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; GFX9-NEXT: s_sub_i32 s5, s8, 64 @@ -5495,8 +5488,7 @@ ; ; GFX10-LABEL: v_fshl_i128_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_movk_i32 s6, 0x7f -; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] @@ -5556,8 +5548,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshl_i128_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s6, 0x7f -; GFX6-NEXT: s_mov_b32 s7, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; GFX6-NEXT: s_sub_i32 s6, 64, s8 @@ -5612,8 +5603,7 @@ ; ; GFX8-LABEL: v_fshl_i128_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s6, 0x7f -; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; GFX8-NEXT: s_sub_i32 s6, 64, s8 @@ -5668,8 +5658,7 @@ ; ; GFX9-LABEL: v_fshl_i128_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s6, 0x7f -; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; GFX9-NEXT: s_sub_i32 s6, 64, s8 @@ -5724,8 +5713,7 @@ ; ; GFX10-LABEL: v_fshl_i128_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_movk_i32 s6, 0x7f -; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX10-NEXT: s_andn2_b64 s[6:7], s[6:7], s[4:5] ; GFX10-NEXT: s_sub_i32 s4, 64, s8 @@ -5902,8 +5890,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) { ; GFX6-LABEL: s_fshl_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s18, 
0x7f -; GFX6-NEXT: s_mov_b32 s19, 0 +; GFX6-NEXT: s_mov_b64 s[18:19], 0x7f ; GFX6-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] ; GFX6-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] ; GFX6-NEXT: s_sub_i32 s17, s22, 64 @@ -5991,8 +5978,7 @@ ; ; GFX8-LABEL: s_fshl_v2i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s18, 0x7f -; GFX8-NEXT: s_mov_b32 s19, 0 +; GFX8-NEXT: s_mov_b64 s[18:19], 0x7f ; GFX8-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] ; GFX8-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] ; GFX8-NEXT: s_sub_i32 s17, s22, 64 @@ -6080,8 +6066,7 @@ ; ; GFX9-LABEL: s_fshl_v2i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s18, 0x7f -; GFX9-NEXT: s_mov_b32 s19, 0 +; GFX9-NEXT: s_mov_b64 s[18:19], 0x7f ; GFX9-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] ; GFX9-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] ; GFX9-NEXT: s_sub_i32 s17, s22, 64 @@ -6169,8 +6154,7 @@ ; ; GFX10-LABEL: s_fshl_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_movk_i32 s18, 0x7f -; GFX10-NEXT: s_mov_b32 s19, 0 +; GFX10-NEXT: s_mov_b64 s[18:19], 0x7f ; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] ; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] ; GFX10-NEXT: s_sub_i32 s17, s22, 64 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -4840,8 +4840,7 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: s_fshr_i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s10, 0x7f -; GFX6-NEXT: s_mov_b32 s11, 0 +; GFX6-NEXT: s_mov_b64 s[10:11], 0x7f ; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] ; GFX6-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] ; GFX6-NEXT: s_sub_i32 s9, 64, 1 @@ -4888,8 +4887,7 @@ ; ; GFX8-LABEL: s_fshr_i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s10, 0x7f -; GFX8-NEXT: s_mov_b32 s11, 0 +; GFX8-NEXT: s_mov_b64 s[10:11], 0x7f ; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] ; 
GFX8-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] ; GFX8-NEXT: s_sub_i32 s9, 64, 1 @@ -4936,8 +4934,7 @@ ; ; GFX9-LABEL: s_fshr_i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s10, 0x7f -; GFX9-NEXT: s_mov_b32 s11, 0 +; GFX9-NEXT: s_mov_b64 s[10:11], 0x7f ; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] ; GFX9-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] ; GFX9-NEXT: s_sub_i32 s9, 64, 1 @@ -4984,8 +4981,7 @@ ; ; GFX10-LABEL: s_fshr_i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_movk_i32 s10, 0x7f -; GFX10-NEXT: s_mov_b32 s11, 0 +; GFX10-NEXT: s_mov_b64 s[10:11], 0x7f ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] ; GFX10-NEXT: s_sub_i32 s13, 64, 1 @@ -5458,8 +5454,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshr_i128_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s6, 0x7f -; GFX6-NEXT: s_mov_b32 s7, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; GFX6-NEXT: s_sub_i32 s5, 64, 1 @@ -5515,8 +5510,7 @@ ; ; GFX8-LABEL: v_fshr_i128_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s6, 0x7f -; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; GFX8-NEXT: s_sub_i32 s5, 64, 1 @@ -5572,8 +5566,7 @@ ; ; GFX9-LABEL: v_fshr_i128_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s6, 0x7f -; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; GFX9-NEXT: s_sub_i32 s5, 64, 1 @@ -5629,8 +5622,7 @@ ; ; GFX10-LABEL: v_fshr_i128_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_movk_i32 s6, 0x7f -; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX10-NEXT: s_sub_i32 s9, 64, 1 @@ -5689,8 
+5681,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshr_i128_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s6, 0x7f -; GFX6-NEXT: s_mov_b32 s7, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; GFX6-NEXT: s_sub_i32 s5, 64, 1 @@ -5746,8 +5737,7 @@ ; ; GFX8-LABEL: v_fshr_i128_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s6, 0x7f -; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; GFX8-NEXT: s_sub_i32 s5, 64, 1 @@ -5803,8 +5793,7 @@ ; ; GFX9-LABEL: v_fshr_i128_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s6, 0x7f -; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; GFX9-NEXT: s_sub_i32 s5, 64, 1 @@ -5863,19 +5852,18 @@ ; GFX10-NEXT: s_sub_i32 s6, 64, 1 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX10-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX10-NEXT: s_movk_i32 s6, 0x7f -; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: s_andn2_b64 s[8:9], s[6:7], s[4:5] ; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX10-NEXT: s_sub_i32 s4, 64, s8 ; GFX10-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX10-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX10-NEXT: s_sub_i32 s4, 64, s8 ; GFX10-NEXT: s_sub_i32 s5, s8, 64 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] ; GFX10-NEXT: s_cselect_b32 s7, 1, 0 @@ -6044,8 +6032,7 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg 
%rhs, <2 x i128> inreg %amt) { ; GFX6-LABEL: s_fshr_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s18, 0x7f -; GFX6-NEXT: s_mov_b32 s19, 0 +; GFX6-NEXT: s_mov_b64 s[18:19], 0x7f ; GFX6-NEXT: s_sub_i32 s28, 64, 1 ; GFX6-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] ; GFX6-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] @@ -6133,8 +6120,7 @@ ; ; GFX8-LABEL: s_fshr_v2i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s18, 0x7f -; GFX8-NEXT: s_mov_b32 s19, 0 +; GFX8-NEXT: s_mov_b64 s[18:19], 0x7f ; GFX8-NEXT: s_sub_i32 s28, 64, 1 ; GFX8-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] ; GFX8-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] @@ -6222,8 +6208,7 @@ ; ; GFX9-LABEL: s_fshr_v2i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s18, 0x7f -; GFX9-NEXT: s_mov_b32 s19, 0 +; GFX9-NEXT: s_mov_b64 s[18:19], 0x7f ; GFX9-NEXT: s_sub_i32 s28, 64, 1 ; GFX9-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] ; GFX9-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] @@ -6311,13 +6296,12 @@ ; ; GFX10-LABEL: s_fshr_v2i128: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b64 s[18:19], 0x7f ; GFX10-NEXT: s_sub_i32 s28, 64, 1 -; GFX10-NEXT: s_movk_i32 s18, 0x7f -; GFX10-NEXT: s_mov_b32 s19, 0 -; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s28 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] ; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] +; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s28 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_or_b64 s[2:3], s[24:25], s[2:3] ; GFX10-NEXT: s_sub_i32 s23, s16, 64 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -19,8 +19,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: s_movk_i32 s4, 0x1000 -; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_mov_b64 s[4:5], 0x1000 ; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -50,8 +49,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: s_movk_i32 s4, 0x1000 -; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_mov_b64 s[4:5], 0x1000 ; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -336,8 +336,7 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s4, 0x400 -; GFX6-NEXT: s_mov_b32 s5, 0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0x400 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -352,8 +351,7 @@ ; ; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_movk_i32 s4, 0x400 -; GFX7-NEXT: s_mov_b32 s5, 0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0x400 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -792,8 +790,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspace(1)* %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s4, 0x400 -; GFX6-NEXT: s_mov_b32 s5, 0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0x400 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 @@ 
-808,8 +805,7 @@ ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_movk_i32 s4, 0x400 -; GFX7-NEXT: s_mov_b32 s5, 0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0x400 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll @@ -568,8 +568,8 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, 0 ; GFX6-NEXT: v_or_b32_e32 v4, 0x43300000, v4 ; GFX6-NEXT: v_add_f64 v[5:6], -v[0:1], v[3:4] -; GFX6-NEXT: v_mov_b32_e32 v1, v0 ; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: v_mov_b32_e32 v1, v0 ; GFX6-NEXT: s_mov_b32 s5, 0x432fffff ; GFX6-NEXT: v_add_f64 v[3:4], v[5:6], -v[3:4] ; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[1:2]|, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -2509,8 +2509,7 @@ ; CHECK-LABEL: v_sdiv_i64_pow2_shl_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0x1000 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 ; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 @@ -2703,8 +2702,7 @@ ; GISEL-LABEL: v_sdiv_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s6, 0x1000 -; GISEL-NEXT: s_mov_b32 s7, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], 0x1000 ; GISEL-NEXT: v_lshl_b64 v[4:5], s[6:7], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v1 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5 @@ -2996,8 +2994,7 @@ ; CGP-LABEL: v_sdiv_v2i64_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: s_mov_b32 s5, 0 +; CGP-NEXT: s_mov_b64 s[4:5], 0x1000 ; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4 ; CGP-NEXT: v_mov_b32_e32 v7, v1 ; CGP-NEXT: v_mov_b32_e32 v5, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -2473,8 +2473,7 @@ ; CHECK-LABEL: v_srem_i64_pow2_shl_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0x1000 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 ; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 @@ -2663,8 +2662,7 @@ ; GISEL-LABEL: v_srem_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s6, 0x1000 -; GISEL-NEXT: s_mov_b32 s7, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], 0x1000 ; GISEL-NEXT: v_lshl_b64 v[4:5], s[6:7], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5 @@ -2952,8 +2950,7 @@ ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: s_mov_b32 s5, 0 +; CGP-NEXT: s_mov_b64 s[4:5], 0x1000 ; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4 ; CGP-NEXT: v_mov_b32_e32 v7, v1 ; CGP-NEXT: v_mov_b32_e32 v5, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -2291,8 +2291,7 @@ ; CHECK-LABEL: v_udiv_i64_pow2_shl_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0x1000 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 ; CHECK-NEXT: 
v_lshl_b64 v[4:5], s[4:5], v2 ; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 @@ -2470,8 +2469,7 @@ ; GISEL-LABEL: v_udiv_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s4, 0x1000 -; GISEL-NEXT: s_mov_b32 s5, 0 +; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 ; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 ; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 @@ -2735,8 +2733,7 @@ ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_mov_b32_e32 v5, v0 ; CGP-NEXT: v_mov_b32_e32 v7, v1 -; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: s_mov_b32 s5, 0 +; CGP-NEXT: s_mov_b64 s[4:5], 0x1000 ; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4 ; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6 ; CGP-NEXT: v_or_b32_e32 v1, v7, v11 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -1651,8 +1651,7 @@ ; CHECK-LABEL: v_urem_i64_pow2_shl_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0x1000 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 ; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2 ; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 @@ -1827,8 +1826,7 @@ ; GISEL-LABEL: v_urem_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s4, 0x1000 -; GISEL-NEXT: s_mov_b32 s5, 0 +; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 ; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 ; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 @@ -2090,8 +2088,7 @@ ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_mov_b32_e32 v5, v0 ; CGP-NEXT: v_mov_b32_e32 v7, v1 -; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: s_mov_b32 s5, 0 +; 
CGP-NEXT: s_mov_b64 s[4:5], 0x1000 ; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4 ; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6 ; CGP-NEXT: v_or_b32_e32 v1, v7, v11 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -67,6 +67,7 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c @@ -157,6 +158,7 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c @@ -268,6 +270,7 @@ ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c @@ -383,6 +386,7 @@ ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c @@ -464,6 +468,7 @@ ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -534,6 +539,7 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -612,6 +618,7 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 
@@ -696,6 +703,7 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -768,6 +776,7 @@ ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -835,6 +844,7 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -912,6 +922,7 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -997,6 +1008,7 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -1249,6 +1261,7 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -1542,6 +1555,7 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -1915,6 +1929,7 @@ ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -2308,6 +2323,7 @@ ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s5, v3 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm 
+; ; GFX9-LABEL: srem_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 @@ -2567,6 +2583,7 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -2791,6 +2808,7 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -3043,6 +3061,7 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -3322,6 +3341,7 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -3455,6 +3475,7 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_i3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -3528,6 +3549,7 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_i3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -3609,6 +3631,7 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_i3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -3696,6 +3719,7 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_i3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -3843,6 +3867,7 @@ ; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 ; 
GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -4023,6 +4048,7 @@ ; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -4219,6 +4245,7 @@ ; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -4438,6 +4465,7 @@ ; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -4631,6 +4659,7 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 ; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_v3i15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -4827,6 +4856,7 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 ; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_v3i15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -5043,6 +5073,7 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 ; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_v3i15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -5285,6 +5316,7 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 ; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_v3i15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -5394,6 +5426,7 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; 
GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_i32_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -5430,6 +5463,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -5464,6 +5498,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -5505,6 +5540,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -5550,6 +5586,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -5694,6 +5731,7 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -5776,6 +5814,7 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_i32_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -5814,6 +5853,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -5849,6 +5889,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_i32_pow2_shl_denom: ; GFX9: ; 
%bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -5892,6 +5933,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -6025,6 +6067,7 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -6100,6 +6143,7 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_i32_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -6139,6 +6183,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -6202,6 +6247,7 @@ ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c @@ -6275,6 +6321,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -6329,6 +6376,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -6511,6 +6559,7 @@ ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -6610,6 +6659,7 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_i32_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -6652,6 +6702,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -6713,6 +6764,7 @@ ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c @@ -6787,6 +6839,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -6960,6 +7013,7 @@ ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -7159,6 +7213,7 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_i64_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f176a73 @@ -7303,6 +7358,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -7340,6 +7396,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -7384,6 
+7441,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -7525,6 +7583,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 @@ -7672,6 +7731,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -7824,6 +7884,7 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_i64_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 @@ -7967,6 +8028,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -7997,8 +8059,7 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_mov_b32 s5, 0 -; GFX6-NEXT: s_movk_i32 s4, 0x1000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0x1000 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 ; GFX6-NEXT: s_add_u32 s4, s4, -1 ; GFX6-NEXT: s_addc_u32 s5, s5, -1 @@ -8007,12 +8068,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: s_movk_i32 s0, 0x1000 +; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 @@ -8056,6 +8117,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -8094,8 +8156,7 @@ ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; GFX6-NEXT: s_mov_b32 s13, 0 -; GFX6-NEXT: s_movk_i32 s12, 0x1000 +; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8113,13 +8174,13 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: urem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 -; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: s_movk_i32 s0, 0x1000 +; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 @@ -8267,6 +8328,7 @@ ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_i64_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 @@ -8410,6 +8472,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -8439,10 +8502,10 @@ ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0 -; GFX6-NEXT: s_movk_i32 s2, 0x1000 +; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: 
s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX6-NEXT: s_ashr_i32 s12, s3, 31 @@ -8458,7 +8521,6 @@ ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s15, s14 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -8576,11 +8638,11 @@ ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_movk_i32 s2, 0x1000 +; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 @@ -8753,6 +8815,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -8917,6 +8980,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0x457ff000 @@ -9064,10 +9128,10 @@ ; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 -; GFX6-NEXT: s_mov_b32 s3, 0 -; GFX6-NEXT: s_movk_i32 s2, 0x1000 +; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 ; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc +; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s6 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 @@ -9078,12 +9142,11 @@ ; GFX6-NEXT: s_xor_b64 s[14:15], s[2:3], s[16:17] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14 
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 ; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 ; GFX6-NEXT: s_sub_u32 s6, 0, s14 +; GFX6-NEXT: s_subb_u32 s7, 0, s15 ; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_subb_u32 s7, 0, s15 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 @@ -9330,13 +9393,14 @@ ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_movk_i32 s2, 0x1000 +; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX9-NEXT: s_mov_b32 s18, 0x4f800000 ; GFX9-NEXT: s_mov_b32 s19, 0x5f7ffffc +; GFX9-NEXT: s_mov_b32 s20, 0x2f800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 @@ -9347,12 +9411,11 @@ ; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX9-NEXT: s_mov_b32 s20, 0x2f800000 ; GFX9-NEXT: s_mov_b32 s21, 0xcf800000 ; GFX9-NEXT: s_sub_u32 s14, 0, s10 +; GFX9-NEXT: s_subb_u32 s4, 0, s11 ; GFX9-NEXT: v_mac_f32_e32 v0, s18, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_subb_u32 s4, 0, s11 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, s19, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, s20, v0 @@ -9727,6 +9790,7 @@ ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_i64_oddk_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 @@ -9870,6 +9934,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_i64_pow2k_denom: ; GFX9: ; %bb.0: ; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -9901,10 +9966,10 @@ ; GFX6-LABEL: srem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0 -; GFX6-NEXT: s_movk_i32 s2, 0x1000 +; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 @@ -9920,15 +9985,14 @@ ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s15, s14 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s4, s8 +; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 @@ -10036,11 +10100,11 @@ ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_movk_i32 s2, 0x1000 +; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 @@ -10216,6 +10280,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -10265,10 +10330,10 @@ ; GFX6-LABEL: srem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 -; GFX6-NEXT: s_mov_b32 s3, 0 
-; GFX6-NEXT: s_movk_i32 s2, 0x1000 +; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 ; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc +; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s6 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 @@ -10279,12 +10344,11 @@ ; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[4:5] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17 -; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 ; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 ; GFX6-NEXT: s_sub_u32 s6, 0, s16 +; GFX6-NEXT: s_subb_u32 s7, 0, s17 ; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_subb_u32 s7, 0, s17 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 @@ -10527,13 +10591,14 @@ ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm +; ; GFX9-LABEL: srem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_movk_i32 s2, 0x1000 +; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX9-NEXT: s_mov_b32 s16, 0x4f800000 ; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc +; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[12:13], s[2:3], s6 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 @@ -10544,12 +10609,11 @@ ; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[4:5] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 ; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 ; GFX9-NEXT: s_sub_u32 s4, 0, s14 +; GFX9-NEXT: s_subb_u32 s5, 0, s15 ; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_subb_u32 s5, 0, s15 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mul_f32_e32 v0, s17, 
v0 diff --git a/llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir b/llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir @@ -0,0 +1,98 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass=liveintervals,amdgpu-pre-ra-optimizations %s -o - | FileCheck -check-prefix=GCN %s + +--- +# GCN-LABEL: name: combine_sreg64_inits +# GCN: %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593 +# GCN: S_NOP 0 +name: combine_sreg64_inits +tracksRegLiveness: true +body: | + bb.0: + undef %0.sub0:sgpr_64 = S_MOV_B32 1 + S_NOP 0 + %0.sub1:sgpr_64 = S_MOV_B32 2 +... +--- +# GCN-LABEL: name: combine_sreg64_inits_swap +# GCN: %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593 +# GCN: S_NOP 0 +name: combine_sreg64_inits_swap +tracksRegLiveness: true +body: | + bb.0: + undef %0.sub1:sgpr_64 = S_MOV_B32 2 + S_NOP 0 + %0.sub0:sgpr_64 = S_MOV_B32 1 +... +--- +# GCN-LABEL: name: sreg64_inits_different_blocks +# GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1 +# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2 +name: sreg64_inits_different_blocks +tracksRegLiveness: true +body: | + bb.0: + undef %0.sub0:sgpr_64 = S_MOV_B32 1 + + bb.1: + %0.sub1:sgpr_64 = S_MOV_B32 2 +... +--- +# GCN-LABEL: name: sreg64_inits_two_defs_sub1 +# GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1 +# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2 +# GCN: %0.sub1:sgpr_64 = S_MOV_B32 3 +name: sreg64_inits_two_defs_sub1 +tracksRegLiveness: true +body: | + bb.0: + undef %0.sub0:sgpr_64 = S_MOV_B32 1 + %0.sub1:sgpr_64 = S_MOV_B32 2 + %0.sub1:sgpr_64 = S_MOV_B32 3 +... +--- +# GCN-LABEL: name: sreg64_inits_two_defs_sub0 +# GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1 +# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2 +# GCN: %0.sub0:sgpr_64 = S_MOV_B32 3 +name: sreg64_inits_two_defs_sub0 +tracksRegLiveness: true +body: | + bb.0: + undef %0.sub0:sgpr_64 = S_MOV_B32 1 + %0.sub1:sgpr_64 = S_MOV_B32 2 + %0.sub0:sgpr_64 = S_MOV_B32 3 +... 
+--- +# GCN-LABEL: name: sreg64_inits_full_def +# GCN: undef %1.sub0:sgpr_64 = S_MOV_B32 1 +# GCN: %0:sgpr_64 = S_MOV_B64 3 +name: sreg64_inits_full_def +tracksRegLiveness: true +body: | + bb.0: + undef %0.sub0:sgpr_64 = S_MOV_B32 1 + %0:sgpr_64 = S_MOV_B64 3 +... +--- +# GCN-LABEL: name: sreg64_inits_imp_use +# GCN: %0.sub0:sgpr_64 = S_MOV_B32 1, implicit $m0 +# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2 +name: sreg64_inits_imp_use +tracksRegLiveness: true +body: | + bb.0: + undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit $m0 + %0.sub1:sgpr_64 = S_MOV_B32 2 +... +--- +# GCN-LABEL: name: sreg64_inits_imp_def +# GCN: %0.sub0:sgpr_64 = S_MOV_B32 1, implicit-def $scc +# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2 +name: sreg64_inits_imp_def +tracksRegLiveness: true +body: | + bb.0: + undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit-def $scc + %0.sub1:sgpr_64 = S_MOV_B32 2 +... diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -844,8 +844,7 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { ; CI-LABEL: store_misaligned64_constant_large_offsets: ; CI: ; %bb.0: -; CI-NEXT: s_movk_i32 s0, 0x7b -; CI-NEXT: s_mov_b32 s1, 0 +; CI-NEXT: s_mov_b64 s[0:1], 0x7b ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -856,8 +855,7 @@ ; ; GFX9-LABEL: store_misaligned64_constant_large_offsets: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s0, 0x7b -; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], 0x7b ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll --- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll @@ -205,8 +205,7 @@ ; FIXME: Should not have intermediate sgprs ; CHECK-LABEL: {{^}}i64_imm_input_phys_vgpr: -; 
CHECK-DAG: s_mov_b32 s1, 0 -; CHECK-DAG: s_mov_b32 s0, 0x1e240 +; CHECK: s_mov_b64 s[0:1], 0x1e240 ; CHECK: v_mov_b32_e32 v0, s0 ; CHECK: v_mov_b32_e32 v1, s1 ; CHECK: use v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll --- a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll @@ -59,20 +59,17 @@ ret void } -; FIXME: Should be able to use s_mov_b64 ; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64: -; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -4{{$}} -; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}} -; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}} +; GCN: s_mov_b64 [[REG:s\[[0-9:]+\]]], -4{{$}} +; GCN: ; use [[REG]] define amdgpu_kernel void @inline_sreg_constraint_imm_i64() { tail call void asm sideeffect "; use $0", "s"(i64 -4) ret void } ; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f64: -; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}} -; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0x3ff00000{{$}} -; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}} +; GCN: s_mov_b64 [[REG:s\[[0-9:]+\]]], 1.0{{$}} +; GCN: ; use [[REG]] define amdgpu_kernel void @inline_sreg_constraint_imm_f64() { tail call void asm sideeffect "; use $0", "s"(double 1.0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -900,12 +900,11 @@ ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; SI-NEXT: s_load_dword s4, s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s5, 0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b32 s8, s4, 4 -; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: s_mov_b64 s[4:5], 0xffff ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 ; SI-NEXT: s_mov_b32 s8, 0x50005 ; SI-NEXT: s_and_b32 s9, s5, s8 @@ -923,12 +922,11 @@ ; VI-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s5, 0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s8, s4, 4 -; VI-NEXT: s_mov_b32 s4, 0xffff +; VI-NEXT: s_mov_b64 s[4:5], 0xffff ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 ; VI-NEXT: s_mov_b32 s8, 0x50005 ; VI-NEXT: s_mov_b32 s9, s8 @@ -1075,14 +1073,13 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; SI-NEXT: s_load_dword s6, s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s7, 0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_lshl_b32 s8, s6, 3 -; SI-NEXT: s_mov_b32 s6, 0xffff +; SI-NEXT: s_mov_b64 s[6:7], 0xffff ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 ; SI-NEXT: s_mov_b32 s8, 0x5050505 ; SI-NEXT: s_mov_b32 s1, s9 @@ -1100,14 +1097,13 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; VI-NEXT: s_load_dword s6, s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s7, 0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 ; VI-NEXT: s_mov_b32 s0, s8 ; VI-NEXT: s_lshl_b32 s8, s6, 3 -; VI-NEXT: s_mov_b32 s6, 0xffff +; VI-NEXT: s_mov_b64 s[6:7], 0xffff ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 ; VI-NEXT: s_mov_b32 s8, 0x5050505 ; VI-NEXT: s_mov_b32 s1, s9 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1584,8 +1584,7 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_mov_b64 s[2:3], 
0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s6, s6 @@ -1607,9 +1606,8 @@ ; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b32 s2, 0xffff +; VI-NEXT: s_mov_b64 s[2:3], 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_mov_b32 s3, 0 ; VI-NEXT: s_and_b32 s1, s4, s2 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_lshl_b32 s0, s1, 16 @@ -1635,8 +1633,7 @@ ; CI-NEXT: flat_load_dword v4, v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: s_mov_b32 s3, 0 -; CI-NEXT: s_mov_b32 s2, 0xffff +; CI-NEXT: s_mov_b64 s[2:3], 0xffff ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_lshl_b32 s1, s4, 16 ; CI-NEXT: s_and_b32 s4, s4, s2 @@ -1672,8 +1669,7 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6 -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff ; GFX9-NEXT: s_lshl_b32 s4, s7, 4 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 @@ -1694,9 +1690,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b32 s2, 0xffff +; VI-NEXT: s_mov_b64 s[2:3], 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_mov_b32 s3, 0 ; VI-NEXT: s_lshl_b32 s1, s5, 4 ; VI-NEXT: s_and_b32 s4, s4, s2 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 @@ -1722,10 +1717,9 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: s_mov_b32 s2, 0xffff +; CI-NEXT: s_mov_b64 s[2:3], 0xffff ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_and_b32 s6, s4, s2 -; CI-NEXT: s_mov_b32 s3, 0 ; CI-NEXT: s_lshl_b32 s1, s5, 4 ; CI-NEXT: s_lshl_b32 s4, s4, 16 ; CI-NEXT: 
v_add_i32_e32 v2, vcc, s0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -7,7 +7,7 @@ ; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \ ; RUN: | grep -v 'Verify generated machine code' | FileCheck -check-prefix=GCN-O1 %s ; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -amdgpu-scalar-ir-passes -amdgpu-sdwa-peephole \ -; RUN: -amdgpu-load-store-vectorizer -debug-pass=Structure < %s 2>&1 \ +; RUN: -amdgpu-load-store-vectorizer -amdgpu-enable-pre-ra-optimizations -debug-pass=Structure < %s 2>&1 \ ; RUN: | grep -v 'Verify generated machine code' | FileCheck -check-prefix=GCN-O1-OPTS %s ; RUN: llc -O2 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \ ; RUN: | grep -v 'Verify generated machine code' | FileCheck -check-prefix=GCN-O2 %s @@ -619,6 +619,7 @@ ; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction ; GCN-O1-OPTS-NEXT: Simple Register Coalescing ; GCN-O1-OPTS-NEXT: Rename Disconnected Subregister Components +; GCN-O1-OPTS-NEXT: AMDGPU Pre-RA optimizations ; GCN-O1-OPTS-NEXT: Machine Instruction Scheduler ; GCN-O1-OPTS-NEXT: MachinePostDominator Tree Construction ; GCN-O1-OPTS-NEXT: SI Whole Quad Mode @@ -899,6 +900,7 @@ ; GCN-O2-NEXT: Machine Natural Loop Construction ; GCN-O2-NEXT: Simple Register Coalescing ; GCN-O2-NEXT: Rename Disconnected Subregister Components +; GCN-O2-NEXT: AMDGPU Pre-RA optimizations ; GCN-O2-NEXT: Machine Instruction Scheduler ; GCN-O2-NEXT: MachinePostDominator Tree Construction ; GCN-O2-NEXT: SI Whole Quad Mode @@ -1193,6 +1195,7 @@ ; GCN-O3-NEXT: Machine Natural Loop Construction ; GCN-O3-NEXT: Simple Register Coalescing ; GCN-O3-NEXT: Rename Disconnected Subregister Components +; GCN-O3-NEXT: AMDGPU Pre-RA optimizations ; GCN-O3-NEXT: Machine Instruction Scheduler ; GCN-O3-NEXT: MachinePostDominator Tree Construction 
; GCN-O3-NEXT: SI Whole Quad Mode diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -77,9 +77,9 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[0:1], s[6:7] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s0, -1 ; SI-NEXT: s_movk_i32 s7, 0xfc01 ; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, -1 ; SI-NEXT: s_brev_b32 s6, -2 ; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -92,9 +92,8 @@ ; GCN-LABEL: {{^}}fadd_v2_v_lit_hi0: ; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; GFX90A-DAG: s_mov_b32 s[[HI:[0-9]+]], 0 -; GFX90A-DAG: s_mov_b32 s[[LO:[0-9]+]], 1.0 -; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[LO]]:[[HI]]]{{$}} +; GFX90A-DAG: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x3f800000 +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], [[K]] define amdgpu_kernel void @fadd_v2_v_lit_hi0(<2 x float> addrspace(1)* %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll @@ -103,8 +103,7 @@ ; OPT-LABEL: define amdgpu_kernel void @half4_alloca_load4 ; GCN-NOT: buffer_ -; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0 -; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff +; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff ; OPT: %gep = getelementptr inbounds <4 x half>, 
<4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2 ; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca @@ -163,8 +162,7 @@ ; OPT-LABEL: define amdgpu_kernel void @short4_alloca_load4 ; GCN-NOT: buffer_ -; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0 -; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff +; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff ; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2 ; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca diff --git a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll @@ -0,0 +1,45 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}test_remat_sgpr: +; GCN-NOT: v_writelane_b32 +; GCN: {{^}}[[LOOP:BB[0-9_]+]]: +; GCN-COUNT-6: s_mov_b32 s{{[0-9]+}}, 0x +; GCN-NOT: v_writelane_b32 +; GCN: s_cbranch_{{[^ ]+}} [[LOOP]] +; GCN: .sgpr_spill_count: 0 +define amdgpu_kernel void @test_remat_sgpr(double addrspace(1)* %arg, double addrspace(1)* %arg1) { +bb: + %i = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i4 = phi i32 [ 0, %bb ], [ %i22, %bb3 ] + %i5 = add nuw nsw i32 %i4, %i + %i6 = zext i32 %i5 to i64 + %i7 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %i6 + %i8 = load double, double addrspace(1)* %i7, align 8 + %i9 = fadd double %i8, 0x3EFC01997CC9E6B0 + %i10 = tail call double @llvm.fma.f64(double %i8, double %i9, double 0x3FBE25E43ABE935A) + %i11 = tail call double @llvm.fma.f64(double %i10, double %i9, double 0x3FC110EF47E6C9C2) + %i12 = tail call double @llvm.fma.f64(double %i11, double %i9, double 
0x3FC3B13BCFA74449) + %i13 = tail call double @llvm.fma.f64(double %i12, double %i9, double 0x3FC745D171BF3C30) + %i14 = tail call double @llvm.fma.f64(double %i13, double %i9, double 0x3FCC71C71C7792CE) + %i15 = tail call double @llvm.fma.f64(double %i14, double %i9, double 0x3FD24924924920DA) + %i16 = tail call double @llvm.fma.f64(double %i15, double %i9, double 0x3FD999999999999C) + %i17 = tail call double @llvm.fma.f64(double %i16, double %i9, double 0x3FD899999999899C) + %i18 = tail call double @llvm.fma.f64(double %i17, double %i9, double 0x3FD799999999799C) + %i19 = tail call double @llvm.fma.f64(double %i18, double %i9, double 0x3FD699999999699C) + %i20 = tail call double @llvm.fma.f64(double %i19, double %i9, double 0x3FD599999999599C) + %i21 = getelementptr inbounds double, double addrspace(1)* %arg1, i64 %i6 + store double %i19, double addrspace(1)* %i21, align 8 + %i22 = add nuw nsw i32 %i4, 1 + %i23 = icmp eq i32 %i22, 1024 + br i1 %i23, label %bb2, label %bb3 +} + +declare double @llvm.fma.f64(double, double, double) +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1698,7 +1698,7 @@ ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_mov_b32 s8, 0x8000 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_mov_b32_e32 v6, s8 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] @@ -1706,7 +1706,6 @@ ; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 -; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] @@ -1724,8 +1723,7 @@ ; GCN-IR-NEXT: s_xor_b64 
s[10:11], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_mov_b32 s5, 0 -; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v8 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -146,8 +146,7 @@ ; GCN-LABEL: v_lshr_i128_kv: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_movk_i32 s4, 0x41 -; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_mov_b64 s[4:5], 0x41 ; GCN-NEXT: v_lshr_b64 v[1:2], s[4:5], v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 ; GCN-NEXT: v_mov_b32_e32 v3, s4 diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -1248,8 +1248,8 @@ ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: s_movk_i32 s7, 0x11e ; SI-NEXT: s_mov_b32 s6, 0xab19b207 +; SI-NEXT: s_movk_i32 s7, 0x11e ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1266,8 +1266,8 @@ ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_movk_i32 s1, 0x11e ; VI-NEXT: s_mov_b32 s0, 0xab19b207 +; VI-NEXT: s_movk_i32 s1, 0x11e ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1319,8 +1319,7 @@ ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s7, 0 -; SI-NEXT: s_mov_b32 s6, 0x12d687 +; SI-NEXT: s_mov_b64 s[6:7], 0x12d687 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1337,8 +1336,7 
@@ ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_mov_b32 s1, 0 -; VI-NEXT: s_mov_b32 s0, 0x12d687 +; VI-NEXT: s_mov_b64 s[0:1], 0x12d687 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1927,8 +1925,7 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s1, 0 -; SI-NEXT: s_mov_b32 s0, 4.0 +; SI-NEXT: s_mov_b64 s[0:1], 0x40800000 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1942,8 +1939,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dword s2, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s1, 0 -; VI-NEXT: s_mov_b32 s0, 4.0 +; VI-NEXT: s_mov_b64 s[0:1], 0x40800000 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2039,8 +2035,8 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s1, 4.0 ; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s1, 4.0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2054,8 +2050,8 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dword s2, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s1, 4.0 ; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_mov_b32 s1, 4.0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2090,8 +2086,8 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s1, -4.0 ; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s1, -4.0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2105,8 +2101,8 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dword s2, s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s1, 
-4.0 ; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_mov_b32 s1, -4.0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll --- a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll @@ -589,13 +589,13 @@ ; GCN-LABEL: {{^}}br_scc_eq_i64_simm16: ; VI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x4d2 -; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0 +; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 1 ; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; SI: v_cmp_eq_u64_e32 define amdgpu_kernel void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { entry: - %cmp0 = icmp eq i64 %cond, 1234 + %cmp0 = icmp eq i64 %cond, 4294968530 br i1 %cmp0, label %endif, label %if if: @@ -627,13 +627,13 @@ ; GCN-LABEL: {{^}}br_scc_ne_i64_simm16: ; VI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x4d2 -; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0 +; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 1 ; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; SI: v_cmp_ne_u64_e32 define amdgpu_kernel void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { entry: - %cmp0 = icmp ne i64 %cond, 1234 + %cmp0 = icmp ne i64 %cond, 4294968530 br i1 %cmp0, label %endif, label %if if: diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1876,14 +1876,13 @@ ; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_mov_b32 s8, 0x8000 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] ; GCN-IR-NEXT: 
s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] @@ -1901,8 +1900,7 @@ ; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_mov_b32 s5, 0 -; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1249,14 +1249,13 @@ ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_mov_b32 s8, 0x8000 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v9 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] @@ -1274,8 +1273,7 @@ ; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_mov_b32 s5, 0 -; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -1269,14 +1269,13 @@ ; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_mov_b32 s8, 0x8000 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] @@ -1294,8 +1293,7 @@ ; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB8_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_mov_b32 s5, 0 -; GCN-IR-NEXT: s_mov_b32 s4, 0x8000 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -807,8 +807,8 @@ ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-O3-NEXT: buffer_load_dwordx4 v[1:4], v0, s[4:7], 0 offen ; GFX9-O3-NEXT: buffer_load_dwordx2 v[5:6], v0, s[4:7], 0 offen offset:16 -; GFX9-O3-NEXT: s_brev_b32 s9, -2 ; GFX9-O3-NEXT: s_mov_b32 s8, -1 +; GFX9-O3-NEXT: s_brev_b32 s9, -2 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) ; GFX9-O3-NEXT: v_mov_b32_e32 v1, s8