diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -63,6 +63,7 @@
 FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createSIPreAllocateWWMRegsPass();
 FunctionPass *createSIFormMemoryClausesPass();
+FunctionPass *createSIFixRenamableFlagsPass();
 FunctionPass *createSIPostRABundlerPass();
 
 FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
@@ -177,6 +178,9 @@
 void initializeSIPreAllocateWWMRegsPass(PassRegistry &);
 extern char &SIPreAllocateWWMRegsID;
 
+void initializeSIFixRenamableFlagsPass(PassRegistry &);
+extern char &SIFixRenamableFlagsID;
+
 void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &);
 extern char &AMDGPUSimplifyLibCallsID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -253,6 +253,7 @@
   initializeSIOptimizeExecMaskingPass(*PR);
   initializeSIPreAllocateWWMRegsPass(*PR);
   initializeSIFormMemoryClausesPass(*PR);
+  initializeSIFixRenamableFlagsPass(*PR);
   initializeSIPostRABundlerPass(*PR);
   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
   initializeAMDGPUAAWrapperPassPass(*PR);
@@ -671,6 +672,7 @@
   void addOptimizedRegAlloc() override;
   void addPreRegAlloc() override;
   bool addPreRewrite() override;
+  void addPostRewrite() override;
   void addPostRegAlloc() override;
   void addPreSched2() override;
   void addPreEmitPass() override;
@@ -1025,10 +1027,15 @@
   return true;
 }
 
+void GCNPassConfig::addPostRewrite() {
+  addPass(&SIFixRenamableFlagsID);
+}
+
 void GCNPassConfig::addPostRegAlloc() {
   addPass(&SIFixVGPRCopiesID);
   if (getOptLevel() > CodeGenOpt::None)
     addPass(&SIOptimizeExecMaskingID);
+  TargetPassConfig::addPostRegAlloc();
 
   // Equivalent of PEI for SGPRs.
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -108,6 +108,7 @@
   R600RegisterInfo.cpp
   SIAddIMGInit.cpp
   SIAnnotateControlFlow.cpp
+  SIFixRenamableFlags.cpp
   SIFixSGPRCopies.cpp
   SIFixVGPRCopies.cpp
   SIPreAllocateWWMRegs.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIFixRenamableFlags.cpp b/llvm/lib/Target/AMDGPU/SIFixRenamableFlags.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIFixRenamableFlags.cpp
@@ -0,0 +1,140 @@
+//===- SIFixRenamableFlags.cpp - Fix Renamable Flags Post-RA --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Pass to remove renamable flags that could cause Machine Copy Propagation
+/// to generate constant bus violations or register bank conflicts.
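+///
+/// MachineCopyPropagation only rewrites source operands that are marked
+/// renamable, so clearing the flag on an instruction's VGPR uses prevents
+/// it from substituting sources where no spare constant bus capacity (or
+/// register bank) is available.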
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-renamable-flags"
+
+namespace {
+
+class SIFixRenamableFlags : public MachineFunctionPass {
+public:
+  static char ID;
+
+  SIFixRenamableFlags() : MachineFunctionPass(ID) {
+    initializeSIFixRenamableFlagsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreserved<SlotIndexes>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIFixRenamableFlags, DEBUG_TYPE,
+                      "SI Fix Renamable Flags", false, false)
+INITIALIZE_PASS_END(SIFixRenamableFlags, DEBUG_TYPE,
+                    "SI Fix Renamable Flags", false, false)
+
+char SIFixRenamableFlags::ID = 0;
+
+char &llvm::SIFixRenamableFlagsID = SIFixRenamableFlags::ID;
+
+FunctionPass *llvm::createSIFixRenamableFlagsPass() {
+  return new SIFixRenamableFlags();
+}
+
+bool SIFixRenamableFlags::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "SIFixRenamableFlags: function " << MF.getName()
+                    << "\n");
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const bool HasRegisterBanking = ST.hasRegisterBanking();
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  const SIRegisterInfo &RI = TII->getRegisterInfo();
+
+  bool Changed = false;
+
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      // Only check VALUs.
+      if (!(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOP3(MI) ||
+            TII->isVOPC(MI) || TII->isSDWA(MI) || TII->isVALU(MI)))
+        continue;
+
+      std::pair<unsigned, unsigned> Usage = TII->constantBusUseCount(*MRI, MI);
+      unsigned ConstantBusUses = Usage.first;
+
+      // Count distinct VGPR uses (treating AGPRs as VGPRs).
+      SmallVector<Register, 4> VGPRs;
+      for (auto &Use : MI.uses()) {
+        if (Use.isReg()) {
+          Register Reg = Use.getReg();
+          if (RI.isVGPR(*MRI, Reg) || RI.isAGPR(*MRI, Reg)) {
+            if (llvm::all_of(VGPRs, [Reg](unsigned VGPR) {
+                  return VGPR != Reg;
+                })) {
+              VGPRs.push_back(Reg);
+            }
+          }
+        }
+      }
+
+      // Machine Copy Propagation can change a VGPR to an SGPR and thereby
+      // increase constant bus usage. If there is not enough constant bus
+      // capacity to absorb this, renaming must be disabled.
+      unsigned ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
+      unsigned FreeBusCapacity = ConstantBusLimit - ConstantBusUses;
+
+      // Check for free constant bus capacity to handle renaming. With
+      // register banking, also disable renaming when multiple VGPRs are
+      // read, to avoid introducing bank conflicts.
+      if ((VGPRs.size() <= FreeBusCapacity) &&
+          (!HasRegisterBanking || VGPRs.size() < 2))
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Disable renaming for "
+                        << "(" << FreeBusCapacity << ", " << VGPRs.size()
+                        << "): " << MI);
+
+      // Insufficient bus capacity to handle VGPR->SGPR renaming;
+      // disable renaming for VGPRs in this instruction.
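+      // For example, with a constant bus limit of two and one SGPR source
+      // already counted, an instruction reading two VGPRs has no spare
+      // capacity: if copy propagation renamed both VGPRs into SGPRs, the
+      // instruction would read three constant bus sources.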
+
+      for (auto &Use : MI.uses()) {
+        if (Use.isReg() && Use.isRenamable()) {
+          Register Reg = Use.getReg();
+          if (RI.isVGPR(*MRI, Reg) || RI.isAGPR(*MRI, Reg)) {
+            Use.setIsRenamable(false);
+            Changed = true;
+          }
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -205,9 +205,6 @@
   let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1);
 
   let AsmVariantName = AMDGPUAsmVariants.Default;
-
-  // Avoid changing source registers in a way that violates constant bus read limitations.
-  let hasExtraSrcRegAllocReq = !if(VOP1,1,!if(VOP2,1,!if(VOP3,1,!if(VOPC,1,!if(SDWA,1,!if(VALU,1,0))))));
 }
 
 class PseudoInstSI <dag outs, dag ins, list<dag> pattern = [], string asm = "">
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -786,6 +786,11 @@
                        const MachineOperand &MO,
                        const MCOperandInfo &OpInfo) const;
 
+  /// Return a pair of constant bus use count and literal count for the
+  /// machine instruction.
+  std::pair<unsigned, unsigned>
+  constantBusUseCount(const MachineRegisterInfo &MRI,
+                      const MachineInstr &MI) const;
+
   /// Return true if this instruction has any modifiers.
   ///  e.g. src[012]_mod, omod, clamp.
   bool hasModifiers(unsigned Opcode) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3563,6 +3563,54 @@
          SubReg.getReg() == SuperVec.getReg();
 }
 
+std::pair<unsigned, unsigned>
+SIInstrInfo::constantBusUseCount(const MachineRegisterInfo &MRI,
+                                 const MachineInstr &MI) const {
+  // Only look at the true operands. Only a real operand can use the constant
+  // bus, and we don't want to check pseudo-operands like the source modifier
+  // flags.
+  const uint16_t Opcode = MI.getOpcode();
+  const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+  const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+  const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+  const int OpIndices[] = {Src0Idx, Src1Idx, Src2Idx};
+
+  unsigned ConstantBusCount = 0;
+  unsigned LiteralCount = 0;
+
+  if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
+    ++ConstantBusCount;
+
+  SmallVector<Register, 2> SGPRsUsed;
+  Register SGPRUsed = findImplicitSGPRRead(MI);
+  if (SGPRUsed != AMDGPU::NoRegister) {
+    ++ConstantBusCount;
+    SGPRsUsed.push_back(SGPRUsed);
+  }
+
+  for (int OpIdx : OpIndices) {
+    if (OpIdx == -1)
+      break;
+    const MachineOperand &MO = MI.getOperand(OpIdx);
+    if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
+      if (MO.isReg()) {
+        SGPRUsed = MO.getReg();
+        if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
+              return !RI.regsOverlap(SGPRUsed, SGPR);
+            })) {
+          ++ConstantBusCount;
+          SGPRsUsed.push_back(SGPRUsed);
+        }
+      } else {
+        ++ConstantBusCount;
+        ++LiteralCount;
+      }
+    }
+  }
+
+  return std::pair<unsigned, unsigned>(ConstantBusCount, LiteralCount);
+}
+
 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
                                     StringRef &ErrInfo) const {
   uint16_t Opcode = MI.getOpcode();
@@ -3808,47 +3856,13 @@
   // Verify VOP*. Ignore multiple sgpr operands on writelane.
   if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 &&
       (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
-    // Only look at the true operands. Only a real operand can use the constant
-    // bus, and we don't want to check pseudo-operands like the source modifier
-    // flags.
-    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
-
-    unsigned ConstantBusCount = 0;
-    unsigned LiteralCount = 0;
-
-    if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
-      ++ConstantBusCount;
-
-    SmallVector<Register, 2> SGPRsUsed;
-    Register SGPRUsed = findImplicitSGPRRead(MI);
-    if (SGPRUsed != AMDGPU::NoRegister) {
-      ++ConstantBusCount;
-      SGPRsUsed.push_back(SGPRUsed);
-    }
-
-    for (int OpIdx : OpIndices) {
-      if (OpIdx == -1)
-        break;
-      const MachineOperand &MO = MI.getOperand(OpIdx);
-      if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
-        if (MO.isReg()) {
-          SGPRUsed = MO.getReg();
-          if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
-                return !RI.regsOverlap(SGPRUsed, SGPR);
-              })) {
-            ++ConstantBusCount;
-            SGPRsUsed.push_back(SGPRUsed);
-          }
-        } else {
-          ++ConstantBusCount;
-          ++LiteralCount;
-        }
-      }
-    }
+    const std::pair<unsigned, unsigned> Usage = constantBusUseCount(MRI, MI);
+    const unsigned ConstantBusCount = Usage.first;
+    const unsigned LiteralCount = Usage.second;
+
     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
     // v_writelane_b32 is an exception from constant bus restriction:
     // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
-    if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
-        Opcode != AMDGPU::V_WRITELANE_B32) {
+    if (ConstantBusCount > ST.getConstantBusLimit(Opcode)) {
       ErrInfo = "VOP* instruction violates constant bus restriction";
       return false;
     }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1045,7 +1045,7 @@
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1
 ; GCN-NEXT: s_setpc_b64 s[30:31]
   %result = ashr i64 %value, 32
   ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -8,7 +8,6 @@
 ; GCN-LABEL: v_extract_v64i32_varidx:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v15, v0
 ; GCN-NEXT: s_add_u32 s4, s32, 0x3fc0
 ; GCN-NEXT: s_mov_b32 s5, 0
 ; GCN-NEXT: s_mov_b32 s6, s33
@@ -16,7 +15,8 @@
 ; GCN-NEXT: s_movk_i32 s4, 0x80
 ; GCN-NEXT: v_mov_b32_e32 v12, s5
 ; GCN-NEXT: v_mov_b32_e32 v16, v1
-; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v15
+; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v0
+; GCN-NEXT: v_mov_b32_e32 v15, v0
 ; GCN-NEXT: v_mov_b32_e32 v11, s4
 ; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v16, vcc
 ; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v15, v11
@@ -294,7 +294,6 @@
 ; GCN-LABEL: v_extract_v128i16_varidx:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v15, v0
 ; GCN-NEXT: s_add_u32 s4, s32, 0x3fc0
 ; GCN-NEXT: s_mov_b32 s5, 0
 ; GCN-NEXT: s_mov_b32 s6, s33
@@ -302,7 +301,8 @@
 ; GCN-NEXT: s_movk_i32 s4, 0x80
 ; GCN-NEXT: v_mov_b32_e32 v12, s5
 ; GCN-NEXT: v_mov_b32_e32 v16, v1
-; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v15
+; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v0
+; GCN-NEXT: v_mov_b32_e32 v15, v0
 ; GCN-NEXT: v_mov_b32_e32 v11, s4
 ; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v16, vcc
 ; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v15, v11
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -1009,30 +1009,28 @@
 ; MOVREL-NEXT: v_mov_b32_e32 v1, s0
 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; MOVREL-NEXT: s_mov_b32 s30, s18
-; MOVREL-NEXT: s_mov_b32 s31, s19
 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 5, v0
-; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, s30, s0
-; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, s31, s0
-; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 4, v0
 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v0
 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 6, v0
+; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 4, v0
 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 7, v0
-; MOVREL-NEXT: v_cndmask_b32_e64 v1, v1, s30, vcc_lo
-; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, s31, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo
 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, s30, s0
-; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, s31, s0
-; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, s30, s2
-; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, s31, s2
-; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, s30, s1
-; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, s31, s1
-; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, s30, vcc_lo
-; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, s31, vcc_lo
-; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, s30, s3
-; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, s31, s3
-; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s30, s4
-; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, s31, s4
+; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, s18, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, s19, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, s18, s2
+; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, s19, s2
+; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, s18, s1
+; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, s19, s1
+; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, s18, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, s19, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, s18, s3
+; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, s19, s3
+; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s18, s4
+; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, s19, s4
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[1:4], off
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[5:8], off
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
@@ -72,18 +72,18 @@
 define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x half> %in) {
 ; UNPACKED-LABEL: image_store_v4f16:
 ; UNPACKED: ; %bb.0:
-; UNPACKED-NEXT: v_mov_b32_e32 v6, v1
-; UNPACKED-NEXT: v_mov_b32_e32 v1, v2
 ; UNPACKED-NEXT: s_mov_b32 s0, s2
 ; UNPACKED-NEXT: s_mov_b32 s1, s3
 ; UNPACKED-NEXT: s_mov_b32 s2, s4
 ; UNPACKED-NEXT: s_mov_b32 s3, s5
 ; UNPACKED-NEXT: s_mov_b32 s4, s6
 ; UNPACKED-NEXT: s_mov_b32 s5, s7
+; UNPACKED-NEXT: v_mov_b32_e32 v6, v1
+; UNPACKED-NEXT: v_mov_b32_e32 v1, v2
 ; UNPACKED-NEXT: s_mov_b32 s6, s8
 ; UNPACKED-NEXT: s_mov_b32 s7, s9
 ; UNPACKED-NEXT: v_mov_b32_e32 v5, v0
-; UNPACKED-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; UNPACKED-NEXT: v_lshrrev_b32_e32 v2, 16, v2
 ; UNPACKED-NEXT: v_lshrrev_b32_e32 v4, 16, v3
 ; UNPACKED-NEXT: image_store v[1:4], v[5:6], s[0:7] dmask:0xf unorm
 ; UNPACKED-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -6118,25 +6118,25 @@
 ;
 ; GFX10-LABEL: saddsat_i128_vs:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
 ; GFX10-NEXT: v_mov_b32_e32 v6, v1
 ; GFX10-NEXT: v_mov_b32_e32 v9, v2
+; GFX10-NEXT: v_add_co_u32_e64 v15, vcc_lo, v0, s0
 ; GFX10-NEXT: v_mov_b32_e32 v10, v3
-; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
-; GFX10-NEXT: v_add_co_u32_e64 v15, vcc_lo, v5, s0
-; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], 0
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, s1, v6, vcc_lo
-; GFX10-NEXT: s_cselect_b32 s4, 1, 0
+; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], 0
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, s2, v9, vcc_lo
-; GFX10-NEXT: s_and_b32 s1, 1, s4
+; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, s3, v10, vcc_lo
 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[15:16], v[5:6]
+; GFX10-NEXT: s_cselect_b32 s4, 1, 0
 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[2:3], 0
-; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: s_and_b32 s1, 1, s4
 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v20
 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[19:20], v[9:10]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[19:20], v[9:10]
 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -6103,25 +6103,25 @@
 ;
 ; GFX10-LABEL: ssubsat_i128_vs:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
 ; GFX10-NEXT: v_mov_b32_e32 v6, v1
 ; GFX10-NEXT: v_mov_b32_e32 v9, v2
+; GFX10-NEXT: v_sub_co_u32_e64 v15, vcc_lo, v0, s0
 ; GFX10-NEXT: v_mov_b32_e32 v10, v3
-; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
-; GFX10-NEXT: v_sub_co_u32_e64 v15, vcc_lo, v5, s0
-; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v16, vcc_lo, s1, v6, vcc_lo
-; GFX10-NEXT: s_cselect_b32 s4, 1, 0
+; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0
 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v19, vcc_lo, s2, v9, vcc_lo
-; GFX10-NEXT: s_and_b32 s1, 1, s4
+; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v20, vcc_lo, s3, v10, vcc_lo
 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[15:16], v[5:6]
+; GFX10-NEXT: s_cselect_b32 s4, 1, 0
 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
 ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0
-; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: s_and_b32 s1, 1, s4
 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v20
 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[19:20], v[9:10]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[19:20], v[9:10]
 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
@@ -11,7 +11,7 @@
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b32_e32 v2, v1
-; GCN-NEXT: v_not_b32_e32 v1, v2
+; GCN-NEXT: v_not_b32_e32 v1, v1
 ; GCN-NEXT: v_or_b32_e32 v1, -5, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
@@ -39,7 +39,7 @@
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_mov_b32_e32 v3, v2
-; GCN-NEXT: v_not_b32_e32 v2, v3
+; GCN-NEXT: v_not_b32_e32 v2, v2
 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2
 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
@@ -67,7 +67,7 @@
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT: v_mov_b32_e32 v3, v2
-; GCN-NEXT: v_not_b32_e32 v2, v3
+; GCN-NEXT: v_not_b32_e32 v2, v2
 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2
 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
--- a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
@@ -129,12 +129,13 @@
 ; GCN-LABEL: ds_read32_combine_stride_8192_shifted:
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
-; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
+; VI-DAG: v_add_u32_e64 [[B1:v[0-9]+]], vcc, [[ARG]], 8
+; VI-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; GFX9: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]
@@ -208,12 +209,13 @@
 ; GCN-LABEL: ds_read64_combine_stride_8192_shifted:
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
-; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
+; VI-DAG: v_add_u32_e64 [[B1:v[0-9]+]], vcc, [[ARG]], 8
+; VI-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; GFX9: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]
@@ -346,12 +348,13 @@
 ; GCN-LABEL: ds_write32_combine_stride_8192_shifted:
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
-; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]]
+; VI-DAG: v_add_u32_e64 [[B1:v[0-9]+]], vcc, [[ARG]], 4
+; VI-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; GFX9: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 4, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4004, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8004, [[BASE]]
@@ -409,12 +412,13 @@
 ; GCN-LABEL: ds_write64_combine_stride_8192_shifted:
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
-; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
+; VI-DAG: v_add_u32_e64 [[B1:v[0-9]+]], vcc, [[ARG]], 8
+; VI-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; GFX9: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -83,12 +83,12 @@
 ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (load 8 from %stack.5, align 4, addrspace 5)
 ; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
 ; GCN: $vgpr1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
-; GCN: renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
-; GCN: renamable $sgpr4_sgpr5 = V_CMP_EQ_U32_e64 $sgpr2, killed $vgpr1, implicit $exec
+; GCN: renamable $sgpr2 = V_READFIRSTLANE_B32 renamable $vgpr1, implicit $exec
+; GCN: renamable $sgpr4_sgpr5 = V_CMP_EQ_U32_e64 renamable $sgpr2, killed renamable $vgpr1, implicit $exec
 ; GCN: renamable $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GCN: S_SET_GPR_IDX_ON killed renamable $sgpr2, 1, implicit-def $m0, implicit-def undef $mode, implicit $m0, implicit $mode
 ; GCN: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = SI_SPILL_V512_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 64 from %stack.2, align 4, addrspace 5)
-; GCN: renamable $vgpr18 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, implicit $m0
+; GCN: renamable $vgpr18 = V_MOV_B32_e32 renamable $vgpr3, implicit $exec, implicit killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, implicit $m0
 ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
 ; GCN: renamable $vgpr19 = COPY renamable $vgpr18
 ; GCN: renamable $sgpr2_sgpr3 = COPY renamable $sgpr4_sgpr5
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
@@ -37,7 +37,7 @@
   br label %endif
 
 endif:
-  %v = phi i32 [ %val, %if-true ], [ undef, %entry ]
+  %v = phi i32 [ %val, %if-true ], [ %value, %entry ]
   %r = bitcast i32 %v to float
   ret float %r
 }
diff --git a/llvm/test/CodeGen/AMDGPU/machine-cp-cndmask.mir b/llvm/test/CodeGen/AMDGPU/machine-cp-cndmask.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-cp-cndmask.mir
@@ -0,0 +1,49 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -start-before=greedy -stop-after=machine-cp -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: remove_copy_cndmask
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_64_xexec }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sgpr_256 }
+  - { id: 6, class: sgpr_128 }
+  - { id: 7, class: vgpr_32 }
+  - { id: 8, class: sreg_64 }
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: remove_copy_cndmask
+    ; CHECK: renamable $sgpr2_sgpr3 = COPY $exec
+    ; CHECK: renamable $sgpr0 = S_MOV_B32 0
+    ; CHECK: $exec = S_WQM_B64 $exec, implicit-def $scc
+    ; CHECK: renamable $vgpr0 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr2_sgpr3, implicit $exec
+    ; CHECK: $exec = S_AND_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
+    ; CHECK: renamable $sgpr1 = COPY renamable $sgpr0
+    ; CHECK: renamable $sgpr2 = COPY renamable $sgpr0
+    ; CHECK: renamable $sgpr3 = COPY renamable $sgpr0
+    ; CHECK: renamable $sgpr4 = COPY renamable $sgpr0
+    ; CHECK: renamable $sgpr5 = COPY renamable $sgpr0
+    ; CHECK: renamable $sgpr6 = COPY renamable $sgpr0
+    ; CHECK: renamable $sgpr7 = COPY renamable $sgpr0
+    ; CHECK: renamable $vgpr0 = IMAGE_SAMPLE_V1_V1 killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    ; CHECK: SI_RETURN_TO_EPILOG $vgpr0
+    %8:sreg_64 = COPY $exec
+    undef %4.sub0:sgpr_256 = S_MOV_B32 0
+    %1:sreg_64_xexec = COPY %8:sreg_64
+    $exec = S_WQM_B64 $exec, implicit-def $scc
+    %2:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %1:sreg_64_xexec, implicit $exec
+    $exec = S_AND_B64 $exec, %8:sreg_64, implicit-def $scc
+    %4.sub1:sgpr_256 = COPY %4.sub0:sgpr_256
+    %4.sub2:sgpr_256 = COPY %4.sub0:sgpr_256
+    %4.sub3:sgpr_256 = COPY %4.sub0:sgpr_256
+    %4.sub4:sgpr_256 = COPY %4.sub0:sgpr_256
+    %4.sub5:sgpr_256 = COPY %4.sub0:sgpr_256
+    %4.sub6:sgpr_256 = COPY %4.sub0:sgpr_256
+    %4.sub7:sgpr_256 = COPY %4.sub0:sgpr_256
+    %7:vgpr_32 = IMAGE_SAMPLE_V1_V1 %2:vgpr_32, %4:sgpr_256, undef %6:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    $vgpr0 = COPY %7:vgpr_32
+    SI_RETURN_TO_EPILOG killed $vgpr0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -74,7 +74,7 @@
 ; GCN-NEXT: ; Parent Loop BB0_2 Depth=1
 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2
 ; GCN-NEXT: v_mov_b32_e32 v1, v0
-; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0
 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, v1, v4
 ; GCN-NEXT: s_or_b64 s[2:3], s[2:3], exec
 ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec
diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir
--- a/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir
+++ b/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter,si-fix-renamable-flags -o - %s | FileCheck -check-prefix=GCN %s
 
 # Test that subreg reassignments are correctly handled when whole register also
@@ -20,8 +20,8 @@
 # GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
 # GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
 # GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
-# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr5, $vcc, implicit $exec
-# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr4, killed $vcc, implicit $exec
+# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr5, renamable $vcc, implicit $exec
+# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr4, killed renamable $vcc, implicit $exec
 # GCN: $sgpr0_sgpr1 = V_CMP_LT_U64_e64 $vgpr4_vgpr5, $vgpr0_vgpr1, implicit $exec
 ---
 name: vgpr64_mixed_use
diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
--- a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
+++ b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter,si-fix-renamable-flags -o - %s | FileCheck -check-prefix=GCN %s
 
 # GCN-LABEL: v1_vs_v5{{$}}
 # GCN: V_AND_B32_e32 killed $vgpr3, killed $vgpr1,
@@ -50,7 +50,7 @@
 ...
 
 # GCN-LABEL: s11_vs_vcc{{$}}
-# GCN: $vgpr0, $vcc_lo = V_ADDC_U32_e64 killed $sgpr14, killed $vgpr0, killed $vcc_lo, 0
+# GCN: $vgpr0, $vcc_lo = V_ADDC_U32_e64 killed renamable $sgpr14, killed $vgpr0, killed $vcc_lo, 0
 ---
 name: s11_vs_vcc
 tracksRegLiveness: true
@@ -187,7 +187,7 @@
 ...
 # GCN-LABEL: implicit{{$}}
-# GCN: V_MOV_B32_indirect undef $vgpr4, undef $vgpr0, implicit $exec, implicit-def dead renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr4_vgpr5_vgpr6_vgpr7, implicit $m0
+# GCN: V_MOV_B32_indirect undef $vgpr4, undef $vgpr0, implicit $exec, implicit-def dead $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr4_vgpr5_vgpr6_vgpr7, implicit $m0
 ---
 name: implicit
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/ret.ll b/llvm/test/CodeGen/AMDGPU/ret.ll
--- a/llvm/test/CodeGen/AMDGPU/ret.ll
+++ b/llvm/test/CodeGen/AMDGPU/ret.ll
@@ -5,7 +5,7 @@
 ; GCN-DAG: v_mov_b32_e32 v1, v0
 ; GCN-DAG: exp mrt0 v0, v0, v0, v0 done vm
 ; GCN: s_waitcnt expcnt(0)
-; GCN: v_add_f32_e32 v0, 1.0, v1
+; GCN: v_add_f32_e32 v0, 1.0, v{{[0-1]}}
 ; GCN-NOT: s_endpgm
 define amdgpu_vs { float, float } @vgpr([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
@@ -207,7 +207,7 @@
 ; GCN-DAG: v_mov_b32_e32 v1, v0
 ; GCN-DAG: s_mov_b32 s1, s2
 ; GCN-DAG: s_waitcnt expcnt(0)
-; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
+; GCN-DAG: v_add_f32_e32 v0, 1.0, v{{[0-1]}}
 ; GCN-DAG: s_add_{{i|u}}32 s0, s3, 2
 ; GCN-DAG: s_mov_b32 s2, s3
 ; GCN-NOT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
@@ -34,7 +34,7 @@
 # SHARE: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
 
 # SHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5)
-# SHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+# SHARE: SI_SPILL_V32_SAVE killed renamable $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
 # SHARE: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5)
 # SHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5)
 # SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit undef $vgpr0
@@ -59,7 +59,7 @@
 # NOSHARE: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
 
 # NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5)
-# NOSHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+# NOSHARE: SI_SPILL_V32_SAVE killed renamable $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
 # NOSHARE: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5)
 # NOSHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5)
 # NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit undef $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
--- a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
+++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
@@ -9,7 +9,7 @@
 # CHECK: - { id: 1, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4,
 # CHECK-NEXT: stack-id: sgpr-spill,
 
-# CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+# CHECK: SI_SPILL_V32_SAVE killed renamable $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
 # CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
 
 # CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr5, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.1, addrspace 5)
diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -83,15 +83,15 @@
 ; GCN: bb.0.entry:
 ; GCN:   successors: %bb.1(0x40000000), %bb.4(0x40000000)
 ; GCN:   liveins: $vgpr0
-; GCN:   renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec
-; GCN:   nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec
+; GCN:   renamable $vgpr1 = nofpexcept V_RCP_F32_e32 renamable $vgpr0, implicit $mode, implicit $exec
+; GCN:   nofpexcept V_CMP_NGT_F32_e32 0, killed renamable $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec
 ; GCN:   $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GCN:   renamable $sgpr0_sgpr1 = S_XOR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
 ; GCN:   S_CBRANCH_EXECZ %bb.4, implicit $exec
 ; GCN: bb.1.flow.preheader:
 ; GCN:   successors: %bb.2(0x80000000)
 ; GCN:   liveins: $vgpr0, $sgpr0_sgpr1
-; GCN:   nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec
+; GCN:   nofpexcept V_CMP_NGT_F32_e32 0, killed renamable $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec
 ; GCN:   renamable $sgpr2_sgpr3 = S_MOV_B64 0
 ; GCN: bb.2.flow:
 ; GCN:   successors: %bb.3(0x04000000), %bb.2(0x7c000000)