Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -787,6 +787,56 @@
       }
       break;
     }
+    case AMDGPU::V_WRITELANE_B32: {
+      // Some architectures allow more than one constant bus access without
+      // SGPR restriction
+      if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
+        break;
+
+      // Writelane is special in that it can use SGPR and M0 (which would
+      // normally
+      // count as using the constant bus twice - but in this case it is
+      // allowed as the lane selector doesn't count as a use of the constant
+      // bus). However, it is still required to abide by the 1 SGPR rule. Apply
+      // a fix here as we might have multiple SGPRs after legalizing VGPRs to
+      // SGPRs
+      int Src0Idx =
+          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+      int Src1Idx =
+          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
+      MachineOperand &Src0 = MI.getOperand(Src0Idx);
+      MachineOperand &Src1 = MI.getOperand(Src1Idx);
+
+      // Check to see if the instruction violates the 1 SGPR rule
+      if ((Src0.isReg() && TRI->isSGPRReg(MRI, Src0.getReg()) &&
+           Src0.getReg() != AMDGPU::M0) &&
+          (Src1.isReg() && TRI->isSGPRReg(MRI, Src1.getReg()) &&
+           Src1.getReg() != AMDGPU::M0)) {
+
+        // Check for trivially easy constant prop into one of the operands
+        // If this is the case then perform the operation now to resolve SGPR
+        // issue
+        bool Resolved = false;
+        for (auto MO : {&Src0, &Src1}) {
+          auto Imm = TII->foldToImm(*MO, &MRI);
+          if (Imm && TII->isInlineConstant(APInt(64, *Imm, true))) {
+            MO->ChangeToImmediate(*Imm);
+            Resolved = true;
+            break;
+          }
+        }
+
+        if (!Resolved) {
+          // Haven't managed to resolve by replacing an SGPR with an immediate
+          // Move src1 to be in M0
+          BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+                  TII->get(AMDGPU::COPY), AMDGPU::M0)
+              .add(Src1);
+          Src1.ChangeToRegister(AMDGPU::M0, false);
+        }
+      }
+      break;
+    }
     }
   }
 }
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -1018,6 +1018,11 @@
   }
 
   void fixImplicitOperands(MachineInstr &MI) const;
+
+  /// \brief Return immediate value of operand if possible to do so
+  Optional<int64_t> foldToImm(const MachineOperand &Op,
+                              const MachineRegisterInfo *MRI) const;
+
 };
 
 /// \brief Returns true if a reg:subreg pair P has a TRC class
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3432,6 +3432,33 @@
     }
   }
 
+  // Special case for writelane - this can break the multiple constant bus rule,
+  // but still can't use more than one SGPR register
+  if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
+
+    unsigned SGPRCount = 0;
+    unsigned SGPRUsed = AMDGPU::NoRegister;
+
+    for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
+      if (OpIdx == -1)
+        break;
+
+      const MachineOperand &MO = MI.getOperand(OpIdx);
+
+      if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
+        if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
+          if (MO.getReg() != SGPRUsed)
+            ++SGPRCount;
+          SGPRUsed = MO.getReg();
+        }
+      }
+      if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
+        ErrInfo = "WRITELANE instruction violates constant bus restriction";
+        return false;
+      }
+    }
+  }
+
   // Verify misc. restrictions on specific instructions.
   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
@@ -6447,3 +6474,38 @@
 }
 
 bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
+
+static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
+  return LHS.isReg() &&
+         RHS.isReg() &&
+         LHS.getReg() == RHS.getReg() &&
+         LHS.getSubReg() == RHS.getSubReg();
+}
+
+Optional<int64_t> SIInstrInfo::foldToImm(const MachineOperand &Op,
+                                         const MachineRegisterInfo *MRI) const {
+  if (Op.isImm()) {
+    return Op.getImm();
+  }
+
+  // If this is not immediate then it can be copy of immediate value, e.g.:
+  // %1 = S_MOV_B32 255;
+  if (Op.isReg()) {
+    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
+      if (!isSameReg(Op, Def))
+        continue;
+
+      const MachineInstr *DefInst = Def.getParent();
+      if (!isFoldableCopy(*DefInst))
+        return None;
+
+      const MachineOperand &Copied = DefInst->getOperand(1);
+      if (!Copied.isImm())
+        return None;
+
+      return Copied.getImm();
+    }
+  }
+
+  return None;
+}
Index: lib/Target/AMDGPU/SIPeepholeSDWA.cpp
===================================================================
--- lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -77,8 +77,6 @@
   std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
   SmallVector<MachineInstr *, 8> ConvertedInstructions;
 
-  Optional<int64_t> foldToImm(const MachineOperand &Op) const;
-
 public:
   static char ID;
 
@@ -519,33 +517,6 @@
   return SDWADstOperand::convertToSDWA(MI, TII);
 }
 
-Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
-  if (Op.isImm()) {
-    return Op.getImm();
-  }
-
-  // If this is not immediate then it can be copy of immediate value, e.g.:
-  // %1 = S_MOV_B32 255;
-  if (Op.isReg()) {
-    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
-      if (!isSameReg(Op, Def))
-        continue;
-
-      const MachineInstr *DefInst = Def.getParent();
-      if (!TII->isFoldableCopy(*DefInst))
-        return None;
-
-      const MachineOperand &Copied = DefInst->getOperand(1);
-      if (!Copied.isImm())
-        return None;
-
-      return Copied.getImm();
-    }
-  }
-
-  return None;
-}
-
 std::unique_ptr<SDWAOperand>
 SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
   unsigned Opcode = MI.getOpcode();
@@ -565,7 +536,7 @@
     // from: v_lshlrev_b32_e32 v1, 16/24, v0
     // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
     MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
-    auto Imm = foldToImm(*Src0);
+    auto Imm = TII->foldToImm(*Src0, MRI);
     if (!Imm)
       break;
 
@@ -606,7 +577,7 @@
     // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
     MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
-    auto Imm = foldToImm(*Src0);
+    auto Imm = TII->foldToImm(*Src0, MRI);
     if (!Imm || *Imm != 8)
       break;
 
@@ -646,12 +617,12 @@
     // 24    | 8      | BYTE_3
 
     MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-    auto Offset = foldToImm(*Src1);
+    auto Offset = TII->foldToImm(*Src1, MRI);
     if (!Offset)
       break;
 
     MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
-    auto Width = foldToImm(*Src2);
+    auto Width = TII->foldToImm(*Src2, MRI);
     if (!Width)
       break;
@@ -694,10 +665,10 @@
     MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
     MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
     auto ValSrc = Src1;
    auto Imm = TII->foldToImm(*Src0, MRI);
 
-    auto Imm = foldToImm(*Src0);
+    auto Imm = TII->foldToImm(*Src0, MRI);
     if (!Imm) {
-      Imm = foldToImm(*Src1);
+      Imm = TII->foldToImm(*Src1, MRI);
       ValSrc = Src0;
     }
Index: test/CodeGen/AMDGPU/inserted-wait-states.mir
===================================================================
--- test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -317,8 +317,9 @@
     S_BRANCH %bb.3
 
   bb.3:
+    $m0 = S_MOV_B32 $sgpr4
     $vgpr0,implicit $vcc = V_ADD_I32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
-    $vgpr4 = V_WRITELANE_B32 $sgpr4, $vcc_lo, $vgpr4
+    $vgpr4 = V_WRITELANE_B32 $m0, $vcc_lo, $vgpr4
     S_ENDPGM 0
 
 ...
Index: test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -1,10 +1,12 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,CI,CIGFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX9,CIGFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
 
 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
 
 ; CHECK-LABEL: {{^}}test_writelane_sreg:
-; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
+; GFX10: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
   %oldval = load i32, i32 addrspace(1)* %out
   %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
@@ -35,11 +37,11 @@
   ret void
 }
 
-; TODO: m0 should be folded.
 ; CHECK-LABEL: {{^}}test_writelane_m0_sreg:
 ; CHECK: s_mov_b32 m0, -1
 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
-; CHECK: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], s{{[0-9]+}}
+; CIGFX9: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], m0
+; GFX10: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
   %oldval = load i32, i32 addrspace(1)* %out
   %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
@@ -59,7 +61,8 @@
 
 ; CHECK-LABEL: {{^}}test_writelane_sreg_oldval:
 ; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], s{{[0-9]+}}
-; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
+; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
+; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
   %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
   store i32 %writelane, i32 addrspace(1)* %out, align 4
@@ -68,7 +71,8 @@
 
 ; CHECK-LABEL: {{^}}test_writelane_imm_oldval:
 ; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], 42
-; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
+; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
+; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @test_writelane_imm_oldval(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
   %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 42)
   store i32 %writelane, i32 addrspace(1)* %out, align 4