Skip to content

Commit 2d6a230

Browse files
committed Oct 16, 2019
[AMDGPU] Fix-up cases where writelane has 2 SGPR operands
Summary: Even though writelane doesn't have the same constraints as other VALU instructions, it still can't violate the >1 SGPR operand constraint. Due to later register propagation (e.g. fixing up VGPR operands via readfirstlane), changing writelane to only have a single SGPR is tricky. This implementation puts a new check after SIFixSGPRCopies that prevents multiple SGPRs being used in any writelane instructions. The algorithm used is to check for trivial copy prop of suitable constants into one of the SGPR operands and perform that if possible. If this isn't possible, put an explicit copy of the Src1 SGPR into M0 and use that instead (this is allowable for writelane as the constraint is for SGPR read-port and not constant-bus access).
Reviewers: rampitec, tpr, arsenm, nhaehnle
Reviewed By: rampitec, arsenm, nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, mgorny, yaxunl, tpr, t-tye, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D51932
Change-Id: Ic7553fa57440f208d4dbc4794fc24345d7e0e9ea
llvm-svn: 375004
1 parent c14f1ea commit 2d6a230

File tree

4 files changed

+100
-8
lines changed

4 files changed

+100
-8
lines changed
 

‎llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

+61
Original file line numberDiff line numberDiff line change
@@ -684,6 +684,67 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
684684
}
685685
break;
686686
}
687+
case AMDGPU::V_WRITELANE_B32: {
688+
// Some architectures allow more than one constant bus access without
689+
// SGPR restriction
690+
if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
691+
break;
692+
693+
// Writelane is special in that it can use SGPR and M0 (which would
694+
// normally count as using the constant bus twice - but in this case it
695+
// is allowed since the lane selector doesn't count as a use of the
696+
// constant bus). However, it is still required to abide by the 1 SGPR
697+
// rule. Apply a fix here as we might have multiple SGPRs after
698+
// legalizing VGPRs to SGPRs
699+
int Src0Idx =
700+
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
701+
int Src1Idx =
702+
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
703+
MachineOperand &Src0 = MI.getOperand(Src0Idx);
704+
MachineOperand &Src1 = MI.getOperand(Src1Idx);
705+
706+
// Check to see if the instruction violates the 1 SGPR rule
707+
if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
708+
Src0.getReg() != AMDGPU::M0) &&
709+
(Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
710+
Src1.getReg() != AMDGPU::M0)) {
711+
712+
// Check for trivially easy constant prop into one of the operands
713+
// If this is the case then perform the operation now to resolve SGPR
714+
// issue. If we don't do that here we will always insert a mov to m0
715+
// that can't be resolved in later operand folding pass
716+
bool Resolved = false;
717+
for (MachineOperand *MO : {&Src0, &Src1}) {
718+
if (Register::isVirtualRegister(MO->getReg())) {
719+
MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
720+
if (DefMI && TII->isFoldableCopy(*DefMI)) {
721+
const MachineOperand &Def = DefMI->getOperand(0);
722+
if (Def.isReg() &&
723+
MO->getReg() == Def.getReg() &&
724+
MO->getSubReg() == Def.getSubReg()) {
725+
const MachineOperand &Copied = DefMI->getOperand(1);
726+
if (Copied.isImm() &&
727+
TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
728+
MO->ChangeToImmediate(Copied.getImm());
729+
Resolved = true;
730+
break;
731+
}
732+
}
733+
}
734+
}
735+
}
736+
737+
if (!Resolved) {
738+
// Haven't managed to resolve by replacing an SGPR with an immediate
739+
// Move src1 to be in M0
740+
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
741+
TII->get(AMDGPU::COPY), AMDGPU::M0)
742+
.add(Src1);
743+
Src1.ChangeToRegister(AMDGPU::M0, false);
744+
}
745+
}
746+
break;
747+
}
687748
}
688749
}
689750
}

‎llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+26
Original file line numberDiff line numberDiff line change
@@ -3495,6 +3495,32 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
34953495
}
34963496
}
34973497

3498+
// Special case for writelane - this can break the multiple constant bus rule,
3499+
// but still can't use more than one SGPR register
3500+
if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
3501+
unsigned SGPRCount = 0;
3502+
Register SGPRUsed = AMDGPU::NoRegister;
3503+
3504+
for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
3505+
if (OpIdx == -1)
3506+
break;
3507+
3508+
const MachineOperand &MO = MI.getOperand(OpIdx);
3509+
3510+
if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
3511+
if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
3512+
if (MO.getReg() != SGPRUsed)
3513+
++SGPRCount;
3514+
SGPRUsed = MO.getReg();
3515+
}
3516+
}
3517+
if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
3518+
ErrInfo = "WRITELANE instruction violates constant bus restriction";
3519+
return false;
3520+
}
3521+
}
3522+
}
3523+
34983524
// Verify misc. restrictions on specific instructions.
34993525
if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
35003526
Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {

‎llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir

+2-1
Original file line numberDiff line numberDiff line change
@@ -317,8 +317,9 @@ body: |
317317
S_BRANCH %bb.3
318318
319319
bb.3:
320+
$m0 = S_MOV_B32 $sgpr4
320321
$vgpr0,implicit $vcc = V_ADD_I32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
321-
$vgpr4 = V_WRITELANE_B32 $sgpr4, $vcc_lo, $vgpr4
322+
$vgpr4 = V_WRITELANE_B32 $m0, $vcc_lo, $vgpr4
322323
S_ENDPGM 0
323324
324325
...

‎llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll

+11-7
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1-
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck %s
2-
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck %s
1+
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,CI,CIGFX9 %s
2+
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX9,CIGFX9 %s
3+
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
34

45
declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
56

67
; CHECK-LABEL: {{^}}test_writelane_sreg:
7-
; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
8+
; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
9+
; GFX10: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
810
define amdgpu_kernel void @test_writelane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
911
%oldval = load i32, i32 addrspace(1)* %out
1012
%writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
@@ -35,11 +37,11 @@ define amdgpu_kernel void @test_writelane_vreg_lane(i32 addrspace(1)* %out, <2 x
3537
ret void
3638
}
3739

38-
; TODO: m0 should be folded.
3940
; CHECK-LABEL: {{^}}test_writelane_m0_sreg:
4041
; CHECK: s_mov_b32 m0, -1
4142
; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
42-
; CHECK: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], s{{[0-9]+}}
43+
; CIGFX9: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], m0
44+
; GFX10: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], s{{[0-9]+}}
4345
define amdgpu_kernel void @test_writelane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
4446
%oldval = load i32, i32 addrspace(1)* %out
4547
%m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
@@ -59,7 +61,8 @@ define amdgpu_kernel void @test_writelane_imm(i32 addrspace(1)* %out, i32 %src0)
5961

6062
; CHECK-LABEL: {{^}}test_writelane_sreg_oldval:
6163
; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], s{{[0-9]+}}
62-
; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
64+
; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
65+
; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
6366
define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
6467
%writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
6568
store i32 %writelane, i32 addrspace(1)* %out, align 4
@@ -68,7 +71,8 @@ define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, i32 add
6871

6972
; CHECK-LABEL: {{^}}test_writelane_imm_oldval:
7073
; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], 42
71-
; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
74+
; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
75+
; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
7276
define amdgpu_kernel void @test_writelane_imm_oldval(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
7377
%writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 42)
7478
store i32 %writelane, i32 addrspace(1)* %out, align 4

0 commit comments

Comments
 (0)