
Commit 92e01ee

Committed Nov 7, 2016
[AMDGPU] Allow hoisting of comparisons out of a loop and eliminate condition copies
CodeGenPrepare sinks comparisons close to their users when the target has only one register for conditions. For AMDGPU we have many SGPRs capable of holding vector conditions, so the backend now reports that it has multiple condition registers. That way the IR LICM pass hoists an invariant comparison out of a loop, and CodeGenPrepare does not sink it back. With that done, a condition is computed in one block and used in another. The previous behavior was to store a workitem's condition in a VGPR using v_cndmask and then restore it with yet another v_cmp instruction on that v_cndmask's result. To eliminate this round trip, the 64-bit result of a v_cmp is now forward-propagated to its users. As a side effect we may consume fewer VGPRs at the cost of more SGPRs when multiple conditions have to be kept live, which is a clear win in most cases.

llvm-svn: 286171
1 parent 0298082
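
For readers unfamiliar with the hook: the leverage point is that CodeGenPrepare consults TargetLowering::hasMultipleConditionRegisters() before deciding to sink a compare into its users' blocks. Below is a simplified paraphrase of that decision, not the verbatim LLVM source; sinkCmpIntoUserBlocks is a hypothetical stand-in for the real duplication logic.

#include "llvm/IR/Instructions.h"       // CmpInst
#include "llvm/Target/TargetLowering.h" // TargetLowering (2016-era header path)

using namespace llvm;

// Hypothetical stand-in for CodeGenPrepare's real compare-sinking code.
static bool sinkCmpIntoUserBlocks(CmpInst *CI);

// With a single flag-style condition register, duplicating the compare next
// to each user keeps the flags live range short. With many condition
// registers (AMDGPU's SGPR pairs) that rewrite is unnecessary, so a compare
// hoisted by LICM stays hoisted.
static bool optimizeCmpExpression(CmpInst *CI, const TargetLowering *TLI) {
  if (TLI && TLI->hasMultipleConditionRegisters())
    return false; // Leave the compare where LICM placed it.
  return sinkCmpIntoUserBlocks(CI);
}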

4 files changed, +75 -8 lines
 

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+1)
@@ -440,6 +440,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setSchedulingPreference(Sched::RegPressure);
   setJumpIsExpensive(true);
+  setHasMultipleConditionRegisters(true);
 
   // SI at least has hardware support for floating point exceptions, but no way
   // of using or handling them is implemented. They are also optional in OpenCL
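
For reference, the hook set above is a plain boolean on TargetLoweringBase. The sketch below is paraphrased from the 2016-era llvm/include/llvm/Target/TargetLowering.h; comments are trimmed and member details may differ slightly from the exact revision.

class TargetLoweringBase {
public:
  // True if the target has multiple (allocatable) condition registers, so
  // comparisons need not be sunk next to their users.
  bool hasMultipleConditionRegisters() const {
    return HasMultipleConditionRegisters;
  }

protected:
  // Called from a target's constructor, as the diff above does for AMDGPU.
  void setHasMultipleConditionRegisters(bool hasManyRegs = true) {
    HasMultipleConditionRegisters = hasManyRegs;
  }

private:
  bool HasMultipleConditionRegisters = false; // Default: one condition register.
};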

llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp (+25 -5)
@@ -121,11 +121,31 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
         }
       }
 
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
-        .addOperand(Dst)
-        .addImm(0)
-        .addImm(-1)
-        .addOperand(Src);
+      // If there are uses which are just a copy back from this new VReg_1
+      // to another SGPR_64, just forward propagate the original SGPR_64.
+      SmallVector<MachineInstr *, 4> RegUses;
+      for (auto &Use : MRI.use_instructions(Dst.getReg()))
+        if (Use.isFullCopy())
+          RegUses.push_back(&Use);
+
+      while (!RegUses.empty()) {
+        MachineInstr *Use = RegUses.pop_back_val();
+        if (Use->getOperand(1).getReg() == Dst.getReg()) {
+          unsigned RegCopy = Use->getOperand(0).getReg();
+          if (!TargetRegisterInfo::isVirtualRegister(RegCopy))
+            continue;
+          Use->eraseFromParent();
+          MRI.replaceRegWith(RegCopy, Src.getReg());
+        }
+      }
+
+      if (!MRI.use_empty(Dst.getReg()))
+        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
+          .addOperand(Dst)
+          .addImm(0)
+          .addImm(-1)
+          .addOperand(Src);
+
       MI.eraseFromParent();
     } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
                SrcRC == &AMDGPU::VReg_1RegClass) {
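
A note on the shape of the new code, restated as a minimal sketch (assuming, as in the pass above, Dst is the VReg_1 def and Src the original SGPR_64 condition): the uses are snapshotted into a SmallVector before any mutation because erasing a COPY while walking MRI.use_instructions() would invalidate the use iterator, and the V_CNDMASK serialization is emitted only if some non-copy use of Dst survives the forwarding.

// Sketch only; this mirrors the committed logic rather than extending it.
SmallVector<MachineInstr *, 4> FullCopies;
for (MachineInstr &UseMI : MRI.use_instructions(Dst.getReg()))
  if (UseMI.isFullCopy())                    // Plain whole-register COPY.
    FullCopies.push_back(&UseMI);

for (MachineInstr *Copy : FullCopies) {
  unsigned CopyDst = Copy->getOperand(0).getReg();
  if (!TargetRegisterInfo::isVirtualRegister(CopyDst))
    continue;                                // Leave physical registers alone.
  Copy->eraseFromParent();                   // Drop the copy back to SGPR_64...
  MRI.replaceRegWith(CopyDst, Src.getReg()); // ...and read Src directly.
}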

llvm/test/CodeGen/AMDGPU/branch-relaxation.ll (+3 -3)
@@ -90,7 +90,7 @@ bb3:
 ; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_vcnd_branch:
 ; GCN: s_load_dword [[CND:s[0-9]+]]
 ; GCN-DAG: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
-; GCN-DAG: v_cmp_eq_f32_e64 vcc, [[CND]], 0
+; GCN-DAG: v_cmp_eq_f32_e64 {{vcc|(s\[[0-9]+:[0-9]+\])}}, [[CND]], 0
 ; GCN: s_cbranch_vccz [[LONGBB:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
@@ -492,8 +492,8 @@ ret:
 ; GCN: s_setpc_b64
 
 ; GCN: [[LONG_BR_DEST0]]
-; GCN: s_cmp_eq_u32
-; GCN-NEXT: s_cbranch_scc0
+; GCN: v_cmp_ne_u32_e32
+; GCN-NEXT: s_cbranch_vccz
 ; GCN: s_setpc_b64
 
 ; GCN: s_endpgm

llvm/test/CodeGen/AMDGPU/host-cond.ll (new file, +46)
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; Check that the invariant compare is hoisted out of the loop.
+; At the same time the condition must not be serialized into a VGPR and deserialized
+; later using another v_cmp + v_cndmask, but used directly in s_and_saveexec_b64.
+
+; CHECK: v_cmp_{{..}}_u32_e64 [[COND:s\[[0-9]+:[0-9]+\]]]
+; CHECK: BB0_1:
+; CHECK-NOT: v_cmp
+; CHECK-NOT: v_cndmask
+; CHECK: s_and_saveexec_b64 s[{{[0-9]+:[0-9]+}}], [[COND]]
+; CHECK: BB0_2:
+
+define amdgpu_kernel void @hoist_cond(float addrspace(1)* nocapture %arg, float addrspace(1)* noalias nocapture readonly %arg1, i32 %arg3, i32 %arg4) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+  %tmp5 = icmp ult i32 %tmp, %arg3
+  br label %bb1
+
+bb1:                                              ; preds = %bb3, %bb
+  %tmp7 = phi i32 [ %arg4, %bb ], [ %tmp16, %bb3 ]
+  %tmp8 = phi float [ 0.000000e+00, %bb ], [ %tmp15, %bb3 ]
+  br i1 %tmp5, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb1
+  %tmp10 = zext i32 %tmp7 to i64
+  %tmp11 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %tmp10
+  %tmp12 = load float, float addrspace(1)* %tmp11, align 4
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb1
+  %tmp14 = phi float [ %tmp12, %bb2 ], [ 0.000000e+00, %bb1 ]
+  %tmp15 = fadd float %tmp8, %tmp14
+  %tmp16 = add i32 %tmp7, -1
+  %tmp17 = icmp eq i32 %tmp16, 0
+  br i1 %tmp17, label %bb4, label %bb1
+
+bb4:                                              ; preds = %bb3
+  store float %tmp15, float addrspace(1)* %arg, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
