Skip to content

Commit 31f482c

Browse files
committedApr 18, 2018
[AMDGPU] Fix issues for backend divergence tracking
Summary: A change to use divergence analysis in the AMDGPU backend was getting formal arguments incorrect (not tagged as divergent) unless they were VGPR0, VGPR1 or VGPR2. For graphics shaders it is possible to have more than these passed in as VGPRs. Modified the checking code to check for any VGPR registers passed in as formal arguments. Also, some intrinsics that are sources of divergence may have been lowered during instruction selection and are missed on subsequent calls to isSDNodeSourceOfDivergence - added the relevant AMDGPUISD checks as well. Finally, the FunctionLoweringInfo tracks virtual registers that are live across basic block boundaries. This is used to check for divergence of CopyFromRegister registers using the DivergenceAnalysis analysis. For multiple blocks the lazily evaluated inverted map VirtReg2Value was not cleared when the ValueMap map was. Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D45372 Change-Id: I112f3bd6dfe0f62e63ce9b43b893982778e4bee3 llvm-svn: 330257
1 parent 3c19051 commit 31f482c

File tree

4 files changed

+74
-4
lines changed

4 files changed

+74
-4
lines changed
 

‎llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
312312
void FunctionLoweringInfo::clear() {
313313
MBBMap.clear();
314314
ValueMap.clear();
315+
VirtReg2Value.clear();
315316
StaticAllocaMap.clear();
316317
LiveOutRegInfo.clear();
317318
VisitedBBs.clear();

‎llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

+9-4
Original file line numberDiff line numberDiff line change
@@ -806,12 +806,11 @@ bool AMDGPUTargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
806806

807807
if (MRI.isLiveIn(Reg)) {
808808
// workitem.id.x workitem.id.y workitem.id.z
809+
// Any VGPR formal argument is also considered divergent
809810
if ((MRI.getLiveInPhysReg(Reg) == AMDGPU::T0_X) ||
810811
(MRI.getLiveInPhysReg(Reg) == AMDGPU::T0_Y) ||
811-
(MRI.getLiveInPhysReg(Reg) == AMDGPU::T0_Z)||
812-
(MRI.getLiveInPhysReg(Reg) == AMDGPU::VGPR0) ||
813-
(MRI.getLiveInPhysReg(Reg) == AMDGPU::VGPR1) ||
814-
(MRI.getLiveInPhysReg(Reg) == AMDGPU::VGPR2))
812+
(MRI.getLiveInPhysReg(Reg) == AMDGPU::T0_Z) ||
813+
(TRI.isVGPR(MRI, Reg)))
815814
return true;
816815
// Formal arguments of non-entry functions
817816
// are conservatively considered divergent
@@ -840,6 +839,12 @@ bool AMDGPUTargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
840839
case ISD::INTRINSIC_W_CHAIN:
841840
return AMDGPU::isIntrinsicSourceOfDivergence(
842841
cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
842+
// In some cases intrinsics that are a source of divergence have been
843+
// lowered to AMDGPUISD so we also need to check those too.
844+
case AMDGPUISD::INTERP_MOV:
845+
case AMDGPUISD::INTERP_P1:
846+
case AMDGPUISD::INTERP_P2:
847+
return true;
843848
}
844849
return false;
845850
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,VI,PREGFX9 %s
2+
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,PREGFX9 %s
3+
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9 %s
4+
5+
; A test case that originally failed in divergence calculation
6+
; Implementation has to identify all formal args that can be a source of divergence
7+
8+
@0 = external dso_local addrspace(4) constant [6 x <2 x float>]
9+
10+
; GCN-LABEL: {{^}}_amdgpu_vs_main:
11+
; GCN-NOT: v_readfirstlane
12+
; PREGFX9: flat_load_dword
13+
; GFX9: global_load
14+
define dllexport amdgpu_vs void @_amdgpu_vs_main(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8) local_unnamed_addr #0 {
15+
.entry:
16+
%tmp = add i32 %arg4, %arg8
17+
%tmp9 = sext i32 %tmp to i64
18+
%tmp10 = getelementptr [6 x <2 x float>], [6 x <2 x float>] addrspace(4)* @0, i64 0, i64 %tmp9
19+
%tmp11 = load <2 x float>, <2 x float> addrspace(4)* %tmp10, align 8
20+
%tmp12 = fadd nnan arcp contract <2 x float> zeroinitializer, %tmp11
21+
%tmp13 = extractelement <2 x float> %tmp12, i32 1
22+
call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float undef, float %tmp13, float 0.000000e+00, float 1.000000e+00, i1 true, i1 false) #1
23+
ret void
24+
}
25+
26+
declare i64 @llvm.amdgcn.s.getpc() #0
27+
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
28+
29+
attributes #0 = { nounwind readnone speculatable }
30+
attributes #1 = { nounwind }
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,VI,PREGFX9 %s
2+
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,PREGFX9 %s
3+
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9 %s
4+
5+
; Testing for failures in divergence calculations when divergent intrinsic is lowered during instruction selection
6+
7+
@0 = external dso_local addrspace(4) constant [4 x <4 x float>]
8+
9+
; GCN-LABEL: {{^}}_amdgpu_ps_main:
10+
; GCN-NOT: v_readfirstlane
11+
; PREGFX9: flat_load_dword
12+
; GFX9: global_load
13+
define dllexport amdgpu_ps void @_amdgpu_ps_main(i32 inreg %arg) local_unnamed_addr #0 {
14+
.entry:
15+
%tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %arg) #1
16+
%tmp1 = bitcast float %tmp to i32
17+
%tmp2 = srem i32 %tmp1, 4
18+
%tmp3 = select i1 false, i32 undef, i32 %tmp2
19+
%tmp4 = sext i32 %tmp3 to i64
20+
%tmp5 = getelementptr [4 x <4 x float>], [4 x <4 x float>] addrspace(4)* @0, i64 0, i64 %tmp4
21+
%tmp6 = load <4 x float>, <4 x float> addrspace(4)* %tmp5, align 16
22+
%tmp7 = extractelement <4 x float> %tmp6, i32 3
23+
%tmp8 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp7) #1
24+
call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> undef, <2 x half> %tmp8, i1 true, i1 true) #2
25+
ret void
26+
}
27+
28+
declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
29+
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
30+
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #2
31+
32+
attributes #0 = { nounwind "InitialPSInputAddr"="0" }
33+
attributes #1 = { nounwind readnone speculatable }
34+
attributes #2 = { nounwind }

0 commit comments

Comments
 (0)
Please sign in to comment.