[AMDGPU] Fix issues for backend divergence tracking

dstutt · dstutt · commit 31f482c26bc4 · 2018-04-18T13:53:31.000Z
Summary: A change to use divergence analysis in the AMDGPU backend was getting formal arguments incorrect (not tagged as divergent) unless they were VGPR0, VGPR1 or VGPR2. For graphics shaders it is possible to have more than these passed in as VGPRs. Modified the checking code to check for any VGPR registers passed in as formal arguments. Also, some intrinsics that are sources of divergence may have been lowered during instruction selection and are missed on subsequent calls to isSDNodeSourceOfDivergence - added the relevant AMDGPUISD checks as well. Finally, the FunctionLoweringInfo tracks virtual registers that are live across basic block boundaries. This is used to check for divergence of CopyFromRegister registers using the DivergenceAnalysis analysis. For multiple blocks, the lazily evaluated inverted map VirtReg2Value was not cleared when the ValueMap map was. Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D45372 Change-Id: I112f3bd6dfe0f62e63ce9b43b893982778e4bee3 llvm-svn: 330257
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -312,6 +312,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
 void FunctionLoweringInfo::clear() {
   MBBMap.clear();
   ValueMap.clear();
+  VirtReg2Value.clear();
   StaticAllocaMap.clear();
   LiveOutRegInfo.clear();
   VisitedBBs.clear();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -806,12 +806,11 @@ bool AMDGPUTargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
 
         if (MRI.isLiveIn(Reg)) {
           // workitem.id.x workitem.id.y workitem.id.z
+          // Any VGPR formal argument is also considered divergent
           if ((MRI.getLiveInPhysReg(Reg) == AMDGPU::T0_X) ||
               (MRI.getLiveInPhysReg(Reg) == AMDGPU::T0_Y) ||
-              (MRI.getLiveInPhysReg(Reg) == AMDGPU::T0_Z)||
-              (MRI.getLiveInPhysReg(Reg) == AMDGPU::VGPR0) ||
-            (MRI.getLiveInPhysReg(Reg) == AMDGPU::VGPR1) ||
-            (MRI.getLiveInPhysReg(Reg) == AMDGPU::VGPR2))
+              (MRI.getLiveInPhysReg(Reg) == AMDGPU::T0_Z) ||
+              (TRI.isVGPR(MRI, Reg)))
               return true;
           // Formal arguments of non-entry functions
           // are conservatively considered divergent
@@ -840,6 +839,12 @@ bool AMDGPUTargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
     case ISD::INTRINSIC_W_CHAIN:
       return AMDGPU::isIntrinsicSourceOfDivergence(
       cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+    // In some cases intrinsics that are a source of divergence have been
+    // lowered to AMDGPUISD so we also need to check those too.
+    case AMDGPUISD::INTERP_MOV:
+    case AMDGPUISD::INTERP_P1:
+    case AMDGPUISD::INTERP_P2:
+      return true;
   }
   return false;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll b/llvm/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,VI,PREGFX9 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,PREGFX9 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; A test case that originally failed in divergence calculation
+; Implementation has to identify all formal args that can be a source of divergence
+
+@0 = external dso_local addrspace(4) constant [6 x <2 x float>]
+
+; GCN-LABEL: {{^}}_amdgpu_vs_main:
+; GCN-NOT: v_readfirstlane
+; PREGFX9: flat_load_dword
+; GFX9: global_load 
+define dllexport amdgpu_vs void @_amdgpu_vs_main(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8) local_unnamed_addr #0 {
+.entry: ; Vertex shader that indexes the constant array @0 with a value derived from its formal arguments and exports one loaded element.
+  %tmp = add i32 %arg4, %arg8 ; index = last inreg argument + fourth non-inreg argument. NOTE(review): presumably %arg8 arrives in a VGPR beyond the first three, the case the commit summary describes - confirm against the calling convention.
+  %tmp9 = sext i32 %tmp to i64 ; widen the index to the i64 used by the getelementptr below
+  %tmp10 = getelementptr [6 x <2 x float>], [6 x <2 x float>] addrspace(4)* @0, i64 0, i64 %tmp9 ; address of element %tmp9 of @0
+  %tmp11 = load <2 x float>, <2 x float> addrspace(4)* %tmp10, align 8 ; the only load in this function; presumably the one the load check lines before the define are meant to match
+  %tmp12 = fadd nnan arcp contract <2 x float> zeroinitializer, %tmp11
+  %tmp13 = extractelement <2 x float> %tmp12, i32 1 ; only lane 1 of the loaded vector is exported
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float undef, float %tmp13, float 0.000000e+00, float 1.000000e+00, i1 true, i1 false) #1 ; export keeps the loaded value live
+  ret void
+}
+
+declare i64 @llvm.amdgcn.s.getpc() #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll b/llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,VI,PREGFX9 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,PREGFX9 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; Testing for failures in divergence calculations when divergent intrinsic is lowered during instruction selection
+
+@0 = external dso_local addrspace(4) constant [4 x <4 x float>]
+
+; GCN-LABEL: {{^}}_amdgpu_ps_main:
+; GCN-NOT: v_readfirstlane
+; PREGFX9: flat_load_dword
+; GFX9: global_load 
+define dllexport amdgpu_ps void @_amdgpu_ps_main(i32 inreg %arg) local_unnamed_addr #0 {
+.entry: ; Pixel shader that indexes the constant array @0 with a value derived from llvm.amdgcn.interp.mov and exports one loaded element.
+  %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %arg) #1 ; NOTE(review): per the header comment of this test, this is the divergent intrinsic expected to be lowered during instruction selection - confirm it maps to the INTERP_MOV node
+  %tmp1 = bitcast float %tmp to i32 ; reinterpret the intrinsic result as an integer
+  %tmp2 = srem i32 %tmp1, 4 ; signed remainder by 4, the element count of @0
+  %tmp3 = select i1 false, i32 undef, i32 %tmp2 ; constant-false condition, so this always yields %tmp2
+  %tmp4 = sext i32 %tmp3 to i64 ; widen the index to the i64 used by the getelementptr below
+  %tmp5 = getelementptr [4 x <4 x float>], [4 x <4 x float>] addrspace(4)* @0, i64 0, i64 %tmp4 ; address of element %tmp4 of @0
+  %tmp6 = load <4 x float>, <4 x float> addrspace(4)* %tmp5, align 16 ; the only load in this function; presumably the one the load check lines before the define are meant to match
+  %tmp7 = extractelement <4 x float> %tmp6, i32 3 ; only lane 3 of the loaded vector is used
+  %tmp8 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp7) #1 ; pack the loaded float into a <2 x half> for the compressed export
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> undef, <2 x half> %tmp8, i1 true, i1 true) #2 ; export keeps the loaded value live
+  ret void
+}
+
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #2
+
+attributes #0 = { nounwind "InitialPSInputAddr"="0" }
+attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { nounwind }