diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -43,6 +43,11 @@
   cl::init(false),
   cl::ReallyHidden);
 
+static cl::opt<bool> DisableNSAforBVH(
+    "amdgpu-global-isel-disable-nsa-for-bvh", cl::Hidden,
+    cl::desc("Do not use NSA for BVH instructions in GlobalISel"),
+    cl::init(false));
+
 static constexpr unsigned MaxRegisterSize = 1024;
 
 // Round the number of elements to the next power of two elements
@@ -4667,12 +4672,21 @@
   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
   bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
-  unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
-                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
-                          : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
-                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+  unsigned Opcode;
 
-  SmallVector<Register, 12> Ops;
+  if (DisableNSAforBVH) {
+    Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa
+                          : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa
+                   : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa
+                          : AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa;
+  } else {
+    Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
+                          : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
+                   : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
+                          : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+  }
+
+  SmallVector<Register, 12> Ops;
 
   if (Is64) {
     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
     Ops.push_back(Unmerge.getReg(0));
@@ -4707,6 +4721,23 @@
     packLanes(RayInvDir);
   }
 
+  if (DisableNSAforBVH) {
+    // Build a single vector containing all the operands so far prepared.
+    const unsigned LaneCount = !Is64 && IsA16 ? 8 : 16;
+
+    while (Ops.size() < LaneCount) {
+      Register R = MRI.createGenericVirtualRegister(S32);
+      B.buildConstant(R, 0);
+      Ops.push_back(R);
+    }
+
+    LLT OpTy = LLT::vector(Ops.size(), 32);
+    Register MergedOps = MRI.createGenericVirtualRegister(OpTy);
+    B.buildMerge(MergedOps, Ops);
+    Ops.clear();
+    Ops.push_back(MergedOps);
+  }
+
   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
     .addDef(DstReg)
     .addImm(Opcode);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4176,8 +4176,14 @@
     unsigned N = MI.getNumExplicitOperands() - 2;
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
     OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
-    for (unsigned I = 2; I < N; ++I)
-      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+    if (N == 3) {
+      // Sequential form: all operands combined into VGPR256/VGPR512
+      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 512);
+    } else {
+      // NSA form
+      for (unsigned I = 2; I < N; ++I)
+        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+    }
     break;
   }
   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -51,6 +51,11 @@
   cl::desc("Use indirect register addressing for divergent indexes"),
   cl::init(false));
 
+static cl::opt<bool>
+    DisableNSAforBVH("amdgpu-disable-nsa-for-bvh", cl::Hidden,
+                     cl::desc("Do not use NSA for BVH instructions"),
+                     cl::init(false));
+
 static bool hasFP32Denormals(const MachineFunction &MF) {
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   return Info->getMode().allFP32Denormals();
 }
@@ -7312,10 +7317,19 @@
   bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
   bool Is64 = NodePtr.getValueType() == MVT::i64;
-  unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
-                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
-                          : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
-                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+  unsigned Opcode;
+
+  if (DisableNSAforBVH) {
+    Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa
+                          : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa
+                   : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa
+                          : AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa;
+  } else {
+    Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
+                          : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
+                   : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
+                          : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+  }
 
   SmallVector<SDValue, 16> Ops;
@@ -7355,6 +7369,19 @@
   packLanes(RayOrigin, true);
   packLanes(RayDir, true);
   packLanes(RayInvDir, false);
+
+  if (DisableNSAforBVH) {
+    // Build a single vector containing all the operands so far prepared.
+    const unsigned LaneCount = !Is64 && IsA16 ? 8 : 16;
+    while (Ops.size() < LaneCount)
+      Ops.push_back(DAG.getConstant(0, DL, MVT::i32));
+
+    SDValue MergedOps = DAG.getBuildVector(
+        Ops.size() == 16 ? MVT::v16i32 : MVT::v8i32, DL, Ops);
+    Ops.clear();
+    Ops.push_back(MergedOps);
+  }
+
   Ops.push_back(TDescr);
   if (IsA16)
     Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1));
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -amdgpu-global-isel-disable-nsa-for-bvh < %s | FileCheck -check-prefix=GCN-SA %s
 
 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
@@ -17,6 +18,23 @@
 ; GCN-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: ; return to shader part epilog
+;
+; GCN-SA-LABEL: image_bvh_intersect_ray:
+; GCN-SA: ; %bb.0:
+; GCN-SA-NEXT: v_mov_b32_e32 v9, v11
+; GCN-SA-NEXT: v_mov_b32_e32 v11, 0
+; GCN-SA-NEXT: v_mov_b32_e32 v5, v6
+; GCN-SA-NEXT: v_mov_b32_e32 v6, v7
+; GCN-SA-NEXT: v_mov_b32_e32 v7, v8
+; GCN-SA-NEXT: v_mov_b32_e32 v8, v10
+; GCN-SA-NEXT: v_mov_b32_e32 v10, v12
+; GCN-SA-NEXT: v_mov_b32_e32 v12, v11
+; GCN-SA-NEXT: v_mov_b32_e32 v13, v11
+; GCN-SA-NEXT: v_mov_b32_e32 v14, v11
+; GCN-SA-NEXT: v_mov_b32_e32 v15, v11
+; GCN-SA-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
+; GCN-SA-NEXT: s_waitcnt vmcnt(0)
+; GCN-SA-NEXT: ; return to shader part epilog
 %v = call <4 x i32>
@llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r @@ -38,6 +56,22 @@ ; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GCN-SA-LABEL: image_bvh_intersect_ray_a16: +; GCN-SA: ; %bb.0: +; GCN-SA-NEXT: s_mov_b32 s4, 0xffff +; GCN-SA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GCN-SA-NEXT: v_and_b32_e32 v10, s4, v8 +; GCN-SA-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-SA-NEXT: v_and_b32_e32 v9, s4, v9 +; GCN-SA-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-SA-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-SA-NEXT: v_and_or_b32 v5, v6, s4, v5 +; GCN-SA-NEXT: v_and_or_b32 v6, v7, s4, v10 +; GCN-SA-NEXT: v_lshl_or_b32 v7, v9, 16, v8 +; GCN-SA-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 +; GCN-SA-NEXT: s_waitcnt vmcnt(0) +; GCN-SA-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r @@ -49,6 +83,22 @@ ; GCN-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GCN-SA-LABEL: image_bvh64_intersect_ray: +; GCN-SA: ; %bb.0: +; GCN-SA-NEXT: v_mov_b32_e32 v10, v12 +; GCN-SA-NEXT: v_mov_b32_e32 v12, 0 +; GCN-SA-NEXT: v_mov_b32_e32 v6, v7 +; GCN-SA-NEXT: v_mov_b32_e32 v7, v8 +; GCN-SA-NEXT: v_mov_b32_e32 v8, v9 +; GCN-SA-NEXT: v_mov_b32_e32 v9, v11 +; GCN-SA-NEXT: v_mov_b32_e32 v11, v13 +; GCN-SA-NEXT: v_mov_b32_e32 v13, v12 +; GCN-SA-NEXT: v_mov_b32_e32 v14, v12 +; GCN-SA-NEXT: v_mov_b32_e32 v15, v12 +; GCN-SA-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], 
s[0:3] +; GCN-SA-NEXT: s_waitcnt vmcnt(0) +; GCN-SA-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r @@ -70,6 +120,29 @@ ; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GCN-SA-LABEL: image_bvh64_intersect_ray_a16: +; GCN-SA: ; %bb.0: +; GCN-SA-NEXT: s_mov_b32 s4, 0xffff +; GCN-SA-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GCN-SA-NEXT: v_and_b32_e32 v11, s4, v9 +; GCN-SA-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; GCN-SA-NEXT: v_mov_b32_e32 v9, 0 +; GCN-SA-NEXT: v_and_b32_e32 v10, s4, v10 +; GCN-SA-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-SA-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-SA-NEXT: v_mov_b32_e32 v13, v9 +; GCN-SA-NEXT: v_mov_b32_e32 v14, v9 +; GCN-SA-NEXT: v_and_or_b32 v6, v7, s4, v6 +; GCN-SA-NEXT: v_and_or_b32 v7, v8, s4, v11 +; GCN-SA-NEXT: v_lshl_or_b32 v8, v10, 16, v12 +; GCN-SA-NEXT: v_mov_b32_e32 v10, v9 +; GCN-SA-NEXT: v_mov_b32_e32 v11, v9 +; GCN-SA-NEXT: v_mov_b32_e32 v12, v9 +; GCN-SA-NEXT: v_mov_b32_e32 v15, v9 +; GCN-SA-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 +; GCN-SA-NEXT: s_waitcnt vmcnt(0) +; GCN-SA-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r @@ -99,6 +172,43 @@ ; GCN-NEXT: v_mov_b32_e32 v2, v20 ; GCN-NEXT: v_mov_b32_e32 v3, v21 ; GCN-NEXT: ; return to shader part epilog +; +; GCN-SA-LABEL: image_bvh_intersect_ray_vgpr_descr: +; GCN-SA: ; %bb.0: +; GCN-SA-NEXT: v_mov_b32_e32 v9, v11 +; GCN-SA-NEXT: v_mov_b32_e32 v11, 0 +; GCN-SA-NEXT: v_mov_b32_e32 v5, v6 
+; GCN-SA-NEXT: v_mov_b32_e32 v6, v7 +; GCN-SA-NEXT: v_mov_b32_e32 v7, v8 +; GCN-SA-NEXT: v_mov_b32_e32 v8, v10 +; GCN-SA-NEXT: v_mov_b32_e32 v10, v12 +; GCN-SA-NEXT: v_mov_b32_e32 v18, v14 +; GCN-SA-NEXT: v_mov_b32_e32 v19, v15 +; GCN-SA-NEXT: v_mov_b32_e32 v12, v11 +; GCN-SA-NEXT: v_mov_b32_e32 v13, v11 +; GCN-SA-NEXT: v_mov_b32_e32 v14, v11 +; GCN-SA-NEXT: v_mov_b32_e32 v15, v11 +; GCN-SA-NEXT: s_mov_b32 s1, exec_lo +; GCN-SA-NEXT: BB4_1: ; =>This Inner Loop Header: Depth=1 +; GCN-SA-NEXT: v_readfirstlane_b32 s4, v18 +; GCN-SA-NEXT: v_readfirstlane_b32 s5, v19 +; GCN-SA-NEXT: v_readfirstlane_b32 s6, v16 +; GCN-SA-NEXT: v_readfirstlane_b32 s7, v17 +; GCN-SA-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19] +; GCN-SA-NEXT: image_bvh_intersect_ray v[20:23], v[0:15], s[4:7] +; GCN-SA-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17] +; GCN-SA-NEXT: s_and_b32 s0, s0, vcc_lo +; GCN-SA-NEXT: s_and_saveexec_b32 s0, s0 +; GCN-SA-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GCN-SA-NEXT: s_cbranch_execnz BB4_1 +; GCN-SA-NEXT: ; %bb.2: +; GCN-SA-NEXT: s_mov_b32 exec_lo, s1 +; GCN-SA-NEXT: s_waitcnt vmcnt(0) +; GCN-SA-NEXT: v_mov_b32_e32 v0, v20 +; GCN-SA-NEXT: v_mov_b32_e32 v1, v21 +; GCN-SA-NEXT: v_mov_b32_e32 v2, v22 +; GCN-SA-NEXT: v_mov_b32_e32 v3, v23 +; GCN-SA-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r @@ -138,6 +248,40 @@ ; GCN-NEXT: v_mov_b32_e32 v2, v7 ; GCN-NEXT: v_mov_b32_e32 v3, v8 ; GCN-NEXT: ; return to shader part epilog +; +; GCN-SA-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: +; GCN-SA: ; %bb.0: +; GCN-SA-NEXT: s_mov_b32 s0, 0xffff +; GCN-SA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GCN-SA-NEXT: v_and_b32_e32 v14, s0, v8 +; GCN-SA-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-SA-NEXT: v_and_b32_e32 v9, s0, v9 +; GCN-SA-NEXT: 
s_mov_b32 s1, exec_lo +; GCN-SA-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-SA-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-SA-NEXT: v_and_or_b32 v5, v6, s0, v5 +; GCN-SA-NEXT: v_and_or_b32 v6, v7, s0, v14 +; GCN-SA-NEXT: v_lshl_or_b32 v7, v9, 16, v8 +; GCN-SA-NEXT: BB5_1: ; =>This Inner Loop Header: Depth=1 +; GCN-SA-NEXT: v_readfirstlane_b32 s4, v10 +; GCN-SA-NEXT: v_readfirstlane_b32 s5, v11 +; GCN-SA-NEXT: v_readfirstlane_b32 s6, v12 +; GCN-SA-NEXT: v_readfirstlane_b32 s7, v13 +; GCN-SA-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] +; GCN-SA-NEXT: image_bvh_intersect_ray v[14:17], v[0:7], s[4:7] a16 +; GCN-SA-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] +; GCN-SA-NEXT: s_and_b32 s0, s0, vcc_lo +; GCN-SA-NEXT: s_and_saveexec_b32 s0, s0 +; GCN-SA-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GCN-SA-NEXT: s_cbranch_execnz BB5_1 +; GCN-SA-NEXT: ; %bb.2: +; GCN-SA-NEXT: s_mov_b32 exec_lo, s1 +; GCN-SA-NEXT: s_waitcnt vmcnt(0) +; GCN-SA-NEXT: v_mov_b32_e32 v0, v14 +; GCN-SA-NEXT: v_mov_b32_e32 v1, v15 +; GCN-SA-NEXT: v_mov_b32_e32 v2, v16 +; GCN-SA-NEXT: v_mov_b32_e32 v3, v17 +; GCN-SA-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r @@ -167,6 +311,42 @@ ; GCN-NEXT: v_mov_b32_e32 v2, v21 ; GCN-NEXT: v_mov_b32_e32 v3, v22 ; GCN-NEXT: ; return to shader part epilog +; +; GCN-SA-LABEL: image_bvh64_intersect_ray_vgpr_descr: +; GCN-SA: ; %bb.0: +; GCN-SA-NEXT: v_mov_b32_e32 v10, v12 +; GCN-SA-NEXT: v_mov_b32_e32 v12, 0 +; GCN-SA-NEXT: v_mov_b32_e32 v6, v7 +; GCN-SA-NEXT: v_mov_b32_e32 v7, v8 +; GCN-SA-NEXT: v_mov_b32_e32 v8, v9 +; GCN-SA-NEXT: v_mov_b32_e32 v9, v11 +; GCN-SA-NEXT: v_mov_b32_e32 v11, v13 +; GCN-SA-NEXT: v_mov_b32_e32 v19, v15 +; GCN-SA-NEXT: v_mov_b32_e32 v20, v16 +; GCN-SA-NEXT: v_mov_b32_e32 v13, v12 +; GCN-SA-NEXT: 
v_mov_b32_e32 v14, v12 +; GCN-SA-NEXT: v_mov_b32_e32 v15, v12 +; GCN-SA-NEXT: s_mov_b32 s1, exec_lo +; GCN-SA-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 +; GCN-SA-NEXT: v_readfirstlane_b32 s4, v19 +; GCN-SA-NEXT: v_readfirstlane_b32 s5, v20 +; GCN-SA-NEXT: v_readfirstlane_b32 s6, v17 +; GCN-SA-NEXT: v_readfirstlane_b32 s7, v18 +; GCN-SA-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[19:20] +; GCN-SA-NEXT: image_bvh64_intersect_ray v[21:24], v[0:15], s[4:7] +; GCN-SA-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18] +; GCN-SA-NEXT: s_and_b32 s0, s0, vcc_lo +; GCN-SA-NEXT: s_and_saveexec_b32 s0, s0 +; GCN-SA-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GCN-SA-NEXT: s_cbranch_execnz BB6_1 +; GCN-SA-NEXT: ; %bb.2: +; GCN-SA-NEXT: s_mov_b32 exec_lo, s1 +; GCN-SA-NEXT: s_waitcnt vmcnt(0) +; GCN-SA-NEXT: v_mov_b32_e32 v0, v21 +; GCN-SA-NEXT: v_mov_b32_e32 v1, v22 +; GCN-SA-NEXT: v_mov_b32_e32 v2, v23 +; GCN-SA-NEXT: v_mov_b32_e32 v3, v24 +; GCN-SA-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r @@ -206,6 +386,51 @@ ; GCN-NEXT: v_mov_b32_e32 v2, v8 ; GCN-NEXT: v_mov_b32_e32 v3, v9 ; GCN-NEXT: ; return to shader part epilog +; +; GCN-SA-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: +; GCN-SA: ; %bb.0: +; GCN-SA-NEXT: s_mov_b32 s0, 0xffff +; GCN-SA-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GCN-SA-NEXT: v_mov_b32_e32 v16, v11 +; GCN-SA-NEXT: v_and_b32_e32 v11, s0, v9 +; GCN-SA-NEXT: v_mov_b32_e32 v17, v12 +; GCN-SA-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; GCN-SA-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-SA-NEXT: v_mov_b32_e32 v9, 0 +; GCN-SA-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-SA-NEXT: v_and_b32_e32 v10, s0, v10 +; GCN-SA-NEXT: v_mov_b32_e32 v18, v13 +; GCN-SA-NEXT: v_mov_b32_e32 v19, v14 +; GCN-SA-NEXT: v_and_or_b32 v6, v7, s0, v6 +; 
GCN-SA-NEXT: v_and_or_b32 v7, v8, s0, v11 +; GCN-SA-NEXT: v_lshl_or_b32 v8, v10, 16, v12 +; GCN-SA-NEXT: v_mov_b32_e32 v10, v9 +; GCN-SA-NEXT: v_mov_b32_e32 v11, v9 +; GCN-SA-NEXT: v_mov_b32_e32 v12, v9 +; GCN-SA-NEXT: v_mov_b32_e32 v13, v9 +; GCN-SA-NEXT: v_mov_b32_e32 v14, v9 +; GCN-SA-NEXT: v_mov_b32_e32 v15, v9 +; GCN-SA-NEXT: s_mov_b32 s1, exec_lo +; GCN-SA-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1 +; GCN-SA-NEXT: v_readfirstlane_b32 s4, v16 +; GCN-SA-NEXT: v_readfirstlane_b32 s5, v17 +; GCN-SA-NEXT: v_readfirstlane_b32 s6, v18 +; GCN-SA-NEXT: v_readfirstlane_b32 s7, v19 +; GCN-SA-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17] +; GCN-SA-NEXT: image_bvh64_intersect_ray v[20:23], v[0:15], s[4:7] a16 +; GCN-SA-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[18:19] +; GCN-SA-NEXT: s_and_b32 s0, s0, vcc_lo +; GCN-SA-NEXT: s_and_saveexec_b32 s0, s0 +; GCN-SA-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GCN-SA-NEXT: s_cbranch_execnz BB7_1 +; GCN-SA-NEXT: ; %bb.2: +; GCN-SA-NEXT: s_mov_b32 exec_lo, s1 +; GCN-SA-NEXT: s_waitcnt vmcnt(0) +; GCN-SA-NEXT: v_mov_b32_e32 v0, v20 +; GCN-SA-NEXT: v_mov_b32_e32 v1, v21 +; GCN-SA-NEXT: v_mov_b32_e32 v2, v22 +; GCN-SA-NEXT: v_mov_b32_e32 v3, v23 +; GCN-SA-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -amdgpu-disable-nsa-for-bvh < %s | FileCheck -check-prefix=GCN %s ; uint4 
llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr) ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)