diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4770,12 +4770,20 @@ return false; } - bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; - bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; - unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa - : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa - : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa - : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; + const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; + const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; + const unsigned NumVAddrs = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); + const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); + const unsigned Opcodes[2][2][2] = { + {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa, + AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa}, + {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa, + AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa}}, + {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa, + AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa}, + {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa, + AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa}}}; + const unsigned Opcode = Opcodes[UseNSA][IsA16][Is64]; SmallVector Ops; if (Is64) { @@ -4812,6 +4820,14 @@ packLanes(RayInvDir); } + if (!UseNSA) { + // Build a single vector containing all the operands so far prepared. + LLT OpTy = LLT::fixed_vector(Ops.size(), 32); + Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); + Ops.clear(); + Ops.push_back(MergedOps); + } + auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) .addDef(DstReg) .addImm(Opcode); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4255,8 +4255,17 @@ unsigned N = MI.getNumExplicitOperands() - 2; OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128); OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI); - for (unsigned I = 2; I < N; ++I) - OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + if (N == 3) { + // Sequential form: all operands combined into VGPR256/VGPR512 + unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + if (Size > 256) + Size = 512; + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + } else { + // NSA form + for (unsigned I = 2; I < N; ++I) + OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + } break; } case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7341,7 +7341,6 @@ Op->getVTList(), Ops, VT, M->getMemOperand()); } case Intrinsic::amdgcn_image_bvh_intersect_ray: { - SDLoc DL(Op); MemSDNode *M = cast(Op); SDValue NodePtr = M->getOperand(2); SDValue RayExtent = M->getOperand(3); @@ -7360,12 +7359,21 @@ return SDValue(); } - bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; - bool Is64 = NodePtr.getValueType() == MVT::i64; - unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa - : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa - : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa - : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; + const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; + const bool Is64 = NodePtr.getValueType() == MVT::i64; + const unsigned NumVAddrs = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); + const bool UseNSA = + Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize(); + const unsigned Opcodes[2][2][2] = { + {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa, + AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa}, + {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa, + AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa}}, + {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa, + AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa}, + {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa, + AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa}}}; + const unsigned Opcode = Opcodes[UseNSA][IsA16][Is64]; SmallVector Ops; @@ -7405,6 +7413,20 @@ packLanes(RayOrigin, true); packLanes(RayDir, true); packLanes(RayInvDir, false); + + if (!UseNSA) { + // Build a single vector containing all the operands so far prepared. + if (NumVAddrs > 8) { + SDValue Undef = DAG.getUNDEF(MVT::i32); + Ops.append(16 - Ops.size(), Undef); + } + assert(Ops.size() == 8 || Ops.size() == 16); + SDValue MergedOps = DAG.getBuildVector( + Ops.size() == 16 ? MVT::v16i32 : MVT::v8i32, DL, Ops); + Ops.clear(); + Ops.push_back(MergedOps); + } + Ops.push_back(TDescr); if (IsA16) Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1)); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -15,11 +15,23 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { -; GCN-LABEL: image_bvh_intersect_ray: -; GCN: ; %bb.0: -; GCN-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; GFX1030-LABEL: image_bvh_intersect_ray: +; GFX1030: ; %bb.0: +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3] +; GFX1030-NEXT: s_waitcnt vmcnt(0) +; GFX1030-NEXT: ; return to shader part epilog +; +; GFX1013-LABEL: image_bvh_intersect_ray: +; GFX1013: ; %bb.0: +; GFX1013-NEXT: v_mov_b32_e32 v5, v6 +; GFX1013-NEXT: v_mov_b32_e32 v6, v7 +; GFX1013-NEXT: v_mov_b32_e32 v7, v8 +; GFX1013-NEXT: v_mov_b32_e32 v8, v10 +; GFX1013-NEXT: v_mov_b32_e32 v9, v11 +; GFX1013-NEXT: v_mov_b32_e32 v10, v12 +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] +; GFX1013-NEXT: s_waitcnt vmcnt(0) +; GFX1013-NEXT: ; return to shader part epilog ; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> @@ -68,11 +80,23 @@ } define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { -; GCN-LABEL: image_bvh64_intersect_ray: -; GCN: ; %bb.0: -; GCN-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; GFX1030-LABEL: image_bvh64_intersect_ray: +; GFX1030: ; %bb.0: +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3] +; GFX1030-NEXT: s_waitcnt vmcnt(0) +; GFX1030-NEXT: ; return to shader part epilog +; +; GFX1013-LABEL: image_bvh64_intersect_ray: +; GFX1013: ; %bb.0: +; GFX1013-NEXT: v_mov_b32_e32 v6, v7 +; GFX1013-NEXT: v_mov_b32_e32 v7, v8 +; GFX1013-NEXT: v_mov_b32_e32 v8, v9 +; GFX1013-NEXT: v_mov_b32_e32 v9, v11 +; GFX1013-NEXT: v_mov_b32_e32 v10, v12 +; GFX1013-NEXT: v_mov_b32_e32 v11, v13 +; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] +; GFX1013-NEXT: s_waitcnt vmcnt(0) +; GFX1013-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r @@ -121,136 +145,265 @@ } define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) { -; GCN-LABEL: image_bvh_intersect_ray_vgpr_descr: -; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s1, exec_lo -; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s4, v14 -; GCN-NEXT: v_readfirstlane_b32 s5, v15 -; GCN-NEXT: v_readfirstlane_b32 s6, v16 -; GCN-NEXT: v_readfirstlane_b32 s7, v17 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15] -; GCN-NEXT: image_bvh_intersect_ray v[18:21], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[4:7] -; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17] -; GCN-NEXT: s_and_b32 s0, s0, vcc_lo -; GCN-NEXT: s_and_saveexec_b32 s0, s0 -; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GCN-NEXT: s_cbranch_execnz BB6_1 -; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b32 exec_lo, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v18 -; GCN-NEXT: v_mov_b32_e32 v1, v19 -; GCN-NEXT: v_mov_b32_e32 v2, v20 -; GCN-NEXT: v_mov_b32_e32 v3, v21 -; GCN-NEXT: ; return to shader part epilog +; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr: +; GFX1030: ; %bb.0: +; GFX1030-NEXT: s_mov_b32 s1, exec_lo +; GFX1030-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v14 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v15 +; GFX1030-NEXT: v_readfirstlane_b32 s6, v16 +; GFX1030-NEXT: v_readfirstlane_b32 s7, v17 +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15] +; GFX1030-NEXT: image_bvh_intersect_ray v[18:21], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[4:7] +; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17] +; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 +; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1030-NEXT: s_cbranch_execnz BB6_1 +; GFX1030-NEXT: ; %bb.2: +; GFX1030-NEXT: s_mov_b32 exec_lo, s1 +; GFX1030-NEXT: s_waitcnt vmcnt(0) +; GFX1030-NEXT: v_mov_b32_e32 v0, v18 +; GFX1030-NEXT: v_mov_b32_e32 v1, v19 +; GFX1030-NEXT: v_mov_b32_e32 v2, v20 +; GFX1030-NEXT: v_mov_b32_e32 v3, v21 +; GFX1030-NEXT: ; return to shader part epilog +; +; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr: +; GFX1013: ; %bb.0: +; GFX1013-NEXT: v_mov_b32_e32 v5, v6 +; GFX1013-NEXT: v_mov_b32_e32 v6, v7 +; GFX1013-NEXT: v_mov_b32_e32 v7, v8 +; GFX1013-NEXT: v_mov_b32_e32 v8, v10 +; GFX1013-NEXT: v_mov_b32_e32 v9, v11 +; GFX1013-NEXT: v_mov_b32_e32 v10, v12 +; GFX1013-NEXT: s_mov_b32 s1, exec_lo +; GFX1013-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX1013-NEXT: v_readfirstlane_b32 s4, v14 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v15 +; GFX1013-NEXT: v_readfirstlane_b32 s6, v16 +; GFX1013-NEXT: v_readfirstlane_b32 s7, v17 +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15] +; GFX1013-NEXT: image_bvh_intersect_ray v[18:21], v[0:15], s[4:7] +; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17] +; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 +; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1013-NEXT: s_cbranch_execnz BB6_1 +; GFX1013-NEXT: ; %bb.2: +; GFX1013-NEXT: s_mov_b32 exec_lo, s1 +; GFX1013-NEXT: s_waitcnt vmcnt(0) +; GFX1013-NEXT: v_mov_b32_e32 v0, v18 +; GFX1013-NEXT: v_mov_b32_e32 v1, v19 +; GFX1013-NEXT: v_mov_b32_e32 v2, v20 +; GFX1013-NEXT: v_mov_b32_e32 v3, v21 +; GFX1013-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) { -; GCN-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: -; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s0, 0xffff -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GCN-NEXT: v_and_b32_e32 v14, s0, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_and_b32_e32 v15, s0, v9 -; GCN-NEXT: s_mov_b32 s1, exec_lo -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshl_or_b32 v15, v15, 16, v8 -; GCN-NEXT: v_and_or_b32 v9, v6, s0, v5 -; GCN-NEXT: v_and_or_b32 v14, v7, s0, v14 -; GCN-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s4, v10 -; GCN-NEXT: v_readfirstlane_b32 s5, v11 -; GCN-NEXT: v_readfirstlane_b32 s6, v12 -; GCN-NEXT: v_readfirstlane_b32 s7, v13 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] -; GCN-NEXT: image_bvh_intersect_ray v[5:8], [v0, v1, v2, v3, v4, v9, v14, v15], s[4:7] a16 -; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] -; GCN-NEXT: s_and_b32 s0, s0, vcc_lo -; GCN-NEXT: s_and_saveexec_b32 s0, s0 -; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GCN-NEXT: s_cbranch_execnz BB7_1 -; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b32 exec_lo, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v5 -; GCN-NEXT: v_mov_b32_e32 v1, v6 -; GCN-NEXT: v_mov_b32_e32 v2, v7 -; GCN-NEXT: v_mov_b32_e32 v3, v8 -; GCN-NEXT: ; return to shader part epilog +; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: +; GFX1030: ; %bb.0: +; GFX1030-NEXT: s_mov_b32 s0, 0xffff +; GFX1030-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX1030-NEXT: v_and_b32_e32 v14, s0, v8 +; GFX1030-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX1030-NEXT: v_and_b32_e32 v15, s0, v9 +; GFX1030-NEXT: s_mov_b32 s1, exec_lo +; GFX1030-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1030-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX1030-NEXT: v_lshl_or_b32 v15, v15, 16, v8 +; GFX1030-NEXT: v_and_or_b32 v9, v6, s0, v5 +; GFX1030-NEXT: v_and_or_b32 v14, v7, s0, v14 +; GFX1030-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v10 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v11 +; GFX1030-NEXT: v_readfirstlane_b32 s6, v12 +; GFX1030-NEXT: v_readfirstlane_b32 s7, v13 +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] +; GFX1030-NEXT: image_bvh_intersect_ray v[5:8], [v0, v1, v2, v3, v4, v9, v14, v15], s[4:7] a16 +; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] +; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 +; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1030-NEXT: s_cbranch_execnz BB7_1 +; GFX1030-NEXT: ; %bb.2: +; GFX1030-NEXT: s_mov_b32 exec_lo, s1 +; GFX1030-NEXT: s_waitcnt vmcnt(0) +; GFX1030-NEXT: v_mov_b32_e32 v0, v5 +; GFX1030-NEXT: v_mov_b32_e32 v1, v6 +; GFX1030-NEXT: v_mov_b32_e32 v2, v7 +; GFX1030-NEXT: v_mov_b32_e32 v3, v8 +; GFX1030-NEXT: ; return to shader part epilog +; +; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: +; GFX1013: ; %bb.0: +; GFX1013-NEXT: s_mov_b32 s0, 0xffff +; GFX1013-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX1013-NEXT: v_and_b32_e32 v14, s0, v8 +; GFX1013-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX1013-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1013-NEXT: s_mov_b32 s1, exec_lo +; GFX1013-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX1013-NEXT: v_and_or_b32 v5, v6, s0, v5 +; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v14 +; GFX1013-NEXT: v_lshl_or_b32 v7, v9, 16, v8 +; GFX1013-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1013-NEXT: v_readfirstlane_b32 s4, v10 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v11 +; GFX1013-NEXT: v_readfirstlane_b32 s6, v12 +; GFX1013-NEXT: v_readfirstlane_b32 s7, v13 +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] +; GFX1013-NEXT: image_bvh_intersect_ray v[14:17], v[0:7], s[4:7] a16 +; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] +; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 +; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1013-NEXT: s_cbranch_execnz BB7_1 +; GFX1013-NEXT: ; %bb.2: +; GFX1013-NEXT: s_mov_b32 exec_lo, s1 +; GFX1013-NEXT: s_waitcnt vmcnt(0) +; GFX1013-NEXT: v_mov_b32_e32 v0, v14 +; GFX1013-NEXT: v_mov_b32_e32 v1, v15 +; GFX1013-NEXT: v_mov_b32_e32 v2, v16 +; GFX1013-NEXT: v_mov_b32_e32 v3, v17 +; GFX1013-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) { -; GCN-LABEL: image_bvh64_intersect_ray_vgpr_descr: -; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s1, exec_lo -; GCN-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s4, v15 -; GCN-NEXT: v_readfirstlane_b32 s5, v16 -; GCN-NEXT: v_readfirstlane_b32 s6, v17 -; GCN-NEXT: v_readfirstlane_b32 s7, v18 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16] -; GCN-NEXT: image_bvh64_intersect_ray v[19:22], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7] -; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18] -; GCN-NEXT: s_and_b32 s0, s0, vcc_lo -; GCN-NEXT: s_and_saveexec_b32 s0, s0 -; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GCN-NEXT: s_cbranch_execnz BB8_1 -; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b32 exec_lo, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v19 -; GCN-NEXT: v_mov_b32_e32 v1, v20 -; GCN-NEXT: v_mov_b32_e32 v2, v21 -; GCN-NEXT: v_mov_b32_e32 v3, v22 -; GCN-NEXT: ; return to shader part epilog +; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr: +; GFX1030: ; %bb.0: +; GFX1030-NEXT: s_mov_b32 s1, exec_lo +; GFX1030-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v15 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v16 +; GFX1030-NEXT: v_readfirstlane_b32 s6, v17 +; GFX1030-NEXT: v_readfirstlane_b32 s7, v18 +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16] +; GFX1030-NEXT: image_bvh64_intersect_ray v[19:22], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7] +; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18] +; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 +; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1030-NEXT: s_cbranch_execnz BB8_1 +; GFX1030-NEXT: ; %bb.2: +; GFX1030-NEXT: s_mov_b32 exec_lo, s1 +; GFX1030-NEXT: s_waitcnt vmcnt(0) +; GFX1030-NEXT: v_mov_b32_e32 v0, v19 +; GFX1030-NEXT: v_mov_b32_e32 v1, v20 +; GFX1030-NEXT: v_mov_b32_e32 v2, v21 +; GFX1030-NEXT: v_mov_b32_e32 v3, v22 +; GFX1030-NEXT: ; return to shader part epilog +; +; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr: +; GFX1013: ; %bb.0: +; GFX1013-NEXT: v_mov_b32_e32 v6, v7 +; GFX1013-NEXT: v_mov_b32_e32 v7, v8 +; GFX1013-NEXT: v_mov_b32_e32 v8, v9 +; GFX1013-NEXT: v_mov_b32_e32 v9, v11 +; GFX1013-NEXT: v_mov_b32_e32 v10, v12 +; GFX1013-NEXT: v_mov_b32_e32 v11, v13 +; GFX1013-NEXT: s_mov_b32 s1, exec_lo +; GFX1013-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX1013-NEXT: v_readfirstlane_b32 s4, v15 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v16 +; GFX1013-NEXT: v_readfirstlane_b32 s6, v17 +; GFX1013-NEXT: v_readfirstlane_b32 s7, v18 +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16] +; GFX1013-NEXT: image_bvh64_intersect_ray v[19:22], v[0:15], s[4:7] +; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18] +; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 +; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1013-NEXT: s_cbranch_execnz BB8_1 +; GFX1013-NEXT: ; %bb.2: +; GFX1013-NEXT: s_mov_b32 exec_lo, s1 +; GFX1013-NEXT: s_waitcnt vmcnt(0) +; GFX1013-NEXT: v_mov_b32_e32 v0, v19 +; GFX1013-NEXT: v_mov_b32_e32 v1, v20 +; GFX1013-NEXT: v_mov_b32_e32 v2, v21 +; GFX1013-NEXT: v_mov_b32_e32 v3, v22 +; GFX1013-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) { -; GCN-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: -; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s0, 0xffff -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v15, s0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_and_b32_e32 v16, s0, v10 -; GCN-NEXT: s_mov_b32 s1, exec_lo -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshl_or_b32 v16, v16, 16, v9 -; GCN-NEXT: v_and_or_b32 v10, v7, s0, v6 -; GCN-NEXT: v_and_or_b32 v15, v8, s0, v15 -; GCN-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s4, v11 -; GCN-NEXT: v_readfirstlane_b32 s5, v12 -; GCN-NEXT: v_readfirstlane_b32 s6, v13 -; GCN-NEXT: v_readfirstlane_b32 s7, v14 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] -; GCN-NEXT: image_bvh64_intersect_ray v[6:9], [v0, v1, v2, v3, v4, v5, v10, v15, v16], s[4:7] a16 -; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] -; GCN-NEXT: s_and_b32 s0, s0, vcc_lo -; GCN-NEXT: s_and_saveexec_b32 s0, s0 -; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GCN-NEXT: s_cbranch_execnz BB9_1 -; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b32 exec_lo, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v6 -; GCN-NEXT: v_mov_b32_e32 v1, v7 -; GCN-NEXT: v_mov_b32_e32 v2, v8 -; GCN-NEXT: v_mov_b32_e32 v3, v9 -; GCN-NEXT: ; return to shader part epilog +; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: +; GFX1030: ; %bb.0: +; GFX1030-NEXT: s_mov_b32 s0, 0xffff +; GFX1030-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX1030-NEXT: v_and_b32_e32 v15, s0, v9 +; GFX1030-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX1030-NEXT: v_and_b32_e32 v16, s0, v10 +; GFX1030-NEXT: s_mov_b32 s1, exec_lo +; GFX1030-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX1030-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX1030-NEXT: v_lshl_or_b32 v16, v16, 16, v9 +; GFX1030-NEXT: v_and_or_b32 v10, v7, s0, v6 +; GFX1030-NEXT: v_and_or_b32 v15, v8, s0, v15 +; GFX1030-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v11 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1030-NEXT: v_readfirstlane_b32 s6, v13 +; GFX1030-NEXT: v_readfirstlane_b32 s7, v14 +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] +; GFX1030-NEXT: image_bvh64_intersect_ray v[6:9], [v0, v1, v2, v3, v4, v5, v10, v15, v16], s[4:7] a16 +; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] +; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 +; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1030-NEXT: s_cbranch_execnz BB9_1 +; GFX1030-NEXT: ; %bb.2: +; GFX1030-NEXT: s_mov_b32 exec_lo, s1 +; GFX1030-NEXT: s_waitcnt vmcnt(0) +; GFX1030-NEXT: v_mov_b32_e32 v0, v6 +; GFX1030-NEXT: v_mov_b32_e32 v1, v7 +; GFX1030-NEXT: v_mov_b32_e32 v2, v8 +; GFX1030-NEXT: v_mov_b32_e32 v3, v9 +; GFX1030-NEXT: ; return to shader part epilog +; +; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: +; GFX1013: ; %bb.0: +; GFX1013-NEXT: s_mov_b32 s0, 0xffff +; GFX1013-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX1013-NEXT: v_and_b32_e32 v15, s0, v9 +; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX1013-NEXT: v_and_b32_e32 v10, s0, v10 +; GFX1013-NEXT: s_mov_b32 s1, exec_lo +; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX1013-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v6 +; GFX1013-NEXT: v_and_or_b32 v7, v8, s0, v15 +; GFX1013-NEXT: v_lshl_or_b32 v8, v10, 16, v9 +; GFX1013-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1 +; GFX1013-NEXT: v_readfirstlane_b32 s4, v11 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1013-NEXT: v_readfirstlane_b32 s6, v13 +; GFX1013-NEXT: v_readfirstlane_b32 s7, v14 +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] +; GFX1013-NEXT: s_waitcnt vmcnt(0) +; GFX1013-NEXT: image_bvh64_intersect_ray v[15:18], v[0:15], s[4:7] a16 +; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] +; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 +; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1013-NEXT: s_cbranch_execnz BB9_1 +; GFX1013-NEXT: ; %bb.2: +; GFX1013-NEXT: s_mov_b32 exec_lo, s1 +; GFX1013-NEXT: s_waitcnt vmcnt(0) +; GFX1013-NEXT: v_mov_b32_e32 v0, v15 +; GFX1013-NEXT: v_mov_b32_e32 v1, v16 +; GFX1013-NEXT: v_mov_b32_e32 v2, v17 +; GFX1013-NEXT: v_mov_b32_e32 v3, v18 +; GFX1013-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r @@ -294,29 +447,29 @@ ; GFX1013-NEXT: s_clause 0x1 ; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1013-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX1013-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GFX1013-NEXT: v_mov_b32_e32 v6, 0 -; GFX1013-NEXT: v_mov_b32_e32 v7, 1.0 -; GFX1013-NEXT: v_mov_b32_e32 v8, 2.0 -; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40400000 -; GFX1013-NEXT: v_mov_b32_e32 v10, 4.0 -; GFX1013-NEXT: v_mov_b32_e32 v11, 0x40a00000 -; GFX1013-NEXT: v_mov_b32_e32 v12, 0x40c00000 -; GFX1013-NEXT: v_mov_b32_e32 v13, 0x40e00000 -; GFX1013-NEXT: v_mov_b32_e32 v14, 0x41000000 +; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 +; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 +; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 +; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, s4 ; GFX1013-NEXT: v_mov_b32_e32 v1, s5 ; GFX1013-NEXT: v_mov_b32_e32 v2, s6 ; GFX1013-NEXT: v_mov_b32_e32 v3, s7 -; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 +; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 +; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX1013-NEXT: flat_load_dword v4, v[0:1] -; GFX1013-NEXT: flat_load_dword v5, v[2:3] +; GFX1013-NEXT: flat_load_dword v0, v[4:5] +; GFX1013-NEXT: flat_load_dword v1, v[2:3] +; GFX1013-NEXT: v_mov_b32_e32 v2, 0 +; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 +; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 +; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[4:19], s[8:11] +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[8:11] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm @@ -392,7 +545,7 @@ ; GFX1013-NEXT: s_clause 0x1 ; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1013-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX1013-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: s_movk_i32 s1, 0x4400 ; GFX1013-NEXT: s_movk_i32 s2, 0x4200 ; GFX1013-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -404,32 +557,32 @@ ; GFX1013-NEXT: s_or_b32 s1, s2, s1 ; GFX1013-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX1013-NEXT: s_lshl_b32 s3, s3, 16 -; GFX1013-NEXT: v_mov_b32_e32 v6, 0 -; GFX1013-NEXT: v_mov_b32_e32 v7, 1.0 -; GFX1013-NEXT: v_mov_b32_e32 v8, 2.0 -; GFX1013-NEXT: v_mov_b32_e32 v9, s1 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, s4 ; GFX1013-NEXT: v_mov_b32_e32 v1, s5 ; GFX1013-NEXT: v_mov_b32_e32 v2, s6 ; GFX1013-NEXT: v_mov_b32_e32 v3, s7 ; GFX1013-NEXT: s_movk_i32 s5, 0x4600 -; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 ; GFX1013-NEXT: s_movk_i32 s4, 0x4700 -; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 ; GFX1013-NEXT: s_bfe_u32 s2, s5, 0x100000 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX1013-NEXT: s_lshl_b32 s2, s2, 16 ; GFX1013-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX1013-NEXT: s_or_b32 s0, s0, s2 -; GFX1013-NEXT: flat_load_dword v4, v[0:1] -; GFX1013-NEXT: flat_load_dword v5, v[2:3] +; GFX1013-NEXT: flat_load_dword v0, v[4:5] +; GFX1013-NEXT: flat_load_dword v1, v[2:3] ; GFX1013-NEXT: s_or_b32 s2, s4, s3 -; GFX1013-NEXT: v_mov_b32_e32 v10, s0 -; GFX1013-NEXT: v_mov_b32_e32 v11, s2 +; GFX1013-NEXT: v_mov_b32_e32 v2, 0 +; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 +; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 +; GFX1013-NEXT: v_mov_b32_e32 v5, s1 +; GFX1013-NEXT: v_mov_b32_e32 v6, s0 +; GFX1013-NEXT: v_mov_b32_e32 v7, s2 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[4:11], s[8:11] a16 +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm