diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4757,14 +4757,22 @@
     return false;
   }
 
-  bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
-  bool Is64 =  MRI.getType(NodePtr).getSizeInBits() == 64;
-  unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
-                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
-                          : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
-                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
-
-  SmallVector<Register, 12> Ops;
+  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
+  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
+  const unsigned NumVAddrs = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
+  const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
+  const unsigned Opcodes[2][2][2] = {
+      {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa,
+        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa},
+       {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa,
+        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa}},
+      {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa,
+        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa},
+       {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa,
+        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa}}};
+  const unsigned Opcode = Opcodes[UseNSA][IsA16][Is64];
+
+  SmallVector<Register, 16> Ops;
   if (Is64) {
     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
     Ops.push_back(Unmerge.getReg(0));
@@ -4799,6 +4807,23 @@
     packLanes(RayInvDir);
   }
 
+  if (!UseNSA) {
+    // Zero-pad to 8 or 16 dwords, then merge every operand into one vector.
+    const unsigned LaneCount = NumVAddrs <= 8 ? 8 : 16;
+
+    while (Ops.size() < LaneCount) {
+      Register R = MRI.createGenericVirtualRegister(S32);
+      B.buildConstant(R, 0);
+      Ops.push_back(R);
+    }
+
+    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
+    Register MergedOps = MRI.createGenericVirtualRegister(OpTy);
+    B.buildMerge(MergedOps, Ops);
+    Ops.clear();
+    Ops.push_back(MergedOps);
+  }
+
   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
     .addDef(DstReg)
     .addImm(Opcode);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4254,8 +4254,19 @@
     unsigned N = MI.getNumExplicitOperands() - 2;
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
     OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
-    for (unsigned I = 2; I < N; ++I)
-      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+    if (N == 3) {
+      // Sequential form: all operands combined into VGPR256/VGPR512. Map the
+      // merged operand at its actual width instead of always claiming 512
+      // bits; anything wider than 256 bits is rounded up to 512.
+      unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+      if (Size > 256)
+        Size = 512;
+      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+    } else {
+      // NSA form
+      for (unsigned I = 2; I < N; ++I)
+        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+    }
     break;
   }
   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7336,7 +7336,6 @@
                                    Op->getVTList(), Ops, VT, M->getMemOperand());
   }
   case Intrinsic::amdgcn_image_bvh_intersect_ray: {
-    SDLoc DL(Op);
     MemSDNode *M = cast<MemSDNode>(Op);
     SDValue NodePtr = M->getOperand(2);
     SDValue RayExtent = M->getOperand(3);
@@ -7355,12 +7354,21 @@
       return SDValue();
     }
 
-    bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
-    bool Is64 = NodePtr.getValueType() == MVT::i64;
-    unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
-                                   : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
-                            : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
-                                   : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+    const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
+    const bool Is64 = NodePtr.getValueType() == MVT::i64;
+    const unsigned NumVAddrs = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
+    const bool UseNSA =
+        Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize();
+    const unsigned Opcodes[2][2][2] = {
+        {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa,
+          AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa},
+         {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa,
+          AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa}},
+        {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa,
+          AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa},
+         {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa,
+          AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa}}};
+    const unsigned Opcode = Opcodes[UseNSA][IsA16][Is64];
 
     SmallVector<SDValue, 16> Ops;
 
@@ -7400,6 +7408,19 @@
     packLanes(RayOrigin, true);
     packLanes(RayDir, true);
     packLanes(RayInvDir, false);
+
+    if (!UseNSA) {
+      // Zero-pad to 8 or 16 dwords, then merge every operand into one vector.
+      const unsigned LaneCount = NumVAddrs <= 8 ? 8 : 16;
+      while (Ops.size() < LaneCount)
+        Ops.push_back(DAG.getConstant(0, DL, MVT::i32));
+
+      SDValue MergedOps = DAG.getBuildVector(
+          Ops.size() == 16 ? MVT::v16i32 : MVT::v8i32, DL, Ops);
+      Ops.clear();
+      Ops.push_back(MergedOps);
+    }
+
     Ops.push_back(TDescr);
     if (IsA16)
       Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1));
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1013 %s
 ; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s
 
 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
@@ -14,11 +14,28 @@
 declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
 
 define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
-; GCN-LABEL: image_bvh_intersect_ray:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    ; return to shader part epilog
+; GFX1030-LABEL: image_bvh_intersect_ray:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3]
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh_intersect_ray:
+; GFX1013:       ; %bb.0:
+; GFX1013-NEXT:    v_mov_b32_e32 v9, v11
+; GFX1013-NEXT:    v_mov_b32_e32 v11, 0
+; GFX1013-NEXT:    v_mov_b32_e32 v5, v6
+; GFX1013-NEXT:    v_mov_b32_e32 v6, v7
+; GFX1013-NEXT:    v_mov_b32_e32 v7, v8
+; GFX1013-NEXT:    v_mov_b32_e32 v8, v10
+; GFX1013-NEXT:    v_mov_b32_e32 v10, v12
+; GFX1013-NEXT:    v_mov_b32_e32 v12, v11
+; GFX1013-NEXT:    v_mov_b32_e32 v13, v11
+; GFX1013-NEXT:    v_mov_b32_e32 v14, v11
+; GFX1013-NEXT:    v_mov_b32_e32 v15, v11
+; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
+; GFX1013-NEXT:    s_waitcnt vmcnt(0)
+; GFX1013-NEXT:    ; return to shader part epilog
 ; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
   %r = bitcast <4 x i32> %v to <4 x float>
@@ -47,168 +64,359 @@
 }
 
 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
-; GCN-LABEL: image_bvh64_intersect_ray:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    ; return to shader part epilog
+; GFX1030-LABEL: image_bvh64_intersect_ray:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3]
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh64_intersect_ray:
+; GFX1013:       ; %bb.0:
+; GFX1013-NEXT:    v_mov_b32_e32 v10, v12
+; GFX1013-NEXT:    v_mov_b32_e32 v12, 0
+; GFX1013-NEXT:    v_mov_b32_e32 v6, v7
+; GFX1013-NEXT:    v_mov_b32_e32 v7, v8
+; GFX1013-NEXT:    v_mov_b32_e32 v8, v9
+; GFX1013-NEXT:    v_mov_b32_e32 v9, v11
+; GFX1013-NEXT:    v_mov_b32_e32 v11, v13
+; GFX1013-NEXT:    v_mov_b32_e32 v13, v12
+; GFX1013-NEXT:    v_mov_b32_e32 v14, v12
+; GFX1013-NEXT:    v_mov_b32_e32 v15, v12
+; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
+; GFX1013-NEXT:    s_waitcnt vmcnt(0)
+; GFX1013-NEXT:    ; return to shader part epilog
   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
   %r = bitcast <4 x i32> %v to <4 x float>
   ret <4 x float> %r
 }
 
 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
-; GCN-LABEL: image_bvh64_intersect_ray_a16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s4, 0xffff
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
-; GCN-NEXT:    v_and_b32_e32 v11, s4, v9
-; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT:    v_and_b32_e32 v10, s4, v10
-; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NEXT:    v_and_or_b32 v6, v7, s4, v6
-; GCN-NEXT:    v_and_or_b32 v7, v8, s4, v11
-; GCN-NEXT:    v_lshl_or_b32 v8, v10, 16, v9
-; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    ; return to shader part epilog
+; GFX1030-LABEL: image_bvh64_intersect_ray_a16:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_mov_b32 s4, 0xffff
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
+; GFX1030-NEXT:    v_and_b32_e32 v11, s4, v9
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX1030-NEXT:    v_and_b32_e32 v10, s4, v10
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX1030-NEXT:    v_and_or_b32 v6, v7, s4, v6
+; GFX1030-NEXT:    v_and_or_b32 v7, v8, s4, v11
+; GFX1030-NEXT:    v_lshl_or_b32 v8, v10, 16, v9
+; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh64_intersect_ray_a16:
+; GFX1013:       ; %bb.0:
+; GFX1013-NEXT:    s_mov_b32 s4, 0xffff
+; GFX1013-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
+; GFX1013-NEXT:    v_and_b32_e32 v11, s4, v9
+; GFX1013-NEXT:    v_lshrrev_b32_e32 v12, 16, v9
+; GFX1013-NEXT:    v_mov_b32_e32 v9, 0
+; GFX1013-NEXT:    v_and_b32_e32 v10, s4, v10
+; GFX1013-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX1013-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX1013-NEXT:    v_mov_b32_e32 v13, v9
+; GFX1013-NEXT:    v_mov_b32_e32 v14, v9
+; GFX1013-NEXT:    v_and_or_b32 v6, v7, s4, v6
+; GFX1013-NEXT:    v_and_or_b32 v7, v8, s4, v11
+; GFX1013-NEXT:    v_lshl_or_b32 v8, v10, 16, v12
+; GFX1013-NEXT:    v_mov_b32_e32 v10, v9
+; GFX1013-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1013-NEXT:    v_mov_b32_e32 v12, v9
+; GFX1013-NEXT:    v_mov_b32_e32 v15, v9
+; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
+; GFX1013-NEXT:    s_waitcnt vmcnt(0)
+; GFX1013-NEXT:    ; return to shader part epilog
   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
   %r = bitcast <4 x i32> %v to <4 x float>
   ret <4 x float> %r
 }
 
 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
-; GCN-LABEL: image_bvh_intersect_ray_vgpr_descr:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s1, exec_lo
-; GCN-NEXT:  BB4_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s4, v14
-; GCN-NEXT:    v_readfirstlane_b32 s5, v15
-; GCN-NEXT:    v_readfirstlane_b32 s6, v16
-; GCN-NEXT:    v_readfirstlane_b32 s7, v17
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]
-; GCN-NEXT:    image_bvh_intersect_ray v[18:21], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[4:7]
-; GCN-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
-; GCN-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GCN-NEXT:    s_and_saveexec_b32 s0, s0
-; GCN-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GCN-NEXT:    s_cbranch_execnz BB4_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b32 exec_lo, s1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, v18
-; GCN-NEXT:    v_mov_b32_e32 v1, v19
-; GCN-NEXT:    v_mov_b32_e32 v2, v20
-; GCN-NEXT:    v_mov_b32_e32 v3, v21
-; GCN-NEXT:    ; return to shader part epilog
+; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1030-NEXT:  BB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1030-NEXT:    v_readfirstlane_b32 s4, v14
+; GFX1030-NEXT:    v_readfirstlane_b32 s5, v15
+; GFX1030-NEXT:    v_readfirstlane_b32 s6, v16
+; GFX1030-NEXT:    v_readfirstlane_b32 s7, v17
+; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]
+; GFX1030-NEXT:    image_bvh_intersect_ray v[18:21], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[4:7]
+; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
+; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1030-NEXT:    s_cbranch_execnz BB4_1
+; GFX1030-NEXT:  ; %bb.2:
+; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v0, v18
+; GFX1030-NEXT:    v_mov_b32_e32 v1, v19
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v20
+; GFX1030-NEXT:    v_mov_b32_e32 v3, v21
+; GFX1030-NEXT:    ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr:
+; GFX1013:       ; %bb.0:
+; GFX1013-NEXT:    v_mov_b32_e32 v9, v11
+; GFX1013-NEXT:    v_mov_b32_e32 v11, 0
+; GFX1013-NEXT:    v_mov_b32_e32 v5, v6
+; GFX1013-NEXT:    v_mov_b32_e32 v6, v7
+; GFX1013-NEXT:    v_mov_b32_e32 v7, v8
+; GFX1013-NEXT:    v_mov_b32_e32 v8, v10
+; GFX1013-NEXT:    v_mov_b32_e32 v10, v12
+; GFX1013-NEXT:    v_mov_b32_e32 v18, v14
+; GFX1013-NEXT:    v_mov_b32_e32 v19, v15
+; GFX1013-NEXT:    v_mov_b32_e32 v12, v11
+; GFX1013-NEXT:    v_mov_b32_e32 v13, v11
+; GFX1013-NEXT:    v_mov_b32_e32 v14, v11
+; GFX1013-NEXT:    v_mov_b32_e32 v15, v11
+; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1013-NEXT:  BB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1013-NEXT:    v_readfirstlane_b32 s4, v18
+; GFX1013-NEXT:    v_readfirstlane_b32 s5, v19
+; GFX1013-NEXT:    v_readfirstlane_b32 s6, v16
+; GFX1013-NEXT:    v_readfirstlane_b32 s7, v17
+; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19]
+; GFX1013-NEXT:    image_bvh_intersect_ray v[20:23], v[0:15], s[4:7]
+; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
+; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1013-NEXT:    s_cbranch_execnz BB4_1
+; GFX1013-NEXT:  ; %bb.2:
+; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1013-NEXT:    s_waitcnt vmcnt(0)
+; GFX1013-NEXT:    v_mov_b32_e32 v0, v20
+; GFX1013-NEXT:    v_mov_b32_e32 v1, v21
+; GFX1013-NEXT:    v_mov_b32_e32 v2, v22
+; GFX1013-NEXT:    v_mov_b32_e32 v3, v23
+; GFX1013-NEXT:    ; return to shader part epilog
   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
   %r = bitcast <4 x i32> %v to <4 x float>
   ret <4 x float> %r
 }
 
 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
-; GCN-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s0, 0xffff
-; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
-; GCN-NEXT:    v_and_b32_e32 v14, s0, v8
-; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT:    v_and_b32_e32 v15, s0, v9
-; GCN-NEXT:    s_mov_b32 s1, exec_lo
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GCN-NEXT:    v_lshl_or_b32 v15, v15, 16, v8
-; GCN-NEXT:    v_and_or_b32 v9, v6, s0, v5
-; GCN-NEXT:    v_and_or_b32 v14, v7, s0, v14
-; GCN-NEXT:  BB5_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s4, v10
-; GCN-NEXT:    v_readfirstlane_b32 s5, v11
-; GCN-NEXT:    v_readfirstlane_b32 s6, v12
-; GCN-NEXT:    v_readfirstlane_b32 s7, v13
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
-; GCN-NEXT:    image_bvh_intersect_ray v[5:8], [v0, v1, v2, v3, v4, v9, v14, v15], s[4:7] a16
-; GCN-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
-; GCN-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GCN-NEXT:    s_and_saveexec_b32 s0, s0
-; GCN-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GCN-NEXT:    s_cbranch_execnz BB5_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b32 exec_lo, s1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, v5
-; GCN-NEXT:    v_mov_b32_e32 v1, v6
-; GCN-NEXT:    v_mov_b32_e32 v2, v7
-; GCN-NEXT:    v_mov_b32_e32 v3, v8
-; GCN-NEXT:    ; return to shader part epilog
+; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_mov_b32 s0, 0xffff
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
+; GFX1030-NEXT:    v_and_b32_e32 v14, s0, v8
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX1030-NEXT:    v_and_b32_e32 v15, s0, v9
+; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX1030-NEXT:    v_lshl_or_b32 v15, v15, 16, v8
+; GFX1030-NEXT:    v_and_or_b32 v9, v6, s0, v5
+; GFX1030-NEXT:    v_and_or_b32 v14, v7, s0, v14
+; GFX1030-NEXT:  BB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1030-NEXT:    v_readfirstlane_b32 s4, v10
+; GFX1030-NEXT:    v_readfirstlane_b32 s5, v11
+; GFX1030-NEXT:    v_readfirstlane_b32 s6, v12
+; GFX1030-NEXT:    v_readfirstlane_b32 s7, v13
+; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
+; GFX1030-NEXT:    image_bvh_intersect_ray v[5:8], [v0, v1, v2, v3, v4, v9, v14, v15], s[4:7] a16
+; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
+; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1030-NEXT:    s_cbranch_execnz BB5_1
+; GFX1030-NEXT:  ; %bb.2:
+; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v0, v5
+; GFX1030-NEXT:    v_mov_b32_e32 v1, v6
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v7
+; GFX1030-NEXT:    v_mov_b32_e32 v3, v8
+; GFX1030-NEXT:    ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
+; GFX1013:       ; %bb.0:
+; GFX1013-NEXT:    s_mov_b32 s0, 0xffff
+; GFX1013-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
+; GFX1013-NEXT:    v_and_b32_e32 v14, s0, v8
+; GFX1013-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX1013-NEXT:    v_and_b32_e32 v9, s0, v9
+; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1013-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX1013-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX1013-NEXT:    v_and_or_b32 v5, v6, s0, v5
+; GFX1013-NEXT:    v_and_or_b32 v6, v7, s0, v14
+; GFX1013-NEXT:    v_lshl_or_b32 v7, v9, 16, v8
+; GFX1013-NEXT:  BB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1013-NEXT:    v_readfirstlane_b32 s4, v10
+; GFX1013-NEXT:    v_readfirstlane_b32 s5, v11
+; GFX1013-NEXT:    v_readfirstlane_b32 s6, v12
+; GFX1013-NEXT:    v_readfirstlane_b32 s7, v13
+; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
+; GFX1013-NEXT:    image_bvh_intersect_ray v[14:17], v[0:7], s[4:7] a16
+; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
+; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1013-NEXT:    s_cbranch_execnz BB5_1
+; GFX1013-NEXT:  ; %bb.2:
+; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1013-NEXT:    s_waitcnt vmcnt(0)
+; GFX1013-NEXT:    v_mov_b32_e32 v0, v14
+; GFX1013-NEXT:    v_mov_b32_e32 v1, v15
+; GFX1013-NEXT:    v_mov_b32_e32 v2, v16
+; GFX1013-NEXT:    v_mov_b32_e32 v3, v17
+; GFX1013-NEXT:    ; return to shader part epilog
   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
   %r = bitcast <4 x i32> %v to <4 x float>
   ret <4 x float> %r
 }
 
 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
-; GCN-LABEL: image_bvh64_intersect_ray_vgpr_descr:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s1, exec_lo
-; GCN-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s4, v15
-; GCN-NEXT:    v_readfirstlane_b32 s5, v16
-; GCN-NEXT:    v_readfirstlane_b32 s6, v17
-; GCN-NEXT:    v_readfirstlane_b32 s7, v18
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]
-; GCN-NEXT:    image_bvh64_intersect_ray v[19:22], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7]
-; GCN-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
-; GCN-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GCN-NEXT:    s_and_saveexec_b32 s0, s0
-; GCN-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GCN-NEXT:    s_cbranch_execnz BB6_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b32 exec_lo, s1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, v19
-; GCN-NEXT:    v_mov_b32_e32 v1, v20
-; GCN-NEXT:    v_mov_b32_e32 v2, v21
-; GCN-NEXT:    v_mov_b32_e32 v3, v22
-; GCN-NEXT:    ; return to shader part epilog
+; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1030-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1030-NEXT:    v_readfirstlane_b32 s4, v15
+; GFX1030-NEXT:    v_readfirstlane_b32 s5, v16
+; GFX1030-NEXT:    v_readfirstlane_b32 s6, v17
+; GFX1030-NEXT:    v_readfirstlane_b32 s7, v18
+; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]
+; GFX1030-NEXT:    image_bvh64_intersect_ray v[19:22], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7]
+; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
+; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1030-NEXT:    s_cbranch_execnz BB6_1
+; GFX1030-NEXT:  ; %bb.2:
+; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v0, v19
+; GFX1030-NEXT:    v_mov_b32_e32 v1, v20
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v21
+; GFX1030-NEXT:    v_mov_b32_e32 v3, v22
+; GFX1030-NEXT:    ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr:
+; GFX1013:       ; %bb.0:
+; GFX1013-NEXT:    v_mov_b32_e32 v10, v12
+; GFX1013-NEXT:    v_mov_b32_e32 v12, 0
+; GFX1013-NEXT:    v_mov_b32_e32 v6, v7
+; GFX1013-NEXT:    v_mov_b32_e32 v7, v8
+; GFX1013-NEXT:    v_mov_b32_e32 v8, v9
+; GFX1013-NEXT:    v_mov_b32_e32 v9, v11
+; GFX1013-NEXT:    v_mov_b32_e32 v11, v13
+; GFX1013-NEXT:    v_mov_b32_e32 v19, v15
+; GFX1013-NEXT:    v_mov_b32_e32 v20, v16
+; GFX1013-NEXT:    v_mov_b32_e32 v13, v12
+; GFX1013-NEXT:    v_mov_b32_e32 v14, v12
+; GFX1013-NEXT:    v_mov_b32_e32 v15, v12
+; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1013-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1013-NEXT:    v_readfirstlane_b32 s4, v19
+; GFX1013-NEXT:    v_readfirstlane_b32 s5, v20
+; GFX1013-NEXT:    v_readfirstlane_b32 s6, v17
+; GFX1013-NEXT:    v_readfirstlane_b32 s7, v18
+; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[19:20]
+; GFX1013-NEXT:    image_bvh64_intersect_ray v[21:24], v[0:15], s[4:7]
+; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
+; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1013-NEXT:    s_cbranch_execnz BB6_1
+; GFX1013-NEXT:  ; %bb.2:
+; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1013-NEXT:    s_waitcnt vmcnt(0)
+; GFX1013-NEXT:    v_mov_b32_e32 v0, v21
+; GFX1013-NEXT:    v_mov_b32_e32 v1, v22
+; GFX1013-NEXT:    v_mov_b32_e32 v2, v23
+; GFX1013-NEXT:    v_mov_b32_e32 v3, v24
+; GFX1013-NEXT:    ; return to shader part epilog
   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
   %r = bitcast <4 x i32> %v to <4 x float>
   ret <4 x float> %r
 }
 
 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
-; GCN-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s0, 0xffff
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
-; GCN-NEXT:    v_and_b32_e32 v15, s0, v9
-; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT:    v_and_b32_e32 v16, s0, v10
-; GCN-NEXT:    s_mov_b32 s1, exec_lo
-; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    v_lshl_or_b32 v16, v16, 16, v9
-; GCN-NEXT:    v_and_or_b32 v10, v7, s0, v6
-; GCN-NEXT:    v_and_or_b32 v15, v8, s0, v15
-; GCN-NEXT:  BB7_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s4, v11
-; GCN-NEXT:    v_readfirstlane_b32 s5, v12
-; GCN-NEXT:    v_readfirstlane_b32 s6, v13
-; GCN-NEXT:    v_readfirstlane_b32 s7, v14
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
-; GCN-NEXT:    image_bvh64_intersect_ray v[6:9], [v0, v1, v2, v3, v4, v5, v10, v15, v16], s[4:7] a16
-; GCN-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
-; GCN-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GCN-NEXT:    s_and_saveexec_b32 s0, s0
-; GCN-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GCN-NEXT:    s_cbranch_execnz BB7_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b32 exec_lo, s1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, v6
-; GCN-NEXT:    v_mov_b32_e32 v1, v7
-; GCN-NEXT:    v_mov_b32_e32 v2, v8
-; GCN-NEXT:    v_mov_b32_e32 v3, v9
-; GCN-NEXT:    ; return to shader part epilog
+; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_mov_b32 s0, 0xffff
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
+; GFX1030-NEXT:    v_and_b32_e32 v15, s0, v9
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX1030-NEXT:    v_and_b32_e32 v16, s0, v10
+; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX1030-NEXT:    v_lshl_or_b32 v16, v16, 16, v9
+; GFX1030-NEXT:    v_and_or_b32 v10, v7, s0, v6
+; GFX1030-NEXT:    v_and_or_b32 v15, v8, s0, v15
+; GFX1030-NEXT:  BB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1030-NEXT:    v_readfirstlane_b32 s4, v11
+; GFX1030-NEXT:    v_readfirstlane_b32 s5, v12
+; GFX1030-NEXT:    v_readfirstlane_b32 s6, v13
+; GFX1030-NEXT:    v_readfirstlane_b32 s7, v14
+; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
+; GFX1030-NEXT:    image_bvh64_intersect_ray v[6:9], [v0, v1, v2, v3, v4, v5, v10, v15, v16], s[4:7] a16
+; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
+; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1030-NEXT:    s_cbranch_execnz BB7_1
+; GFX1030-NEXT:  ; %bb.2:
+; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v0, v6
+; GFX1030-NEXT:    v_mov_b32_e32 v1, v7
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v8
+; GFX1030-NEXT:    v_mov_b32_e32 v3, v9
+; GFX1030-NEXT:    ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
+; GFX1013:       ; %bb.0:
+; GFX1013-NEXT:    s_mov_b32 s0, 0xffff
+; GFX1013-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
+; GFX1013-NEXT:    v_mov_b32_e32 v16, v11
+; GFX1013-NEXT:    v_and_b32_e32 v11, s0, v9
+; GFX1013-NEXT:    v_mov_b32_e32 v17, v12
+; GFX1013-NEXT:    v_lshrrev_b32_e32 v12, 16, v9
+; GFX1013-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX1013-NEXT:    v_mov_b32_e32 v9, 0
+; GFX1013-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX1013-NEXT:    v_and_b32_e32 v10, s0, v10
+; GFX1013-NEXT:    v_mov_b32_e32 v18, v13
+; GFX1013-NEXT:    v_mov_b32_e32 v19, v14
+; GFX1013-NEXT:    v_and_or_b32 v6, v7, s0, v6
+; GFX1013-NEXT:    v_and_or_b32 v7, v8, s0, v11
+; GFX1013-NEXT:    v_lshl_or_b32 v8, v10, 16, v12
+; GFX1013-NEXT:    v_mov_b32_e32 v10, v9
+; GFX1013-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1013-NEXT:    v_mov_b32_e32 v12, v9
+; GFX1013-NEXT:    v_mov_b32_e32 v13, v9
+; GFX1013-NEXT:    v_mov_b32_e32 v14, v9
+; GFX1013-NEXT:    v_mov_b32_e32 v15, v9
+; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1013-NEXT:  BB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1013-NEXT:    v_readfirstlane_b32 s4, v16
+; GFX1013-NEXT:    v_readfirstlane_b32 s5, v17
+; GFX1013-NEXT:    v_readfirstlane_b32 s6, v18
+; GFX1013-NEXT:    v_readfirstlane_b32 s7, v19
+; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17]
+; GFX1013-NEXT:    image_bvh64_intersect_ray v[20:23], v[0:15], s[4:7] a16
+; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[18:19]
+; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1013-NEXT:    s_cbranch_execnz BB7_1
+; GFX1013-NEXT:  ; %bb.2:
+; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1013-NEXT:    s_waitcnt vmcnt(0)
+; GFX1013-NEXT:    v_mov_b32_e32 v0, v20
+; GFX1013-NEXT:    v_mov_b32_e32 v1, v21
+; GFX1013-NEXT:    v_mov_b32_e32 v2, v22
+; GFX1013-NEXT:    v_mov_b32_e32 v3, v23
+; GFX1013-NEXT:    ; return to shader part epilog
   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
   %r = bitcast <4 x i32> %v to <4 x float>
   ret <4 x float> %r