Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -143,6 +143,7 @@ bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const; bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const; bool selectGlobalAtomicFaddIntrinsic(MachineInstr &I) const; + bool selectBVHIntrinsic(MachineInstr &I) const; std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root) const; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1746,6 +1746,8 @@ return selectSBarrier(I); case Intrinsic::amdgcn_global_atomic_fadd: return selectGlobalAtomicFaddIntrinsic(I); + case Intrinsic::amdgcn_image_bvh_intersect_ray: + return selectBVHIntrinsic(I); default: { return selectImpl(I, *CoverageInfo); } @@ -3019,6 +3021,74 @@ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } +bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const { + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + Register DstReg = MI.getOperand(0).getReg(); + Register NodePtr = MI.getOperand(2).getReg(); + Register RayExtent = MI.getOperand(3).getReg(); + Register RayOrigin = MI.getOperand(4).getReg(); + Register RayDir = MI.getOperand(5).getReg(); + Register RayInvDir = MI.getOperand(6).getReg(); + Register TDescr = MI.getOperand(7).getReg(); + + bool IsA16 = MRI->getType(RayDir).getElementType().getSizeInBits() == 16; + bool Is64 = MRI->getType(NodePtr).getSizeInBits() == 64; + unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa + : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa + : Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa + : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; + + auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode), DstReg); + if (Is64) { + MIB.addReg(NodePtr, 0, AMDGPU::sub0) + .addReg(NodePtr, 0, AMDGPU::sub1); + } else { + MIB.addReg(NodePtr); + } + + MIB.addReg(RayExtent); + + auto packLanes = [&MIB] (Register Src) { + MIB.addReg(Src, 0, AMDGPU::sub0); + MIB.addReg(Src, 0, AMDGPU::sub1); + MIB.addReg(Src, 0, AMDGPU::sub2); + }; + + packLanes(RayOrigin); + if (IsA16) { + MIB.addReg(RayDir, 0, AMDGPU::sub0); + Register R1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register R2 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::V_PACK_B32_F16), R1) + .addImm(0) + .addReg(RayDir, 0, AMDGPU::sub1) + .addImm(0) + .addReg(RayInvDir, 0, AMDGPU::sub0) + .addImm(0) + .addImm(0); + BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), R2) + .addReg(RayInvDir, 0, AMDGPU::sub1) + .addReg(RayInvDir, 0, AMDGPU::sub0) + .addImm(16); + MIB.addReg(R1); + MIB.addReg(R2); + } else { + packLanes(RayDir); + packLanes(RayInvDir); + } + + MIB.addReg(TDescr); + if (IsA16) + MIB.addImm(1); + + MIB.cloneMemRefs(MI); + + MI.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::select(MachineInstr &I) { if (I.isPHI()) return selectPHI(I); Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3094,6 +3094,10 @@ constrainOpWithReadfirstlane(MI, MRI, 2); return; } + case Intrinsic::amdgcn_image_bvh_intersect_ray: { + executeInWaterfallLoop(MI, MRI, { 7 }); + return; + } default: { if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID)) { @@ -4377,6 +4381,18 @@ OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); break; } + case Intrinsic::amdgcn_image_bvh_intersect_ray: { + unsigned 
PtrSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned DirSize = getSizeInBits(MI.getOperand(5).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128); + OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DirSize); + OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DirSize); + OpdsMapping[7] = getSGPROpMapping(MI.getOperand(7).getReg(), MRI, *TRI); + break; + } default: return getInvalidInstructionMapping(); } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -0,0 +1,194 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr) +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr) +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr) +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr) + +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>) +declare <4 x i32> 
@llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>) + +define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { +; GCN-LABEL: image_bvh_intersect_ray: +; GCN: ; %bb.0: +; GCN-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3] +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %r = bitcast <4 x i32> %v to <4 x float> + ret <4 x float> %r +} + +define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { +; GCN-LABEL: image_bvh_intersect_ray_a16: +; GCN: ; %bb.0: +; GCN-NEXT: v_pack_b32_f16 v5, v7, v8 +; GCN-NEXT: v_alignbit_b32 v7, v9, v8, 16 +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v5, v7], s[0:3] a16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %r = bitcast <4 x i32> %v to <4 x float> + ret <4 x float> %r +} + +define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, 
<4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { +; GCN-LABEL: image_bvh64_intersect_ray: +; GCN: ; %bb.0: +; GCN-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3] +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %r = bitcast <4 x i32> %v to <4 x float> + ret <4 x float> %r +} + +define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { +; GCN-LABEL: image_bvh64_intersect_ray_a16: +; GCN: ; %bb.0: +; GCN-NEXT: v_pack_b32_f16 v6, v8, v9 +; GCN-NEXT: v_alignbit_b32 v8, v10, v9, 16 +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v6, v8], s[0:3] a16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %r = bitcast <4 x i32> %v to <4 x float> + ret <4 x float> %r +} + +define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) { +; GCN-LABEL: image_bvh_intersect_ray_vgpr_descr: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s1, exec_lo +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: BB4_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s4, v14 +; GCN-NEXT: v_readfirstlane_b32 s5, v15 +; GCN-NEXT: v_readfirstlane_b32 s6, v16 +; GCN-NEXT: v_readfirstlane_b32 s7, v17 +; GCN-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15] +; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17] +; GCN-NEXT: s_nop 2 +; GCN-NEXT: image_bvh_intersect_ray v[18:21], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[4:7] +; GCN-NEXT: s_and_b32 s0, s0, vcc_lo +; GCN-NEXT: s_and_saveexec_b32 s0, s0 +; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GCN-NEXT: s_cbranch_execnz BB4_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b32 exec_lo, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v18 +; GCN-NEXT: v_mov_b32_e32 v1, v19 +; GCN-NEXT: v_mov_b32_e32 v2, v20 +; GCN-NEXT: v_mov_b32_e32 v3, v21 +; GCN-NEXT: ; return to shader part epilog + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %r = bitcast <4 x i32> %v to <4 x float> + ret <4 x float> %r +} + +define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) { +; GCN-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s1, exec_lo +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: BB5_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s4, v10 +; GCN-NEXT: v_readfirstlane_b32 s5, v11 +; GCN-NEXT: v_readfirstlane_b32 s6, v12 +; GCN-NEXT: v_readfirstlane_b32 s7, v13 +; GCN-NEXT: v_pack_b32_f16 v5, v7, v8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v14, v9, v8, 16 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] +; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: image_bvh_intersect_ray v[14:17], [v0, v1, v2, v3, v4, v6, v5, v14], s[4:7] a16 +; GCN-NEXT: s_and_b32 s0, s0, vcc_lo +; GCN-NEXT: s_and_saveexec_b32 s0, s0 +; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GCN-NEXT: s_cbranch_execnz BB5_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: 
s_mov_b32 exec_lo, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v14 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v17 +; GCN-NEXT: ; return to shader part epilog + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %r = bitcast <4 x i32> %v to <4 x float> + ret <4 x float> %r +} + +define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) { +; GCN-LABEL: image_bvh64_intersect_ray_vgpr_descr: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s1, exec_lo +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s4, v15 +; GCN-NEXT: v_readfirstlane_b32 s5, v16 +; GCN-NEXT: v_readfirstlane_b32 s6, v17 +; GCN-NEXT: v_readfirstlane_b32 s7, v18 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16] +; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18] +; GCN-NEXT: s_nop 2 +; GCN-NEXT: image_bvh64_intersect_ray v[19:22], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7] +; GCN-NEXT: s_and_b32 s0, s0, vcc_lo +; GCN-NEXT: s_and_saveexec_b32 s0, s0 +; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GCN-NEXT: s_cbranch_execnz BB6_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b32 exec_lo, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v19 +; GCN-NEXT: v_mov_b32_e32 v1, v20 +; GCN-NEXT: v_mov_b32_e32 v2, v21 +; GCN-NEXT: v_mov_b32_e32 v3, v22 +; GCN-NEXT: ; return to shader part epilog + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %r = bitcast <4 x i32> %v to <4 x float> + ret <4 x float> %r +} + +define 
amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) { +; GCN-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s1, exec_lo +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s4, v11 +; GCN-NEXT: v_readfirstlane_b32 s5, v12 +; GCN-NEXT: v_readfirstlane_b32 s6, v13 +; GCN-NEXT: v_readfirstlane_b32 s7, v14 +; GCN-NEXT: v_pack_b32_f16 v6, v8, v9 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v15, v10, v9, 16 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] +; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: image_bvh64_intersect_ray v[15:18], [v0, v1, v2, v3, v4, v5, v7, v6, v15], s[4:7] a16 +; GCN-NEXT: s_and_b32 s0, s0, vcc_lo +; GCN-NEXT: s_and_saveexec_b32 s0, s0 +; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GCN-NEXT: s_cbranch_execnz BB7_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b32 exec_lo, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v15 +; GCN-NEXT: v_mov_b32_e32 v1, v16 +; GCN-NEXT: v_mov_b32_e32 v2, v17 +; GCN-NEXT: v_mov_b32_e32 v3, v18 +; GCN-NEXT: ; return to shader part epilog + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %r = bitcast <4 x i32> %v to <4 x float> + ret <4 x float> %r +}