diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -230,14 +230,14 @@
 //===----------------------------------------------------------------------===//
 // Raytracing builtins.
-// By default the 1st argument is i32 and the 4/5-th arguments are float4.
+// By default the 1st argument is i32 and the 4/5-th arguments are float3.
 // Postfix l indicates the 1st argument is i64.
-// Postfix h indicates the 4/5-th arguments are half4.
+// Postfix h indicates the 4/5-th arguments are half3.
 //===----------------------------------------------------------------------===//
-TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray, "V4UiUifV4fV4fV4fV4Ui", "nc", "gfx10-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_h, "V4UiUifV4fV4hV4hV4Ui", "nc", "gfx10-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_l, "V4UiWUifV4fV4fV4fV4Ui", "nc", "gfx10-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_lh, "V4UiWUifV4fV4hV4hV4Ui", "nc", "gfx10-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray, "V4UiUifV3fV3fV3fV4Ui", "nc", "gfx10-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_h, "V4UiUifV3fV3hV3hV4Ui", "nc", "gfx10-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_l, "V4UiWUifV3fV3fV3fV4Ui", "nc", "gfx10-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_lh, "V4UiWUifV3fV3hV3hV4Ui", "nc", "gfx10-insts")
 
 //===----------------------------------------------------------------------===//
 // Special builtins.
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -417,6 +417,12 @@
     Builder.defineMacro("FP_FAST_FMA");
 
   Builder.defineMacro("__AMDGCN_WAVEFRONT_SIZE", Twine(WavefrontSize));
+
+  // Temporary macro to help clients transition to the current definition of
+  // __builtin_amdgcn_image_bvh_intersect_ray* from the previous one which used
+  // vec4 arguments.
+  // TODO: Remove this in LLVM 15.
+  Builder.defineMacro("__amdgcn_bvh_use_vec3__");
 }
 
 void AMDGPUTargetInfo::setAuxTarget(const TargetInfo *Aux) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
@@ -8,51 +8,50 @@
 // The clang builtin functions __builtin_amdgcn_image_bvh_intersect_ray* use
 // postfixes to indicate the types of the 1st, 4th, and 5th arguments.
-// By default, the 1st argument is i32, the 4/5-th arguments are float4.
+// By default, the 1st argument is i32, the 4/5-th arguments are float3.
 // Postfix l indicates the 1st argument is i64 and postfix h indicates
-// the 4/5-th arguments are half4.
+// the 4/5-th arguments are half3.
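The hunks above add a temporary __amdgcn_bvh_use_vec3__ feature macro alongside the vec4-to-vec3 signature change. As a minimal sketch of the intended client-side transition (the wrapper name, the local typedefs, and the .xyz narrowing are illustrative assumptions, not code from this patch), an OpenCL caller can keep a single code path for both builtin versions:

  typedef float sketch_float4 __attribute__((ext_vector_type(4)));
  typedef unsigned int sketch_uint4 __attribute__((ext_vector_type(4)));

  // Hypothetical wrapper: accepts the legacy float4 ray vectors and dispatches
  // on the transition macro defined by the AMDGPU target above.
  sketch_uint4 trace_ray(unsigned int node_ptr, float ray_extent,
                         sketch_float4 origin, sketch_float4 dir,
                         sketch_float4 inv_dir, sketch_uint4 texture_descr) {
  #ifdef __amdgcn_bvh_use_vec3__
    // Current builtin: the ray vectors are float3, so drop the unused 4th lane.
    return __builtin_amdgcn_image_bvh_intersect_ray(
        node_ptr, ray_extent, origin.xyz, dir.xyz, inv_dir.xyz, texture_descr);
  #else
    // Previous builtin: the ray vectors are float4 and are passed unchanged.
    return __builtin_amdgcn_image_bvh_intersect_ray(
        node_ptr, ray_extent, origin, dir, inv_dir, texture_descr);
  #endif
  }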
 typedef unsigned int uint;
 typedef unsigned long ulong;
-typedef float float4 __attribute__((ext_vector_type(4)));
-typedef double double4 __attribute__((ext_vector_type(4)));
-typedef half half4 __attribute__((ext_vector_type(4)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef half half3 __attribute__((ext_vector_type(3)));
 typedef uint uint4 __attribute__((ext_vector_type(4)));
 
-// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32
+// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32
 // ISA: image_bvh_intersect_ray
 void test_image_bvh_intersect_ray(global uint4* out, uint node_ptr,
-  float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir,
+  float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir,
   uint4 texture_descr)
 {
   *out = __builtin_amdgcn_image_bvh_intersect_ray(node_ptr, ray_extent,
            ray_origin, ray_dir, ray_inv_dir, texture_descr);
 }
 
-// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16
+// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f16
 // ISA: image_bvh_intersect_ray
 void test_image_bvh_intersect_ray_h(global uint4* out, uint node_ptr,
-  float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir,
+  float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir,
   uint4 texture_descr)
 {
   *out = __builtin_amdgcn_image_bvh_intersect_ray_h(node_ptr, ray_extent,
            ray_origin, ray_dir, ray_inv_dir, texture_descr);
 }
 
-// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32
+// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v3f32
 // ISA: image_bvh_intersect_ray
 void test_image_bvh_intersect_ray_l(global uint4* out, ulong node_ptr,
-  float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir,
+  float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir,
   uint4 texture_descr)
 {
   *out = __builtin_amdgcn_image_bvh_intersect_ray_l(node_ptr, ray_extent,
            ray_origin, ray_dir, ray_inv_dir, texture_descr);
 }
 
-// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16
+// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v3f16
 // ISA: image_bvh_intersect_ray
 void test_image_bvh_intersect_ray_lh(global uint4* out, ulong node_ptr,
-  float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir,
+  float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir,
   uint4 texture_descr)
 {
   *out = __builtin_amdgcn_image_bvh_intersect_ray_lh(node_ptr, ray_extent,
diff --git a/clang/test/Preprocessor/predefined-macros.c b/clang/test/Preprocessor/predefined-macros.c
--- a/clang/test/Preprocessor/predefined-macros.c
+++ b/clang/test/Preprocessor/predefined-macros.c
@@ -230,6 +230,7 @@
 // CHECK-HIP: #define __HIPCC__ 1
 // CHECK-HIP-NOT: #define __HIP_DEVICE_COMPILE__ 1
 // CHECK-HIP: #define __HIP__ 1
+// CHECK-HIP: #define __amdgcn_bvh_use_vec3__ 1
 
 // RUN: %clang_cc1 %s -E -dM -o - -x hip -triple amdgcn-amd-amdhsa \
 // RUN:   -fcuda-is-device \
@@ -238,3 +239,4 @@
 // CHECK-HIP-DEV: #define __HIPCC__ 1
 // CHECK-HIP-DEV: #define __HIP_DEVICE_COMPILE__ 1
 // CHECK-HIP-DEV: #define __HIP__ 1
+// CHECK-HIP-DEV: #define __amdgcn_bvh_use_vec3__ 1
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1789,9 +1789,11 @@
 // uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
 //                                           <ray_dir>, <ray_inv_dir>, <texture_descr>
+// <node_ptr> is i32 or i64.
+// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32.
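For reference, the overload set that the intrinsic definition below produces can be written out as LLVM IR declarations; the v3 mangled suffixes match the CHECK lines in the OpenCL test above. These declarations are only an illustrative sketch, not lines from the patch:

  declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32(i32, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
  declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f16(i32, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
  declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v3f32(i64, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
  declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v3f16(i64, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)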
 def int_amdgcn_image_bvh_intersect_ray : Intrinsic<[llvm_v4i32_ty],
-  [llvm_anyint_ty, llvm_float_ty, llvm_v4f32_ty, llvm_anyvector_ty,
+  [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty,
   LLVMMatchType<1>, llvm_v4i32_ty],
   [IntrReadMem, IntrWillReturn]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4869,8 +4869,8 @@
   }
   Ops.push_back(RayExtent);
 
-  auto packLanes = [&Ops, &S32, &B] (Register Src) {
-    auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
+  auto packLanes = [&Ops, &S32, &B](Register Src) {
+    auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
     Ops.push_back(Unmerge.getReg(0));
     Ops.push_back(Unmerge.getReg(1));
     Ops.push_back(Unmerge.getReg(2));
@@ -4878,8 +4878,8 @@
   packLanes(RayOrigin);
 
   if (IsA16) {
-    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
-    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
+    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
+    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
     Register R1 = MRI.createGenericVirtualRegister(S32);
     Register R2 = MRI.createGenericVirtualRegister(S32);
     Register R3 = MRI.createGenericVirtualRegister(S32);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7502,8 +7502,8 @@
 
   assert(NodePtr.getValueType() == MVT::i32 ||
          NodePtr.getValueType() == MVT::i64);
-  assert(RayDir.getValueType() == MVT::v4f16 ||
-         RayDir.getValueType() == MVT::v4f32);
+  assert(RayDir.getValueType() == MVT::v3f16 ||
+         RayDir.getValueType() == MVT::v3f32);
 
   if (!Subtarget->hasGFX10_AEncoding()) {
     emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -3,37 +3,25 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1013 %s
 ; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s
 
-; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
-; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
-; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
-; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float
ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr) +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr) -declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>) -declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>) -declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>) -declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>) declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { -; GFX1030-LABEL: image_bvh_intersect_ray: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3] -; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: ; return to shader part epilog -; -; GFX1013-LABEL: image_bvh_intersect_ray: -; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v5, v6 -; GFX1013-NEXT: v_mov_b32_e32 v6, v7 -; GFX1013-NEXT: v_mov_b32_e32 v7, v8 -; GFX1013-NEXT: v_mov_b32_e32 v8, v10 -; GFX1013-NEXT: v_mov_b32_e32 v9, v11 -; GFX1013-NEXT: v_mov_b32_e32 v10, v12 -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] -; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: ; return to shader part epilog +define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { +; GCN-LABEL: image_bvh_intersect_ray: +; GCN: ; %bb.0: +; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog ; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } @@ -44,60 +32,48 @@ ; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog - %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0 - %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1 - %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2 - %ray_dir0 = insertelement <4 x float> 
undef, float %ray_dir_x, i32 0 - %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1 - %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2 - %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0 - %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1 - %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2 - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0 + %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1 + %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } -define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { +define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh_intersect_ray_a16: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s4, 0xffff -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GCN-NEXT: v_and_b32_e32 v10, s4, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_and_b32_e32 v9, s4, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GCN-NEXT: v_and_b32_e32 v10, s4, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_b32_e32 v8, s4, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_or_b32 v5, v6, s4, v5 -; GCN-NEXT: v_and_or_b32 v6, v7, s4, v10 -; GCN-NEXT: v_lshl_or_b32 v7, v9, 16, v8 +; GCN-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GCN-NEXT: v_and_or_b32 v5, v5, s4, v9 +; GCN-NEXT: v_and_or_b32 v6, v6, s4, v10 ; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } -define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { -; 
GFX1030-LABEL: image_bvh64_intersect_ray: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3] -; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: ; return to shader part epilog -; -; GFX1013-LABEL: image_bvh64_intersect_ray: -; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v6, v7 -; GFX1013-NEXT: v_mov_b32_e32 v7, v8 -; GFX1013-NEXT: v_mov_b32_e32 v8, v9 -; GFX1013-NEXT: v_mov_b32_e32 v9, v11 -; GFX1013-NEXT: v_mov_b32_e32 v10, v12 -; GFX1013-NEXT: v_mov_b32_e32 v11, v13 -; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] -; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: ; return to shader part epilog - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) +define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { +; GCN-LABEL: image_bvh64_intersect_ray: +; GCN: ; %bb.0: +; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } @@ -109,67 +85,70 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64 - %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0 - %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1 - %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2 - %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0 - %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1 - %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2 - %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0 - %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1 - %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2 - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0 + %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1 + %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x 
float> ret <4 x float> %r } -define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { +define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh64_intersect_ray_a16: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s4, 0xffff -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v11, s4, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_and_b32_e32 v10, s4, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GCN-NEXT: v_and_b32_e32 v11, s4, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_and_b32_e32 v9, s4, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_and_or_b32 v6, v7, s4, v6 -; GCN-NEXT: v_and_or_b32 v7, v8, s4, v11 -; GCN-NEXT: v_lshl_or_b32 v8, v10, 16, v9 +; GCN-NEXT: v_lshl_or_b32 v8, v9, 16, v8 +; GCN-NEXT: v_and_or_b32 v6, v6, s4, v10 +; GCN-NEXT: v_and_or_b32 v7, v7, s4, v11 ; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } -define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) { +define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: v_mov_b32_e32 v18, v0 -; GFX1030-NEXT: v_mov_b32_e32 v19, v1 -; GFX1030-NEXT: v_mov_b32_e32 v20, v2 -; GFX1030-NEXT: v_mov_b32_e32 v21, v3 -; GFX1030-NEXT: v_mov_b32_e32 v22, v4 -; GFX1030-NEXT: v_mov_b32_e32 v23, v6 -; GFX1030-NEXT: v_mov_b32_e32 v24, v7 -; GFX1030-NEXT: v_mov_b32_e32 v25, v8 -; GFX1030-NEXT: v_mov_b32_e32 v26, v10 -; GFX1030-NEXT: v_mov_b32_e32 v27, v11 -; GFX1030-NEXT: v_mov_b32_e32 v28, v12 +; GFX1030-NEXT: v_mov_b32_e32 v15, v0 +; GFX1030-NEXT: v_mov_b32_e32 v16, v1 +; GFX1030-NEXT: v_mov_b32_e32 v17, v2 +; GFX1030-NEXT: v_mov_b32_e32 v18, v3 +; GFX1030-NEXT: v_mov_b32_e32 v19, v4 +; GFX1030-NEXT: v_mov_b32_e32 v20, v5 +; GFX1030-NEXT: v_mov_b32_e32 v21, v6 +; GFX1030-NEXT: v_mov_b32_e32 v22, v7 +; GFX1030-NEXT: v_mov_b32_e32 v23, v8 +; GFX1030-NEXT: v_mov_b32_e32 v24, v9 +; GFX1030-NEXT: v_mov_b32_e32 v25, v10 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v14 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v15 -; GFX1030-NEXT: v_readfirstlane_b32 s6, v16 -; GFX1030-NEXT: v_readfirstlane_b32 s7, v17 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15] -; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17] +; GFX1030-NEXT: v_readfirstlane_b32 
s4, v11 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1030-NEXT: v_readfirstlane_b32 s6, v13 +; GFX1030-NEXT: v_readfirstlane_b32 s7, v14 +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] +; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] ; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[18:33], s[4:7] -; GFX1030-NEXT: ; implicit-def: $vgpr14_vgpr15 +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[15:30], s[4:7] +; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1030-NEXT: ; implicit-def: $vgpr15 +; GFX1030-NEXT: ; implicit-def: $vgpr16 +; GFX1030-NEXT: ; implicit-def: $vgpr17 ; GFX1030-NEXT: ; implicit-def: $vgpr18 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 @@ -178,10 +157,7 @@ ; GFX1030-NEXT: ; implicit-def: $vgpr23 ; GFX1030-NEXT: ; implicit-def: $vgpr24 ; GFX1030-NEXT: ; implicit-def: $vgpr25 -; GFX1030-NEXT: ; implicit-def: $vgpr26 -; GFX1030-NEXT: ; implicit-def: $vgpr27 -; GFX1030-NEXT: ; implicit-def: $vgpr28 -; GFX1030-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17 +; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB6_1 ; GFX1030-NEXT: ; %bb.2: @@ -191,28 +167,24 @@ ; ; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v5, v6 -; GFX1013-NEXT: v_mov_b32_e32 v6, v7 -; GFX1013-NEXT: v_mov_b32_e32 v7, v8 -; GFX1013-NEXT: v_mov_b32_e32 v8, v10 -; GFX1013-NEXT: v_mov_b32_e32 v9, v11 -; GFX1013-NEXT: v_mov_b32_e32 v10, v12 -; GFX1013-NEXT: v_mov_b32_e32 v18, v14 -; GFX1013-NEXT: v_mov_b32_e32 v19, v15 +; GFX1013-NEXT: v_mov_b32_e32 v16, v11 +; GFX1013-NEXT: v_mov_b32_e32 v17, v12 +; GFX1013-NEXT: v_mov_b32_e32 v18, v13 +; GFX1013-NEXT: v_mov_b32_e32 v19, v14 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: v_readfirstlane_b32 s4, v18 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v19 -; GFX1013-NEXT: v_readfirstlane_b32 s6, v16 -; GFX1013-NEXT: v_readfirstlane_b32 s7, v17 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19] -; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17] +; GFX1013-NEXT: v_readfirstlane_b32 s4, v16 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v17 +; GFX1013-NEXT: v_readfirstlane_b32 s6, v18 +; GFX1013-NEXT: v_readfirstlane_b32 s7, v19 +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17] +; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[18:19] ; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh_intersect_ray v[20:23], v[0:15], s[4:7] +; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX1013-NEXT: ; implicit-def: $vgpr18_vgpr19 -; GFX1013-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17 +; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB6_1 @@ -224,41 +196,42 @@ ; GFX1013-NEXT: v_mov_b32_e32 v2, v22 ; GFX1013-NEXT: v_mov_b32_e32 v3, v23 ; GFX1013-NEXT: ; return to shader part epilog - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %v = call <4 
x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } -define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) { +define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_mov_b32 s0, 0xffff -; GFX1030-NEXT: v_mov_b32_e32 v14, v0 -; GFX1030-NEXT: v_mov_b32_e32 v15, v1 -; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX1030-NEXT: v_and_b32_e32 v1, s0, v8 -; GFX1030-NEXT: v_mov_b32_e32 v16, v2 -; GFX1030-NEXT: v_mov_b32_e32 v17, v3 -; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX1030-NEXT: v_mov_b32_e32 v13, v0 +; GFX1030-NEXT: v_mov_b32_e32 v14, v1 +; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX1030-NEXT: v_and_b32_e32 v1, s0, v7 +; GFX1030-NEXT: v_mov_b32_e32 v15, v2 +; GFX1030-NEXT: v_mov_b32_e32 v16, v3 +; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v7 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_and_b32_e32 v3, s0, v9 -; GFX1030-NEXT: v_mov_b32_e32 v18, v4 +; GFX1030-NEXT: v_and_b32_e32 v3, s0, v8 +; GFX1030-NEXT: v_mov_b32_e32 v17, v4 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo -; GFX1030-NEXT: v_and_or_b32 v19, v6, s0, v0 -; GFX1030-NEXT: v_and_or_b32 v20, v7, s0, v1 -; GFX1030-NEXT: v_lshl_or_b32 v21, v3, 16, v2 +; GFX1030-NEXT: v_and_or_b32 v18, v5, s0, v0 +; GFX1030-NEXT: v_and_or_b32 v19, v6, s0, v1 +; GFX1030-NEXT: v_lshl_or_b32 v20, v3, 16, v2 ; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v10 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v11 -; GFX1030-NEXT: v_readfirstlane_b32 s6, v12 -; GFX1030-NEXT: v_readfirstlane_b32 s7, v13 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] -; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] +; GFX1030-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1030-NEXT: v_readfirstlane_b32 s6, v11 +; GFX1030-NEXT: v_readfirstlane_b32 s7, v12 +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] ; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[14:21], s[4:7] a16 -; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[13:20], s[4:7] a16 +; GFX1030-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1030-NEXT: ; implicit-def: $vgpr13 ; GFX1030-NEXT: ; implicit-def: $vgpr14 ; GFX1030-NEXT: ; implicit-def: $vgpr15 ; GFX1030-NEXT: ; implicit-def: $vgpr16 @@ -266,8 +239,7 @@ ; GFX1030-NEXT: ; implicit-def: $vgpr18 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 -; GFX1030-NEXT: ; implicit-def: $vgpr21 -; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 +; GFX1030-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1030-NEXT: ; %bb.2: @@ -278,72 +250,75 @@ ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: 
s_mov_b32 s0, 0xffff -; GFX1013-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX1013-NEXT: v_and_b32_e32 v14, s0, v8 -; GFX1013-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX1013-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1013-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX1013-NEXT: v_and_b32_e32 v14, s0, v7 +; GFX1013-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX1013-NEXT: v_and_b32_e32 v8, s0, v8 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo -; GFX1013-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1013-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX1013-NEXT: v_and_or_b32 v5, v6, s0, v5 -; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v14 -; GFX1013-NEXT: v_lshl_or_b32 v7, v9, 16, v8 +; GFX1013-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GFX1013-NEXT: v_and_or_b32 v5, v5, s0, v13 +; GFX1013-NEXT: v_and_or_b32 v6, v6, s0, v14 ; GFX1013-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: v_readfirstlane_b32 s4, v10 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v11 -; GFX1013-NEXT: v_readfirstlane_b32 s6, v12 -; GFX1013-NEXT: v_readfirstlane_b32 s7, v13 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] -; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] +; GFX1013-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1013-NEXT: v_readfirstlane_b32 s6, v11 +; GFX1013-NEXT: v_readfirstlane_b32 s7, v12 +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] ; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1013-NEXT: image_bvh_intersect_ray v[14:17], v[0:7], s[4:7] a16 -; GFX1013-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1013-NEXT: image_bvh_intersect_ray v[13:16], v[0:7], s[4:7] a16 +; GFX1013-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GFX1013-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 +; GFX1013-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1013-NEXT: ; %bb.2: ; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, v14 -; GFX1013-NEXT: v_mov_b32_e32 v1, v15 -; GFX1013-NEXT: v_mov_b32_e32 v2, v16 -; GFX1013-NEXT: v_mov_b32_e32 v3, v17 +; GFX1013-NEXT: v_mov_b32_e32 v0, v13 +; GFX1013-NEXT: v_mov_b32_e32 v1, v14 +; GFX1013-NEXT: v_mov_b32_e32 v2, v15 +; GFX1013-NEXT: v_mov_b32_e32 v3, v16 ; GFX1013-NEXT: ; return to shader part epilog - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } -define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) { +define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: v_mov_b32_e32 v19, v0 -; GFX1030-NEXT: 
v_mov_b32_e32 v20, v1 -; GFX1030-NEXT: v_mov_b32_e32 v21, v2 -; GFX1030-NEXT: v_mov_b32_e32 v22, v3 -; GFX1030-NEXT: v_mov_b32_e32 v23, v4 -; GFX1030-NEXT: v_mov_b32_e32 v24, v5 -; GFX1030-NEXT: v_mov_b32_e32 v25, v7 -; GFX1030-NEXT: v_mov_b32_e32 v26, v8 -; GFX1030-NEXT: v_mov_b32_e32 v27, v9 -; GFX1030-NEXT: v_mov_b32_e32 v28, v11 -; GFX1030-NEXT: v_mov_b32_e32 v29, v12 -; GFX1030-NEXT: v_mov_b32_e32 v30, v13 +; GFX1030-NEXT: v_mov_b32_e32 v16, v0 +; GFX1030-NEXT: v_mov_b32_e32 v17, v1 +; GFX1030-NEXT: v_mov_b32_e32 v18, v2 +; GFX1030-NEXT: v_mov_b32_e32 v19, v3 +; GFX1030-NEXT: v_mov_b32_e32 v20, v4 +; GFX1030-NEXT: v_mov_b32_e32 v21, v5 +; GFX1030-NEXT: v_mov_b32_e32 v22, v6 +; GFX1030-NEXT: v_mov_b32_e32 v23, v7 +; GFX1030-NEXT: v_mov_b32_e32 v24, v8 +; GFX1030-NEXT: v_mov_b32_e32 v25, v9 +; GFX1030-NEXT: v_mov_b32_e32 v26, v10 +; GFX1030-NEXT: v_mov_b32_e32 v27, v11 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v15 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v16 -; GFX1030-NEXT: v_readfirstlane_b32 s6, v17 -; GFX1030-NEXT: v_readfirstlane_b32 s7, v18 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16] -; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18] +; GFX1030-NEXT: v_readfirstlane_b32 s4, v12 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v13 +; GFX1030-NEXT: v_readfirstlane_b32 s6, v14 +; GFX1030-NEXT: v_readfirstlane_b32 s7, v15 +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13] +; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15] ; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[19:34], s[4:7] -; GFX1030-NEXT: ; implicit-def: $vgpr15_vgpr16 +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[16:31], s[4:7] +; GFX1030-NEXT: ; implicit-def: $vgpr12_vgpr13 +; GFX1030-NEXT: ; implicit-def: $vgpr16 +; GFX1030-NEXT: ; implicit-def: $vgpr17 +; GFX1030-NEXT: ; implicit-def: $vgpr18 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 ; GFX1030-NEXT: ; implicit-def: $vgpr21 @@ -353,10 +328,7 @@ ; GFX1030-NEXT: ; implicit-def: $vgpr25 ; GFX1030-NEXT: ; implicit-def: $vgpr26 ; GFX1030-NEXT: ; implicit-def: $vgpr27 -; GFX1030-NEXT: ; implicit-def: $vgpr28 -; GFX1030-NEXT: ; implicit-def: $vgpr29 -; GFX1030-NEXT: ; implicit-def: $vgpr30 -; GFX1030-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18 +; GFX1030-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB8_1 ; GFX1030-NEXT: ; %bb.2: @@ -366,75 +338,72 @@ ; ; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v6, v7 -; GFX1013-NEXT: v_mov_b32_e32 v7, v8 -; GFX1013-NEXT: v_mov_b32_e32 v8, v9 -; GFX1013-NEXT: v_mov_b32_e32 v9, v11 -; GFX1013-NEXT: v_mov_b32_e32 v10, v12 -; GFX1013-NEXT: v_mov_b32_e32 v11, v13 +; GFX1013-NEXT: v_mov_b32_e32 v16, v12 +; GFX1013-NEXT: v_mov_b32_e32 v17, v13 +; GFX1013-NEXT: v_mov_b32_e32 v18, v14 ; GFX1013-NEXT: v_mov_b32_e32 v19, v15 -; GFX1013-NEXT: v_mov_b32_e32 v20, v16 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: v_readfirstlane_b32 s4, v19 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v20 -; GFX1013-NEXT: v_readfirstlane_b32 s6, v17 -; GFX1013-NEXT: v_readfirstlane_b32 s7, v18 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[19:20] -; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18] 
+; GFX1013-NEXT: v_readfirstlane_b32 s4, v16 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v17 +; GFX1013-NEXT: v_readfirstlane_b32 s6, v18 +; GFX1013-NEXT: v_readfirstlane_b32 s7, v19 +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17] +; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[18:19] ; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1013-NEXT: image_bvh64_intersect_ray v[21:24], v[0:15], s[4:7] +; GFX1013-NEXT: image_bvh64_intersect_ray v[20:23], v[0:15], s[4:7] +; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX1013-NEXT: ; implicit-def: $vgpr19_vgpr20 -; GFX1013-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18 +; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB8_1 ; GFX1013-NEXT: ; %bb.2: ; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, v21 -; GFX1013-NEXT: v_mov_b32_e32 v1, v22 -; GFX1013-NEXT: v_mov_b32_e32 v2, v23 -; GFX1013-NEXT: v_mov_b32_e32 v3, v24 +; GFX1013-NEXT: v_mov_b32_e32 v0, v20 +; GFX1013-NEXT: v_mov_b32_e32 v1, v21 +; GFX1013-NEXT: v_mov_b32_e32 v2, v22 +; GFX1013-NEXT: v_mov_b32_e32 v3, v23 ; GFX1013-NEXT: ; return to shader part epilog - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } -define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) { +define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_mov_b32 s0, 0xffff -; GFX1030-NEXT: v_mov_b32_e32 v15, v0 -; GFX1030-NEXT: v_mov_b32_e32 v16, v1 -; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX1030-NEXT: v_and_b32_e32 v1, s0, v9 -; GFX1030-NEXT: v_mov_b32_e32 v17, v2 -; GFX1030-NEXT: v_mov_b32_e32 v18, v3 -; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GFX1030-NEXT: v_mov_b32_e32 v14, v0 +; GFX1030-NEXT: v_mov_b32_e32 v15, v1 +; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX1030-NEXT: v_and_b32_e32 v1, s0, v8 +; GFX1030-NEXT: v_mov_b32_e32 v16, v2 +; GFX1030-NEXT: v_mov_b32_e32 v17, v3 +; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_and_b32_e32 v3, s0, v10 -; GFX1030-NEXT: v_mov_b32_e32 v19, v4 -; GFX1030-NEXT: v_mov_b32_e32 v20, v5 -; GFX1030-NEXT: v_and_or_b32 v21, v7, s0, v0 -; GFX1030-NEXT: v_and_or_b32 v22, v8, s0, v1 -; GFX1030-NEXT: v_lshl_or_b32 v23, v3, 16, v2 +; GFX1030-NEXT: v_and_b32_e32 v3, s0, v9 +; GFX1030-NEXT: v_mov_b32_e32 v18, v4 +; GFX1030-NEXT: v_mov_b32_e32 v19, v5 +; GFX1030-NEXT: v_and_or_b32 v20, v6, s0, v0 +; GFX1030-NEXT: v_and_or_b32 v21, v7, 
s0, v1 +; GFX1030-NEXT: v_lshl_or_b32 v22, v3, 16, v2 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v11 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v12 -; GFX1030-NEXT: v_readfirstlane_b32 s6, v13 -; GFX1030-NEXT: v_readfirstlane_b32 s7, v14 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] -; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] +; GFX1030-NEXT: v_readfirstlane_b32 s4, v10 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v11 +; GFX1030-NEXT: v_readfirstlane_b32 s6, v12 +; GFX1030-NEXT: v_readfirstlane_b32 s7, v13 +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] +; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] ; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[15:30], s[4:7] a16 -; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[14:29], s[4:7] a16 +; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1030-NEXT: ; implicit-def: $vgpr14 ; GFX1030-NEXT: ; implicit-def: $vgpr15 ; GFX1030-NEXT: ; implicit-def: $vgpr16 ; GFX1030-NEXT: ; implicit-def: $vgpr17 @@ -443,8 +412,7 @@ ; GFX1030-NEXT: ; implicit-def: $vgpr20 ; GFX1030-NEXT: ; implicit-def: $vgpr21 ; GFX1030-NEXT: ; implicit-def: $vgpr22 -; GFX1030-NEXT: ; implicit-def: $vgpr23 -; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 +; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB9_1 ; GFX1030-NEXT: ; %bb.2: @@ -455,20 +423,20 @@ ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_mov_b32 s0, 0xffff -; GFX1013-NEXT: v_mov_b32_e32 v16, v11 -; GFX1013-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX1013-NEXT: v_and_b32_e32 v11, s0, v9 -; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX1013-NEXT: v_and_b32_e32 v10, s0, v10 -; GFX1013-NEXT: v_mov_b32_e32 v17, v12 -; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX1013-NEXT: v_mov_b32_e32 v16, v10 +; GFX1013-NEXT: v_mov_b32_e32 v17, v11 +; GFX1013-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GFX1013-NEXT: v_and_b32_e32 v11, s0, v8 +; GFX1013-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX1013-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1013-NEXT: v_mov_b32_e32 v18, v12 +; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX1013-NEXT: v_mov_b32_e32 v18, v13 -; GFX1013-NEXT: v_mov_b32_e32 v19, v14 +; GFX1013-NEXT: v_mov_b32_e32 v19, v13 +; GFX1013-NEXT: v_lshl_or_b32 v8, v9, 16, v8 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo -; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v6 -; GFX1013-NEXT: v_and_or_b32 v7, v8, s0, v11 -; GFX1013-NEXT: v_lshl_or_b32 v8, v10, 16, v9 +; GFX1013-NEXT: v_and_or_b32 v6, v6, s0, v10 +; GFX1013-NEXT: v_and_or_b32 v7, v7, s0, v11 ; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v16 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v17 @@ -493,7 +461,7 @@ ; GFX1013-NEXT: v_mov_b32_e32 v2, v22 ; GFX1013-NEXT: v_mov_b32_e32 v3, v23 ; GFX1013-NEXT: ; return to shader part epilog - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x 
half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } @@ -567,16 +535,16 @@ %node_ptr = load i32, i32* %gep_node_ptr, align 4 %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid %ray_extent = load float, float* %gep_ray, align 4 - %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0 - %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1 - %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2 - %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0 - %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1 - %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2 - %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0 - %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1 - %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2 - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2 + %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2 + %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0 + %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1 + %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) store <4 x i32> %v, <4 x i32>* undef ret void } @@ -680,16 +648,16 @@ %node_ptr = load i32, i32* %gep_node_ptr, align 4 %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid %ray_extent = load float, float* %gep_ray, align 4 - %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0 - %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1 - %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2 - %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0 - %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1 - %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2 - %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0 - %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1 - %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2 - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2 + %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0 + %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1 + %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2 + %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0 + %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1 + %ray_inv_dir 
= insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) store <4 x i32> %v, <4 x i32>* undef ret void } @@ -755,16 +723,16 @@ %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid %ray_extent = load float, float* %gep_ray, align 4 - %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0 - %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1 - %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2 - %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0 - %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1 - %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2 - %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0 - %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1 - %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2 - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2 + %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2 + %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0 + %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1 + %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) store <4 x i32> %v, <4 x i32>* undef ret void } @@ -860,16 +828,16 @@ %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid %ray_extent = load float, float* %gep_ray, align 4 - %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0 - %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1 - %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2 - %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0 - %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1 - %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2 - %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0 - %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1 - %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2 - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2 + %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0 + %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1 + %ray_dir 
= insertelement <3 x half> %ray_dir1, half 5.0, i32 2 + %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0 + %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1 + %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) store <4 x i32> %v, <4 x i32>* undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -3,15 +3,15 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s ; RUN: not --crash llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s -; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr) -; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr) -; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr) -; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr) +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr) +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr) +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr) +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr) -declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>) -declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>) -declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>) -declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>) ; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget ; Arguments are flattened to represent the actual VGPR_A layout, so we have no @@ -23,43 +23,43 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; 
return to shader part epilog
main_body:
- %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
- %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
- %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
- %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
- %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
- %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
- %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
- %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
- %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+ %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
+ %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
+ %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
+ %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
+ %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
+ %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
+ %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
+ %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
+ %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
}
-define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
+define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh_intersect_ray_a16:
; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_lshr_b32 s5, s8, 16
-; GCN-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GCN-NEXT: s_pack_ll_b32_b16 s5, s5, s9
+; GCN-NEXT: s_mov_b32 s15, s12
+; GCN-NEXT: s_mov_b32 s12, s9
+; GCN-NEXT: s_lshr_b32 s9, s7, 16
+; GCN-NEXT: s_pack_ll_b32_b16 s6, s6, s7
+; GCN-NEXT: s_pack_ll_b32_b16 s7, s9, s8
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NEXT: v_mov_b32_e32 v5, s6
-; GCN-NEXT: v_mov_b32_e32 v6, s7
-; GCN-NEXT: v_mov_b32_e32 v7, s5
-; GCN-NEXT: s_mov_b32 s15, s13
-; GCN-NEXT: s_mov_b32 s14, s12
-; GCN-NEXT: s_mov_b32 s13, s11
-; GCN-NEXT: s_mov_b32 s12, s10
+; GCN-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NEXT: v_mov_b32_e32 v6, s6
+; GCN-NEXT: v_mov_b32_e32 v7, s7
+; GCN-NEXT: s_mov_b32 s14, s11
+; GCN-NEXT: s_mov_b32 s13, s10
; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
}
@@ -74,44 +74,44 @@
; GCN-NEXT: ; return to shader part epilog
main_body:
 %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
- %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
- %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
- %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
- %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
- %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
- %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
- %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
- %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
- %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+ %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
+ %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
+ %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
+ %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
+ %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
+ %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
+ %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
+ %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
+ %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v3f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
}
-define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
+define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh64_intersect_ray_a16:
; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_lshr_b32 s6, s9, 16
-; GCN-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GCN-NEXT: s_pack_ll_b32_b16 s6, s6, s10
+; GCN-NEXT: s_mov_b32 s14, s12
+; GCN-NEXT: s_mov_b32 s12, s10
+; GCN-NEXT: s_lshr_b32 s10, s8, 16
+; GCN-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GCN-NEXT: s_pack_ll_b32_b16 s8, s10, s9
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v5, s5
-; GCN-NEXT: v_mov_b32_e32 v6, s7
-; GCN-NEXT: v_mov_b32_e32 v7, s8
-; GCN-NEXT: v_mov_b32_e32 v8, s6
-; GCN-NEXT: s_mov_b32 s15, s14
-; GCN-NEXT: s_mov_b32 s14, s13
-; GCN-NEXT: s_mov_b32 s13, s12
-; GCN-NEXT: s_mov_b32 s12, s11
+; GCN-NEXT: v_mov_b32_e32 v6, s6
+; GCN-NEXT: v_mov_b32_e32 v7, s7
+; GCN-NEXT: v_mov_b32_e32 v8, s8
+; GCN-NEXT: s_mov_b32 s15, s13
+; GCN-NEXT: s_mov_b32 s13, s11
; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[12:15] a16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v3f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
}
@@ -178,16 +178,16 @@
 %node_ptr = load i32, i32* %gep_node_ptr, align 4
 %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
 %ray_extent = load float, float* %gep_ray, align 4
- %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
- %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
- %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
- %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
- %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
- %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
- %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
- %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
- %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+ %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+ %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+ %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+ %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
+ %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
+ %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
+ %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
+ %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
+ %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
 store <4 x i32> %v, <4 x i32>* undef
 ret void
}
@@ -246,16 +246,16 @@
 %node_ptr = load i32, i32* %gep_node_ptr, align 4
 %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
 %ray_extent = load float, float* %gep_ray, align 4
- %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
- %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
- %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
- %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
- %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
- %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
- %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
- %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
- %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+ %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+ %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+ %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+ %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
+ %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
+ %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
+ %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
+ %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
+ %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
 store <4 x i32> %v, <4 x i32>* undef
 ret void
}
@@ -316,16 +316,16 @@
 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
 %ray_extent = load float, float* %gep_ray, align 4
- %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
- %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
- %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
- %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
- %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
- %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
- %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
- %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
- %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+ %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+ %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+ %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+ %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
+ %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
+ %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
+ %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
+ %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
+ %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v3f32(i64 1111111111111, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
 store <4 x i32> %v, <4 x i32>* undef
 ret void
}
@@ -380,16 +380,16 @@
 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
 %ray_extent = load float, float* %gep_ray, align 4
- %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
- %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
- %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
- %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
- %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
- %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
- %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
- %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
- %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+ %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+ %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+ %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+ %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
+ %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
+ %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
+ %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
+ %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
+ %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v3f16(i64 1111111111110, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
 store <4 x i32> %v, <4 x i32>* undef
 ret void
}