Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -143,6 +143,7 @@
   bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const;
   bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const;
   bool selectGlobalAtomicFaddIntrinsic(MachineInstr &I) const;
+  bool selectBVHIntrinsic(MachineInstr &I) const;
 
   std::pair<Register, unsigned>
   selectVOP3ModsImpl(MachineOperand &Root) const;
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1746,6 +1746,8 @@
     return selectSBarrier(I);
   case Intrinsic::amdgcn_global_atomic_fadd:
     return selectGlobalAtomicFaddIntrinsic(I);
+  case Intrinsic::amdgcn_image_bvh_intersect_ray:
+    return selectBVHIntrinsic(I);
   default: {
     return selectImpl(I, *CoverageInfo);
   }
@@ -3019,6 +3021,73 @@
   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
 }
 
+bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register NodePtr = MI.getOperand(2).getReg();
+  Register RayExtent = MI.getOperand(3).getReg();
+  Register RayOrigin = MI.getOperand(4).getReg();
+  Register RayDir = MI.getOperand(5).getReg();
+  Register RayInvDir = MI.getOperand(6).getReg();
+  Register TDescr = MI.getOperand(7).getReg();
+
+  bool IsA16 = MRI->getType(RayDir).getElementType().getSizeInBits() == 16;
+  bool Is64 = MRI->getType(NodePtr).getSizeInBits() == 64;
+  unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
+                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
+                          : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
+                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+
+  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode), DstReg);
+  if (Is64)
+    MIB.addReg(NodePtr, 0, AMDGPU::sub0)
+       .addReg(NodePtr, 0, AMDGPU::sub1);
+  else
+    MIB.addReg(NodePtr);
+
+  MIB.addReg(RayExtent);
+
+  auto packLanes = [&MIB] (Register Src) {
+    MIB.addReg(Src, 0, AMDGPU::sub0);
+    MIB.addReg(Src, 0, AMDGPU::sub1);
+    MIB.addReg(Src, 0, AMDGPU::sub2);
+  };
+
+  packLanes(RayOrigin);
+  if (IsA16) {
+    MIB.addReg(RayDir, 0, AMDGPU::sub0);
+    Register R1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    Register R2 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::V_PACK_B32_F16), R1)
+        .addImm(0)
+        .addReg(RayDir, 0, AMDGPU::sub1)
+        .addImm(0)
+        .addReg(RayInvDir, 0, AMDGPU::sub0)
+        .addImm(0)
+        .addImm(0);
+    BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), R2)
+        .addReg(RayInvDir, 0, AMDGPU::sub1)
+        .addReg(RayInvDir, 0, AMDGPU::sub0)
+        .addImm(16);
+    MIB.addReg(R1);
+    MIB.addReg(R2);
+  } else {
+    packLanes(RayDir);
+    packLanes(RayInvDir);
+  }
+
+  MIB.addReg(TDescr);
+  if (IsA16)
+    MIB.addImm(1);
+
+  MIB.cloneMemRefs(MI);
+
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
   if (I.isPHI())
     return selectPHI(I);
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4377,6 +4377,18 @@
       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
       break;
     }
+    case Intrinsic::amdgcn_image_bvh_intersect_ray: {
+      unsigned PtrSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
+      unsigned DirSize = getSizeInBits(MI.getOperand(5).getReg(), MRI, *TRI);
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
+      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
+      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
+      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DirSize);
+      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DirSize);
+      OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 128);
+      break;
+    }
     default:
       return getInvalidInstructionMapping();
     }
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
+
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
+
+define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
+; GCN-LABEL: image_bvh_intersect_ray:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3]
+; GCN-NEXT:    ; implicit-def: $vcc_hi
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  %r = bitcast <4 x i32> %v to <4 x float>
+  ret <4 x float> %r
+}
+
+define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
+; GCN-LABEL: image_bvh_intersect_ray_a16:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    v_pack_b32_f16 v5, v7, v8
+; GCN-NEXT:    v_alignbit_b32 v7, v9, v8, 16
+; GCN-NEXT:    ; implicit-def: $vcc_hi
+; GCN-NEXT:    image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v5, v7], s[0:3] a16
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+main_body:
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  %r = bitcast <4 x i32> %v to <4 x float>
+  ret <4 x float> %r
+}
+
+define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
+; GCN-LABEL: image_bvh64_intersect_ray:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3]
+; GCN-NEXT:    ; implicit-def: $vcc_hi
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+main_body:
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  %r = bitcast <4 x i32> %v to <4 x float>
+  ret <4 x float> %r
+}
+
+define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
+; GCN-LABEL: image_bvh64_intersect_ray_a16:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    v_pack_b32_f16 v6, v8, v9
+; GCN-NEXT:    v_alignbit_b32 v8, v10, v9, 16
+; GCN-NEXT:    ; implicit-def: $vcc_hi
+; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v6, v8], s[0:3] a16
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+main_body:
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  %r = bitcast <4 x i32> %v to <4 x float>
+  ret <4 x float> %r
+}