Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1698,6 +1698,14 @@
 def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>;
 
+// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
+//   <ray_dir>, <ray_inv_dir>, <texture_descr>
+def int_amdgcn_image_bvh_intersect_ray :
+  Intrinsic<[llvm_v4i32_ty],
+            [llvm_anyint_ty, llvm_float_ty, llvm_v4f32_ty, llvm_anyvector_ty,
+             LLVMMatchType<1>, llvm_v4i32_ty],
+            [IntrReadMem]>;
+
 //===----------------------------------------------------------------------===//
 // Deep learning intrinsics.
 //===----------------------------------------------------------------------===//
Index: llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1444,6 +1444,7 @@
   void cvtMIMG(MCInst &Inst, const OperandVector &Operands,
                bool IsAtomic = false);
   void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
+  void cvtIntersectRay(MCInst &Inst, const OperandVector &Operands);
 
   OperandMatchResultTy parseDim(OperandVector &Operands);
   OperandMatchResultTy parseDPP8(OperandVector &Operands);
@@ -3109,8 +3110,9 @@
   int TFEIdx   = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::tfe);
 
   assert(VDataIdx != -1);
-  assert(DMaskIdx != -1);
-  assert(TFEIdx != -1);
+
+  if (DMaskIdx == -1 || TFEIdx == -1) // intersect_ray
+    return true;
 
   unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx);
   unsigned TFESize = Inst.getOperand(TFEIdx).getImm() ? 1 : 0;
@@ -3137,6 +3139,7 @@
     return true;
 
   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
+  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
   int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
@@ -3145,9 +3148,11 @@
 
   assert(VAddr0Idx != -1);
   assert(SrsrcIdx != -1);
-  assert(DimIdx != -1);
   assert(SrsrcIdx > VAddr0Idx);
 
+  if (DimIdx == -1)
+    return true; // intersect_ray
+
   unsigned Dim = Inst.getOperand(DimIdx).getImm();
   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim);
   bool IsNSA = SrsrcIdx - VAddr0Idx > 1;
@@ -6466,6 +6471,17 @@
   cvtMIMG(Inst, Operands, true);
 }
 
+void AMDGPUAsmParser::cvtIntersectRay(MCInst &Inst,
+                                      const OperandVector &Operands) {
+  for (unsigned I = 1; I < Operands.size(); ++I) {
+    auto &Operand = (AMDGPUOperand &)*Operands[I];
+    if (Operand.isReg())
+      Operand.addRegOperands(Inst, 1);
+  }
+
+  Inst.addOperand(MCOperand::createImm(1)); // a16
+}
+
 //===----------------------------------------------------------------------===//
 // smrd
 //===----------------------------------------------------------------------===//
Index: llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -139,6 +139,8 @@
 DECODE_OPERAND_REG(VReg_64)
 DECODE_OPERAND_REG(VReg_96)
 DECODE_OPERAND_REG(VReg_128)
+DECODE_OPERAND_REG(VReg_256)
+DECODE_OPERAND_REG(VReg_512)
 
 DECODE_OPERAND_REG(SReg_32)
 DECODE_OPERAND_REG(SReg_32_XM0_XEXEC)
@@ -499,8 +501,16 @@
                                             AMDGPU::OpName::d16);
 
   assert(VDataIdx != -1);
-  assert(DMaskIdx != -1);
-  assert(TFEIdx != -1);
+  if (DMaskIdx == -1 || TFEIdx == -1) { // intersect_ray
+    if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::a16) > -1) {
+      assert(MI.getOpcode() == AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa ||
+             MI.getOpcode() == AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa ||
+             MI.getOpcode() == AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa ||
+             MI.getOpcode() == AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa);
+      addOperand(MI, MCOperand::createImm(1));
+    }
+    return MCDisassembler::Success;
+  }
 
   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
   bool IsAtomic = (VDstIdx != -1);
Index: llvm/lib/Target/AMDGPU/MIMGInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -708,6 +708,55 @@
 multiclass MIMG_Gather_WQM <bits<8> op, AMDGPUSampleVariant sample>
     : MIMG_Gather<op, sample, 1>;
 
+class MIMG_IntersectRay_gfx10<int op, string opcode, RegisterClass AddrRC, bit A16>
+    : MIMG_gfx10<op, (outs VReg_128:$vdata), "AMDGPU"> {
+
+  let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc),
+                           !if(!eq(A16,1), (ins GFX10A16:$a16), (ins)));
+  let AsmString = opcode#" $vdata, $vaddr0, $srsrc"#!if(!eq(A16,1), "$a16", "");
+
+  let nsa = 0;
+}
+
+class MIMG_IntersectRay_nsa_gfx10<int op, string opcode, int num_addrs, bit A16>
+    : MIMG_nsa_gfx10<op, (outs VReg_128:$vdata), num_addrs, "AMDGPU"> {
+  let InOperandList = !con(nsah.AddrIns,
+                           (ins SReg_128:$srsrc),
+                           !if(!eq(A16,1), (ins GFX10A16:$a16), (ins)));
+  let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc"#!if(!eq(A16,1), "$a16", "");
+}
+
+multiclass MIMG_IntersectRay<int op, string opcode, int num_addrs, bit A16> {
+  def "" : MIMGBaseOpcode;
+  let SubtargetPredicate = HasGFX10_BEncoding,
+      AssemblerPredicate = HasGFX10_BEncoding,
+      AsmMatchConverter = !if(!eq(A16,1), "cvtIntersectRay", ""),
+      dmask = 0xf,
+      unorm = 1,
+      d16 = 0,
+      glc = 0,
+      slc = 0,
+      dlc = 0,
+      tfe = 0,
+      lwe = 0,
+      r128 = 1,
+      ssamp = 0,
+      dim = {0, 0, 0},
+      a16 = A16,
+      d16 = 0,
+      BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
+      VDataDwords = 4 in {
+    // TODO: MIMGAddrSize will choose VReg_512 which is a 16 register tuple,
+    // when we only need 9, 11 or 12 depending on A16 field and ptr size.
+    def "_sa" : MIMG_IntersectRay_gfx10<op, opcode, MIMGAddrSize<num_addrs, 0>.RegClass, A16> {
+      let VAddrDwords = !srl(MIMGAddrSize<num_addrs, 0>.RegClass.Size, 5);
+    }
+    def _nsa : MIMG_IntersectRay_nsa_gfx10<op, opcode, num_addrs, A16> {
+      let VAddrDwords = num_addrs;
+    }
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // MIMG Instructions
 //===----------------------------------------------------------------------===//
@@ -832,6 +881,11 @@
 let SubtargetPredicate = HasGFX10_BEncoding in
 defm IMAGE_MSAA_LOAD : MIMG_NoSampler <0x00000080, "image_msaa_load", 1>;
 
+defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<0xe6, "image_bvh_intersect_ray", 11, 0>;
+defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<0xe6, "image_bvh_intersect_ray", 8, 1>;
+defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<0xe7, "image_bvh64_intersect_ray", 12, 0>;
+defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<0xe7, "image_bvh64_intersect_ray", 9, 1>;
+
 /********** ========================================= **********/
 /********** Table of dimension-aware image intrinsics **********/
 /********** ========================================= **********/
Index: llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
+++ llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -80,9 +80,8 @@
       MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
       MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
 
-      // Check for instructions that don't have tfe or lwe fields
-      // There shouldn't be any at this point.
-      assert( (TFE && LWE) && "Expected tfe and lwe operands in instruction");
+      if (!TFE && !LWE) // intersect_ray
+        continue;
 
       unsigned TFEVal = TFE->getImm();
       unsigned LWEVal = LWE->getImm();
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1194,6 +1194,17 @@
                    MachineMemOperand::MOVolatile;
       return true;
     }
+    case Intrinsic::amdgcn_image_bvh_intersect_ray: {
+      SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+      Info.opc = ISD::INTRINSIC_W_CHAIN;
+      Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
+      Info.ptrVal = MFI->getImagePSV(
+          *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), CI.getArgOperand(5));
+      Info.align.reset();
+      Info.flags = MachineMemOperand::MOLoad |
+                   MachineMemOperand::MODereferenceable;
+      return true;
+    }
     case Intrinsic::amdgcn_ds_gws_init:
     case Intrinsic::amdgcn_ds_gws_barrier:
     case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -7318,6 +7329,76 @@
                        DAG.getVTList(VT, MVT::Other), Ops, M->getMemOperand());
   }
+  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
+    SDLoc DL(Op);
+    MemSDNode *M = cast<MemSDNode>(Op);
+    SDValue NodePtr = M->getOperand(2);
+    SDValue RayExtent = M->getOperand(3);
+    SDValue RayOrigin = M->getOperand(4);
+    SDValue RayDir = M->getOperand(5);
+    SDValue RayInvDir = M->getOperand(6);
+    SDValue TDescr = M->getOperand(7);
+
+    assert(NodePtr.getValueType() == MVT::i32 ||
+           NodePtr.getValueType() == MVT::i64);
+    assert(RayDir.getValueType() == MVT::v4f16 ||
+           RayDir.getValueType() == MVT::v4f32);
+
+    bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
+    bool Is64 = NodePtr.getValueType() == MVT::i64;
+    unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
+                                   : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
+                            : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
+                                   : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+
+    SmallVector<SDValue, 16> Ops;
+
+    auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
+      SmallVector<SDValue, 3> Lanes;
+      DAG.ExtractVectorElements(Op, Lanes, 0, 3);
+      if (Lanes[0].getValueSizeInBits() == 32) {
+        for (unsigned I = 0; I < 3; ++I)
+          Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
+      } else {
+        if (IsAligned) {
+          Ops.push_back(
+            DAG.getBitcast(MVT::i32,
+                           DAG.getBuildVector(MVT::v2f16, DL,
+                                              { Lanes[0], Lanes[1] })));
+          Ops.push_back(Lanes[2]);
+        } else {
+          SDValue Elt0 = Ops.pop_back_val();
+          Ops.push_back(
+            DAG.getBitcast(MVT::i32,
+                           DAG.getBuildVector(MVT::v2f16, DL,
+                                              { Elt0, Lanes[0] })));
+          Ops.push_back(
+            DAG.getBitcast(MVT::i32,
+                           DAG.getBuildVector(MVT::v2f16, DL,
+                                              { Lanes[1], Lanes[2] })));
+        }
+      }
+    };
+
+    if (Is64)
+      DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, 2);
+    else
+      Ops.push_back(NodePtr);
+
+    Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
+    packLanes(RayOrigin, true);
+    packLanes(RayDir, true);
+    packLanes(RayInvDir, false);
+    Ops.push_back(TDescr);
+    if (IsA16)
+      Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1));
+    Ops.push_back(M->getChain());
+
+    auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
+    MachineMemOperand *MemRef = M->getMemOperand();
+    DAG.setNodeMemRefs(NewNode, {MemRef});
+    return SDValue(NewNode, 0);
+  }
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrID))
@@ -10963,7 +11044,8 @@
   unsigned Opcode = Node->getMachineOpcode();
 
   if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
-      !TII->isGather4(Opcode)) {
+      !TII->isGather4(Opcode) &&
+      AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) != -1) {
     return adjustWritemask(Node, DAG);
   }
Index: llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -393,6 +393,15 @@
   case AMDGPU::DS_WRITE_B64:
   case AMDGPU::DS_WRITE_B64_gfx9:
     return DS_WRITE;
+  case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa:
+  case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa:
+  case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa:
+  case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa:
+  case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa:
+  case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa:
+  case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa:
+  case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa:
+    return UNKNOWN;
   }
 }
Index: llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -272,8 +272,8 @@
   // enabled
   int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
   int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
-  unsigned TFEVal = MI.getOperand(TFEIdx).getImm();
-  unsigned LWEVal = MI.getOperand(LWEIdx).getImm();
+  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
+  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
   int ToUntie = -1;
   if (TFEVal || LWEVal) {
     // TFE/LWE is enabled so we need to deal with an implicit tied operand
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -0,0 +1,162 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
+
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
+
+; GCN-LABEL: {{^}}image_bvh_intersect_ray:
+; GCN: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]{{$}}
+; Arguments are flattened to match the actual VGPR layout, so we have no
+; extra moves in the generated kernel.
+define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
+main_body:
+  %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
+  %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
+  %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
+  %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
+  %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
+  %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
+  %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
+  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
+  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  %r = bitcast <4 x i32> %v to <4 x float>
+  ret <4 x float> %r
+}
+
+; GCN-LABEL: {{^}}image_bvh_intersect_ray_a16:
+; GCN: image_bvh_intersect_ray v[0:3], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
+define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
+main_body:
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  %r = bitcast <4 x i32> %v to <4 x float>
+  ret <4 x float> %r
+}
+
+; GCN-LABEL: {{^}}image_bvh64_intersect_ray:
+; GCN: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]{{$}}
+; Arguments are flattened to match the actual VGPR layout, so we have no
+; extra moves in the generated kernel.
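+; The 64-bit node pointer below is passed as <2 x i32> and bitcast to i64 in
+; the function body, so its two halves already sit in consecutive VGPRs.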
+define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
+main_body:
+  %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
+  %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
+  %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
+  %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
+  %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
+  %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
+  %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
+  %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
+  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
+  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  %r = bitcast <4 x i32> %v to <4 x float>
+  ret <4 x float> %r
+}
+
+; GCN-LABEL: {{^}}image_bvh64_intersect_ray_a16:
+; GCN: image_bvh64_intersect_ray v[0:3], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
+define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
+main_body:
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  %r = bitcast <4 x i32> %v to <4 x float>
+  ret <4 x float> %r
+}
+
+; TODO: NSA reassign is very limited and cannot work with VGPR tuples and subregs.
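+; The checks below therefore only require that a single intersect_ray
+; instruction is selected; the exact VGPR assignment is left to the allocator.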
+
+; GCN-LABEL: {{^}}image_bvh_intersect_ray_nsa_reassign:
+; GCN: image_bvh_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
+main_body:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
+  %node_ptr = load i32, i32* %gep_node_ptr, align 4
+  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
+  %ray_extent = load float, float* %gep_ray, align 4
+  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
+  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
+  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
+  %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
+  %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
+  %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
+  %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
+  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
+  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  store <4 x i32> %v, <4 x i32>* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_bvh_intersect_ray_a16_nsa_reassign:
+; GCN: image_bvh_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
+define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
+main_body:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
+  %node_ptr = load i32, i32* %gep_node_ptr, align 4
+  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
+  %ray_extent = load float, float* %gep_ray, align 4
+  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
+  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
+  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
+  %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
+  %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
+  %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
+  %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
+  %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
+  %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  store <4 x i32> %v, <4 x i32>* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_bvh64_intersect_ray_nsa_reassign:
+; GCN: image_bvh64_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
+define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
+main_body:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
+  %ray_extent = load float, float* %gep_ray, align 4
+  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
+  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
+  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
+  %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
+  %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
+  %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
+  %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
+  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
+  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  store <4 x i32> %v, <4 x i32>* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_bvh64_intersect_ray_a16_nsa_reassign:
+; GCN: image_bvh64_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
+define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
+main_body:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
+  %ray_extent = load float, float* %gep_ray, align 4
+  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
+  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
+  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
+  %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
+  %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
+  %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
+  %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
+  %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
+  %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  store <4 x i32> %v, <4 x i32>* undef
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
Index: llvm/test/MC/AMDGPU/gfx1011_err.s
===================================================================
--- llvm/test/MC/AMDGPU/gfx1011_err.s
+++ llvm/test/MC/AMDGPU/gfx1011_err.s
@@ -23,16 +23,16 @@
 // GFX10: error: instruction not supported on this GPU
 
 image_bvh_intersect_ray v[4:7], v[9:24], s[4:7]
-// GFX10: error: invalid instruction
+// GFX10: error: instruction not supported on this GPU
 
 image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16
-// GFX10: error: invalid instruction
+// GFX10: error: invalid operand
 
 image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7]
-// GFX10: error: invalid instruction
+// GFX10: error: instruction not supported on this GPU
 
 image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16
-// GFX10: error: invalid instruction
+// GFX10: error: invalid operand
 
 image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D
 // GFX10: error: not a valid operand.
Index: llvm/test/MC/AMDGPU/gfx1030_new.s
===================================================================
--- llvm/test/MC/AMDGPU/gfx1030_new.s
+++ llvm/test/MC/AMDGPU/gfx1030_new.s
@@ -61,6 +61,30 @@
 v_fma_legacy_f32 v0, s1, 2.0, -v3
 // GFX10: encoding: [0x00,0x00,0x40,0xd5,0x01,0xe8,0x0d,0x84]
 
+image_bvh_intersect_ray v[4:7], v[9:24], s[4:7]
+// GFX10: encoding: [0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x00]
+
+image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16
+// GFX10: encoding: [0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x40]
+
+image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7]
+// GFX10: encoding: [0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x00]
+
+image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16
+// GFX10: encoding: [0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x40]
+
+image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40], s[12:15]
+// GFX10: encoding: [0x07,0x9f,0x98,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x00,0x00]
+
+image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20], s[12:15] a16
+// GFX10: encoding: [0x05,0x9f,0x98,0xf1,0x32,0x27,0x03,0x40,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x00]
+
+image_bvh64_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40, v42], s[12:15]
+// GFX10: encoding: [0x07,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x2a,0x00]
+
+image_bvh64_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19], s[12:15] a16
+// GFX10: encoding: [0x05,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x40,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13]
+
 image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D
 // GFX10: encoding: [0x01,0x0f,0x00,0xf0,0x05,0x01,0x02,0x00]
Index: llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt
===================================================================
--- llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt
+++ llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt
@@ -52,6 +52,30 @@
 # GFX10: v_fma_legacy_f32 v0, s1, 2.0, -v3
 0x00,0x00,0x40,0xd5,0x01,0xe8,0x0d,0x84
 
+# GFX10: image_bvh_intersect_ray v[4:7], v[9:24], s[4:7]
+0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x00
+
+# GFX10: image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16
+0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x40
+
+# GFX10: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7]
+0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x00
+
+# GFX10: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16
+0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x40
+
+# GFX10: image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40], s[12:15]
+0x07,0x9f,0x98,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x00,0x00
+
+# GFX10: image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20], s[12:15] a16
+0x05,0x9f,0x98,0xf1,0x32,0x27,0x03,0x40,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x00
+
+# GFX10: image_bvh64_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40, v42], s[12:15]
+0x07,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x2a,0x00
+
+# GFX10: image_bvh64_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19], s[12:15] a16
+0x05,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x40,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13
+
 # GFX10: image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D
 0x01,0x0f,0x00,0xf0,0x05,0x01,0x02,0x00
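A minimal sketch (not part of the patch) of how a frontend could emit a call to
the new intrinsic through IRBuilder. The overload types follow the mangled
names used in the CodeGen test above: first the node_ptr integer type, then the
ray_dir/ray_inv_dir vector type. The helper name and all value arguments are
hypothetical.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Hypothetical helper: emits
//   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(...)
static Value *emitIntersectRay(Module &M, IRBuilder<> &B, Value *NodePtr,
                               Value *RayExtent, Value *RayOrigin,
                               Value *RayDir, Value *RayInvDir, Value *TDescr) {
  // Overload slot 0 is the node_ptr type (i32 or i64); slot 1 is the
  // ray_dir/ray_inv_dir type (v4f32 or v4f16). The result is always <4 x i32>.
  Function *IntersectRay = Intrinsic::getDeclaration(
      &M, Intrinsic::amdgcn_image_bvh_intersect_ray,
      {B.getInt32Ty(), FixedVectorType::get(B.getFloatTy(), 4)});
  return B.CreateCall(
      IntersectRay, {NodePtr, RayExtent, RayOrigin, RayDir, RayInvDir, TDescr});
}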