Diff 292675

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Show First 20 Lines • Show All 137 Lines • ▼ Show 20 Lines	private:
bool selectG_BRCOND(MachineInstr &I) const;		bool selectG_BRCOND(MachineInstr &I) const;
bool selectG_GLOBAL_VALUE(MachineInstr &I) const;		bool selectG_GLOBAL_VALUE(MachineInstr &I) const;
bool selectG_PTRMASK(MachineInstr &I) const;		bool selectG_PTRMASK(MachineInstr &I) const;
bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const;		bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const;
bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;		bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;
bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const;		bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const;
bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const;		bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const;
bool selectGlobalAtomicFaddIntrinsic(MachineInstr &I) const;		bool selectGlobalAtomicFaddIntrinsic(MachineInstr &I) const;
		bool selectBVHIntrinsic(MachineInstr &I) const;

std::pair<Register, unsigned>		std::pair<Register, unsigned>
selectVOP3ModsImpl(MachineOperand &Root) const;		selectVOP3ModsImpl(MachineOperand &Root) const;

InstructionSelector::ComplexRendererFns		InstructionSelector::ComplexRendererFns
selectVCSRC(MachineOperand &Root) const;		selectVCSRC(MachineOperand &Root) const;

InstructionSelector::ComplexRendererFns		InstructionSelector::ComplexRendererFns
▲ Show 20 Lines • Show All 176 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Show First 20 Lines • Show All 1,740 Lines • ▼ Show 20 Lines	bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_ds_append:		case Intrinsic::amdgcn_ds_append:
return selectDSAppendConsume(I, true);		return selectDSAppendConsume(I, true);
case Intrinsic::amdgcn_ds_consume:		case Intrinsic::amdgcn_ds_consume:
return selectDSAppendConsume(I, false);		return selectDSAppendConsume(I, false);
case Intrinsic::amdgcn_s_barrier:		case Intrinsic::amdgcn_s_barrier:
return selectSBarrier(I);		return selectSBarrier(I);
case Intrinsic::amdgcn_global_atomic_fadd:		case Intrinsic::amdgcn_global_atomic_fadd:
return selectGlobalAtomicFaddIntrinsic(I);		return selectGlobalAtomicFaddIntrinsic(I);
		case Intrinsic::amdgcn_image_bvh_intersect_ray:
		return selectBVHIntrinsic(I);
default: {		default: {
return selectImpl(I, *CoverageInfo);		return selectImpl(I, *CoverageInfo);
}		}
}		}
}		}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {		bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
if (selectImpl(I, *CoverageInfo))		if (selectImpl(I, *CoverageInfo))
▲ Show 20 Lines • Show All 1,257 Lines • ▼ Show 20 Lines	auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
.addImm(Addr.second)		.addImm(Addr.second)
.addImm(0) // SLC		.addImm(0) // SLC
.cloneMemRefs(MI);		.cloneMemRefs(MI);

MI.eraseFromParent();		MI.eraseFromParent();
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);		return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}		}

		/// Selects llvm.amdgcn.image.bvh.intersect.ray by emitting the matching
		/// IMAGE_BVH[64]_INTERSECT_RAY[_a16]_nsa instruction in front of \p MI and
		/// then erasing \p MI.
		///
		/// Operands read from \p MI: 0 = result, 2 = node pointer, 3 = ray extent,
		/// 4 = ray origin, 5 = ray direction, 6 = inverse ray direction,
		/// 7 = texture descriptor. Operand 1 is not read here (presumably the
		/// intrinsic ID - confirm against the caller).
		///
		/// The opcode is picked from the node pointer width (64 bits selects the
		/// BVH64 form) and the ray direction element width (16 bits selects the a16
		/// form). Memory operands of \p MI are cloned onto the new instruction.
		///
		/// \returns true unconditionally; there is no failure path.
		bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
		  MachineBasicBlock *MBB = MI.getParent();
		  const DebugLoc &DL = MI.getDebugLoc();

		  Register DstReg = MI.getOperand(0).getReg();
		  Register NodePtr = MI.getOperand(2).getReg();
		  Register RayExtent = MI.getOperand(3).getReg();
		  Register RayOrigin = MI.getOperand(4).getReg();
		  Register RayDir = MI.getOperand(5).getReg();
		  Register RayInvDir = MI.getOperand(6).getReg();
		  Register TDescr = MI.getOperand(7).getReg();

		  const bool IsA16 =
		      MRI->getType(RayDir).getElementType().getSizeInBits() == 16;
		  const bool Is64 = MRI->getType(NodePtr).getSizeInBits() == 64;
		  unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
		                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
		                          : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
		                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;

		  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode), DstReg);
		  // A 64-bit node pointer is passed as two separate 32-bit address operands.
		  if (Is64) {
		arsenmUnsubmitted Done Reply Inline Actions Braces here arsenm: Braces here
		    MIB.addReg(NodePtr, 0, AMDGPU::sub0)
		        .addReg(NodePtr, 0, AMDGPU::sub1);
		  } else {
		    MIB.addReg(NodePtr);
		  }

		  MIB.addReg(RayExtent);

		  // Adds the first three 32-bit lanes (sub0..sub2) of Src as operands; the
		  // fourth lane of the 4-element vector is not passed.
		  auto packLanes = [&MIB](Register Src) {
		    MIB.addReg(Src, 0, AMDGPU::sub0);
		    MIB.addReg(Src, 0, AMDGPU::sub1);
		    MIB.addReg(Src, 0, AMDGPU::sub2);
		  };

		  packLanes(RayOrigin);
		  if (IsA16) {
		    // With 16-bit elements, RayDir and RayInvDir are each read as two 32-bit
		    // subregisters and repacked into three dwords:
		    //   RayDir.sub0,
		    //   R1 = (RayInvDir.sub0 << 16) | (RayDir.sub1 & 0xffff),
		    //   R2 = V_ALIGNBIT_B32(RayInvDir.sub1, RayInvDir.sub0, 16).
		    // The helper instructions are inserted before the image instruction
		    // (&*MIB) so that R1/R2 are defined ahead of their use.
		    MIB.addReg(RayDir, 0, AMDGPU::sub0);
		    Register R1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
		    Register R2 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
		    Register Lo = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
		    BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::V_AND_B32_e64), Lo)
		        .addReg(RayDir, 0, AMDGPU::sub1)
		        .addImm(0xffff);
		    BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::V_LSHL_OR_B32), R1)
		        .addReg(RayInvDir, 0, AMDGPU::sub0)
		        .addImm(16)
		        .addReg(Lo);
		    BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), R2)
		        .addReg(RayInvDir, 0, AMDGPU::sub1)
		        .addReg(RayInvDir, 0, AMDGPU::sub0)
		        .addImm(16);
		arsenmUnsubmitted Done Reply Inline Actions Can you do this during custom lowering rather than adding bit operations here late? I'm also surprised a V_PACK_B32_F16 is involved here arsenm: Can you do this during custom lowering rather than adding bit operations here late? I'm also…
		rampitecAuthorUnsubmitted Done Reply Inline Actions What kind of operations you'd like to see in the custom lowering? v_pack_b32_f16 should be fine, this is packed half type in this case. rampitec: What kind of operations you'd like to see in the custom lowering? v_pack_b32_f16 should be fine…
		arsenmUnsubmitted Done Reply Inline Actions But v_pack_b32_f16 isn't semantically the same as the bit packing, so I would be surprised to insert this for the argument handling. arsenm: But v_pack_b32_f16 isn't semantically the same as the bit packing, so I would be surprised to…
		rampitecAuthorUnsubmitted Done Reply Inline Actions That's the best instruction for the job IMO. What we are doing is repacking vector of halfs. rampitec: That's the best instruction for the job IMO. What we are doing is repacking vector of halfs.
		arsenmUnsubmitted Done Reply Inline Actions But it does change the input values. I believe this is a canonicalizing operation, so may flush denorms and quiet snans arsenm: But it does change the input values. I believe this is a canonicalizing operation, so may flush…
		rampitecAuthorUnsubmitted Done Reply Inline Actions It should behave the same as bhv itself, the mode is common right? So if flushing on the value will be flushed anyway. Everything else results in a longer code. It can use v_lshl_or_b32, but it will also need an extra v_and_b32 to clear high half. rampitec: It should behave the same as bhv itself, the mode is common right? So if flushing on the value…
		rampitecAuthorUnsubmitted Done Reply Inline Actions Doing a custom lowering would need 4 different custom nodes and then selection. It will be much more overhead. rampitec: Doing a custom lowering would need 4 different custom nodes and then selection. It will be much…
		arsenmUnsubmitted Done Reply Inline Actions You could use one wrapper instruction like the image intrinsics. We should expose the bit packing to the post-legalize combiner arsenm: You could use one wrapper instruction like the image intrinsics. We should expose the bit…
		    MIB.addReg(R1);
		    MIB.addReg(R2);
		  } else {
		    packLanes(RayDir);
		    packLanes(RayInvDir);
		  }

		  MIB.addReg(TDescr);
		  // The a16 opcodes take one extra immediate operand, set to 1.
		  if (IsA16)
		    MIB.addImm(1);

		  MIB.cloneMemRefs(MI);

		  MI.eraseFromParent();
		  return true;
		}

bool AMDGPUInstructionSelector::select(MachineInstr &I) {		bool AMDGPUInstructionSelector::select(MachineInstr &I) {
if (I.isPHI())		if (I.isPHI())
return selectPHI(I);		return selectPHI(I);

if (!I.isPreISelOpcode()) {		if (!I.isPreISelOpcode()) {
if (I.isCopy())		if (I.isCopy())
return selectCOPY(I);		return selectCOPY(I);
return true;		return true;
▲ Show 20 Lines • Show All 1,189 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Show First 20 Lines • Show All 3,088 Lines • ▼ Show 20 Lines	case Intrinsic::amdgcn_s_sendmsghalt: {
// FIXME: Should this use a waterfall loop?		// FIXME: Should this use a waterfall loop?
constrainOpWithReadfirstlane(MI, MRI, 2); // M0		constrainOpWithReadfirstlane(MI, MRI, 2); // M0
return;		return;
}		}
case Intrinsic::amdgcn_s_setreg: {		case Intrinsic::amdgcn_s_setreg: {
constrainOpWithReadfirstlane(MI, MRI, 2);		constrainOpWithReadfirstlane(MI, MRI, 2);
return;		return;
}		}
		case Intrinsic::amdgcn_image_bvh_intersect_ray: {
		executeInWaterfallLoop(MI, MRI, { 7 });
		return;
		}
default: {		default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =		if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {		AMDGPU::lookupRsrcIntrinsic(IntrID)) {
// Non-images can have complications from operands that allow both SGPR		// Non-images can have complications from operands that allow both SGPR
// and VGPR. For now it's too complicated to figure out the final opcode		// and VGPR. For now it's too complicated to figure out the final opcode
// to derive the register bank from the MCInstrDesc.		// to derive the register bank from the MCInstrDesc.
if (RSrcIntrin->IsImage) {		if (RSrcIntrin->IsImage) {
applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);		applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
▲ Show 20 Lines • Show All 1,267 Lines • ▼ Show 20 Lines	case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
case Intrinsic::amdgcn_ds_gws_sema_p:		case Intrinsic::amdgcn_ds_gws_sema_p:
case Intrinsic::amdgcn_ds_gws_sema_release_all: {		case Intrinsic::amdgcn_ds_gws_sema_release_all: {
// This must be an SGPR, but accept a VGPR.		// This must be an SGPR, but accept a VGPR.
unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,		unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
AMDGPU::SGPRRegBankID);		AMDGPU::SGPRRegBankID);
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);		OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;		break;
}		}
		case Intrinsic::amdgcn_image_bvh_intersect_ray: {
		unsigned PtrSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
		unsigned DirSize = getSizeInBits(MI.getOperand(5).getReg(), MRI, *TRI);
		OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
		OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
		OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
		OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
		OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DirSize);
		OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DirSize);
		OpdsMapping[7] = getSGPROpMapping(MI.getOperand(7).getReg(), MRI, *TRI);
		arsenm (inline comment, done): Missing waterfall loop for SGPR operand
		rampitec (author, inline comment, done): That's a descriptor, I'd rather refuse to select.
		arsenm (inline comment, done): You can never guarantee the input is uniform or in a VGPR. We can do the right thing now easily (and every other intrinsic with a descriptor does it)
		break;
		}
default:		default:
return getInvalidInstructionMapping();		return getInvalidInstructionMapping();
}		}
break;		break;
}		}
case AMDGPU::G_SELECT: {		case AMDGPU::G_SELECT: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();		unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,		unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
▲ Show 20 Lines • Show All 85 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

				; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
				; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
				; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
				; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)

				declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
				declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
				declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
				declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)

				; i32 node pointer with <4 x float> direction vectors. %tdescr is an inreg
				; argument; the checks expect a single image_bvh_intersect_ray with the
				; descriptor in s[0:3] and no loop.
				define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
				; GCN-LABEL: image_bvh_intersect_ray:
				; GCN: ; %bb.0:
				; GCN-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3]
				; GCN-NEXT: ; implicit-def: $vcc_hi
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: ; return to shader part epilog
				%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
				%r = bitcast <4 x i32> %v to <4 x float>
				ret <4 x float> %r
				}

				; i32 node pointer with <4 x half> direction vectors. %tdescr is an inreg
				; argument; the checks expect the v_and/v_alignbit/v_lshl_or packing
				; sequence followed by image_bvh_intersect_ray with the a16 modifier.
				define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
				; GCN-LABEL: image_bvh_intersect_ray_a16:
				; GCN: ; %bb.0:
				; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v7
				; GCN-NEXT: v_alignbit_b32 v7, v9, v8, 16
				; GCN-NEXT: ; implicit-def: $vcc_hi
				; GCN-NEXT: v_lshl_or_b32 v5, v8, 16, v5
				; GCN-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v5, v7], s[0:3] a16
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: ; return to shader part epilog
				%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
				%r = bitcast <4 x i32> %v to <4 x float>
				ret <4 x float> %r
				}

				; i64 node pointer with <4 x float> direction vectors. %tdescr is an inreg
				; argument; the checks expect a single image_bvh64_intersect_ray with the
				; descriptor in s[0:3] and no loop.
				define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
				; GCN-LABEL: image_bvh64_intersect_ray:
				; GCN: ; %bb.0:
				; GCN-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3]
				; GCN-NEXT: ; implicit-def: $vcc_hi
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: ; return to shader part epilog
				%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
				%r = bitcast <4 x i32> %v to <4 x float>
				ret <4 x float> %r
				}

				; i64 node pointer with <4 x half> direction vectors. %tdescr is an inreg
				; argument; the checks expect the v_and/v_alignbit/v_lshl_or packing
				; sequence followed by image_bvh64_intersect_ray with the a16 modifier.
				define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
				; GCN-LABEL: image_bvh64_intersect_ray_a16:
				; GCN: ; %bb.0:
				; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v8
				; GCN-NEXT: v_alignbit_b32 v8, v10, v9, 16
				; GCN-NEXT: ; implicit-def: $vcc_hi
				; GCN-NEXT: v_lshl_or_b32 v6, v9, 16, v6
				; GCN-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v6, v8], s[0:3] a16
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: ; return to shader part epilog
				%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
				%r = bitcast <4 x i32> %v to <4 x float>
				ret <4 x float> %r
				}

				; Same call as @image_bvh_intersect_ray, but %tdescr is not marked inreg.
				; The checks expect a loop (BB4_1) that reads the descriptor with
				; v_readfirstlane into s[4:7], compares it against the original VGPRs, and
				; repeats via s_cbranch_execnz around the image instruction.
				define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
				arsenmUnsubmitted Done Reply Inline Actions Should include a case where this needs a waterfall loop arsenm: Should include a case where this needs a waterfall loop
				; GCN-LABEL: image_bvh_intersect_ray_vgpr_descr:
				; GCN: ; %bb.0:
				; GCN-NEXT: s_mov_b32 s1, exec_lo
				; GCN-NEXT: ; implicit-def: $vcc_hi
				; GCN-NEXT: BB4_1: ; =>This Inner Loop Header: Depth=1
				; GCN-NEXT: v_readfirstlane_b32 s4, v14
				; GCN-NEXT: v_readfirstlane_b32 s5, v15
				; GCN-NEXT: v_readfirstlane_b32 s6, v16
				; GCN-NEXT: v_readfirstlane_b32 s7, v17
				; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]
				; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
				; GCN-NEXT: s_nop 2
				; GCN-NEXT: image_bvh_intersect_ray v[18:21], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[4:7]
				; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
				; GCN-NEXT: s_and_saveexec_b32 s0, s0
				; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
				; GCN-NEXT: s_cbranch_execnz BB4_1
				; GCN-NEXT: ; %bb.2:
				; GCN-NEXT: s_mov_b32 exec_lo, s1
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: v_mov_b32_e32 v0, v18
				; GCN-NEXT: v_mov_b32_e32 v1, v19
				; GCN-NEXT: v_mov_b32_e32 v2, v20
				; GCN-NEXT: v_mov_b32_e32 v3, v21
				; GCN-NEXT: ; return to shader part epilog
				%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
				%r = bitcast <4 x i32> %v to <4 x float>
				ret <4 x float> %r
				}

				; Same call as @image_bvh_intersect_ray_a16, but %tdescr is not marked
				; inreg. The checks expect the half-packing instructions and the a16 image
				; instruction inside a v_readfirstlane loop (BB5_1) over the descriptor.
				define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
				; GCN-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
				; GCN: ; %bb.0:
				; GCN-NEXT: s_mov_b32 s1, exec_lo
				; GCN-NEXT: ; implicit-def: $vcc_hi
				; GCN-NEXT: BB5_1: ; =>This Inner Loop Header: Depth=1
				; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v7
				; GCN-NEXT: v_readfirstlane_b32 s4, v10
				; GCN-NEXT: v_readfirstlane_b32 s5, v11
				; GCN-NEXT: v_readfirstlane_b32 s6, v12
				; GCN-NEXT: v_readfirstlane_b32 s7, v13
				; GCN-NEXT: v_lshl_or_b32 v5, v8, 16, v5
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: v_alignbit_b32 v14, v9, v8, 16
				; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
				; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
				; GCN-NEXT: s_nop 0
				; GCN-NEXT: image_bvh_intersect_ray v[14:17], [v0, v1, v2, v3, v4, v6, v5, v14], s[4:7] a16
				; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
				; GCN-NEXT: s_and_saveexec_b32 s0, s0
				; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
				; GCN-NEXT: s_cbranch_execnz BB5_1
				; GCN-NEXT: ; %bb.2:
				; GCN-NEXT: s_mov_b32 exec_lo, s1
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: v_mov_b32_e32 v0, v14
				; GCN-NEXT: v_mov_b32_e32 v1, v15
				; GCN-NEXT: v_mov_b32_e32 v2, v16
				; GCN-NEXT: v_mov_b32_e32 v3, v17
				; GCN-NEXT: ; return to shader part epilog
				%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
				%r = bitcast <4 x i32> %v to <4 x float>
				ret <4 x float> %r
				}

				; Same call as @image_bvh64_intersect_ray, but %tdescr is not marked inreg.
				; The checks expect image_bvh64_intersect_ray inside a v_readfirstlane loop
				; (BB6_1) over the descriptor registers.
				define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
				; GCN-LABEL: image_bvh64_intersect_ray_vgpr_descr:
				; GCN: ; %bb.0:
				; GCN-NEXT: s_mov_b32 s1, exec_lo
				; GCN-NEXT: ; implicit-def: $vcc_hi
				; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
				; GCN-NEXT: v_readfirstlane_b32 s4, v15
				; GCN-NEXT: v_readfirstlane_b32 s5, v16
				; GCN-NEXT: v_readfirstlane_b32 s6, v17
				; GCN-NEXT: v_readfirstlane_b32 s7, v18
				; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]
				; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
				; GCN-NEXT: s_nop 2
				; GCN-NEXT: image_bvh64_intersect_ray v[19:22], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7]
				; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
				; GCN-NEXT: s_and_saveexec_b32 s0, s0
				; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
				; GCN-NEXT: s_cbranch_execnz BB6_1
				; GCN-NEXT: ; %bb.2:
				; GCN-NEXT: s_mov_b32 exec_lo, s1
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: v_mov_b32_e32 v0, v19
				; GCN-NEXT: v_mov_b32_e32 v1, v20
				; GCN-NEXT: v_mov_b32_e32 v2, v21
				; GCN-NEXT: v_mov_b32_e32 v3, v22
				; GCN-NEXT: ; return to shader part epilog
				%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
				%r = bitcast <4 x i32> %v to <4 x float>
				ret <4 x float> %r
				}

				; Same call as @image_bvh64_intersect_ray_a16, but %tdescr is not marked
				; inreg. The checks expect the half-packing instructions and the a16 image
				; instruction inside a v_readfirstlane loop (BB7_1) over the descriptor.
				define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
				; GCN-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
				; GCN: ; %bb.0:
				; GCN-NEXT: s_mov_b32 s1, exec_lo
				; GCN-NEXT: ; implicit-def: $vcc_hi
				; GCN-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1
				; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v8
				; GCN-NEXT: v_readfirstlane_b32 s4, v11
				; GCN-NEXT: v_readfirstlane_b32 s5, v12
				; GCN-NEXT: v_readfirstlane_b32 s6, v13
				; GCN-NEXT: v_readfirstlane_b32 s7, v14
				; GCN-NEXT: v_lshl_or_b32 v6, v9, 16, v6
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: v_alignbit_b32 v15, v10, v9, 16
				; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
				; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
				; GCN-NEXT: s_nop 0
				; GCN-NEXT: image_bvh64_intersect_ray v[15:18], [v0, v1, v2, v3, v4, v5, v7, v6, v15], s[4:7] a16
				; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
				; GCN-NEXT: s_and_saveexec_b32 s0, s0
				; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
				; GCN-NEXT: s_cbranch_execnz BB7_1
				; GCN-NEXT: ; %bb.2:
				; GCN-NEXT: s_mov_b32 exec_lo, s1
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: v_mov_b32_e32 v0, v15
				; GCN-NEXT: v_mov_b32_e32 v1, v16
				; GCN-NEXT: v_mov_b32_e32 v2, v17
				; GCN-NEXT: v_mov_b32_e32 v3, v18
				; GCN-NEXT: ; return to shader part epilog
				%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
				%r = bitcast <4 x i32> %v to <4 x float>
				ret <4 x float> %r
				}

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] global-isel support for RT
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 292675

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] global-isel support for RTClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 292675

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll

[AMDGPU] global-isel support for RT
ClosedPublic