Diff 363433

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Show First 20 Lines • Show All 4,764 Lines • ▼ Show 20 Lines	bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
if (!ST.hasGFX10_AEncoding()) {		if (!ST.hasGFX10_AEncoding()) {
DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),		DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
"intrinsic not supported on subtarget",		"intrinsic not supported on subtarget",
MI.getDebugLoc());		MI.getDebugLoc());
B.getMF().getFunction().getContext().diagnose(BadIntrin);		B.getMF().getFunction().getContext().diagnose(BadIntrin);
return false;		return false;
}		}

bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;		const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;		const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa		const unsigned NumVAddrs = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
: AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa		const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
: Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa		const unsigned Opcodes[2][2][2] = {
: AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;		{{AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa,
		AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa},
		{AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa,
		AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa}},
		{{AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa,
		AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa},
		{AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa,
		AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa}}};
		const unsigned Opcode = Opcodes[UseNSA][IsA16][Is64];

SmallVector<Register, 12> Ops;		SmallVector<Register, 12> Ops;
if (Is64) {		if (Is64) {
auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);		auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
Ops.push_back(Unmerge.getReg(0));		Ops.push_back(Unmerge.getReg(0));
Ops.push_back(Unmerge.getReg(1));		Ops.push_back(Unmerge.getReg(1));
} else {		} else {
Ops.push_back(NodePtr);		Ops.push_back(NodePtr);
		foad [Done]: Maybe use a 2 by 2 by 2 array of opcodes?
}		}
Ops.push_back(RayExtent);		Ops.push_back(RayExtent);

auto packLanes = [&Ops, &S32, &B] (Register Src) {		auto packLanes = [&Ops, &S32, &B] (Register Src) {
auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);		auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
Ops.push_back(Unmerge.getReg(0));		Ops.push_back(Unmerge.getReg(0));
Ops.push_back(Unmerge.getReg(1));		Ops.push_back(Unmerge.getReg(1));
Ops.push_back(Unmerge.getReg(2));		Ops.push_back(Unmerge.getReg(2));
Show All 12 Lines	if (IsA16) {
Ops.push_back(R1);		Ops.push_back(R1);
Ops.push_back(R2);		Ops.push_back(R2);
Ops.push_back(R3);		Ops.push_back(R3);
} else {		} else {
packLanes(RayDir);		packLanes(RayDir);
packLanes(RayInvDir);		packLanes(RayInvDir);
}		}

		if (!UseNSA) {
		// Build a single vector containing all the operands so far prepared.
		LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
		foad [Done]: Why do we have to round up to 8 or 16?
		critson (author) [Done]: I cannot remember why I did this for GlobalISel. It certainly works without it; perhaps it didn't when I first wrote this.
		Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
		Ops.clear();
		foad [Done]: Nit: we generally avoid explicit createGenericVirtualRegister calls. You can write: `Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);`
		critson (author) [Done]: Sure, I think I just copied the style of the code above.
		Ops.push_back(MergedOps);
		}
		foad [Done]: Can this be undef instead of 0? Can we push the same register N times instead of creating N different registers?

auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)		auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
.addDef(DstReg)		.addDef(DstReg)
.addImm(Opcode);		.addImm(Opcode);

for (Register R : Ops) {		for (Register R : Ops) {
MIB.addUse(R);		MIB.addUse(R);
}		}

▲ Show 20 Lines • Show All 231 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Show First 20 Lines • Show All 4,249 Lines • ▼ Show 20 Lines	case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
// to derive the register bank from the MCInstrDesc.		// to derive the register bank from the MCInstrDesc.
assert(RSrcIntrin->IsImage);		assert(RSrcIntrin->IsImage);
return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);		return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
}		}
case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {		case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
unsigned N = MI.getNumExplicitOperands() - 2;		unsigned N = MI.getNumExplicitOperands() - 2;
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);		OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);		OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
		if (N == 3) {
		// Sequential form: all operands combined into VGPR256/VGPR512
		unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
		foad [Done]: Why does this hard-code 512, when the comment says 256 or 512?
		if (Size > 256)
		Size = 512;
		OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
		} else {
		// NSA form
for (unsigned I = 2; I < N; ++I)		for (unsigned I = 2; I < N; ++I)
OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);		OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
		}
break;		break;
}		}
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {		case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
auto IntrID = MI.getIntrinsicID();		auto IntrID = MI.getIntrinsicID();
switch (IntrID) {		switch (IntrID) {
case Intrinsic::amdgcn_s_getreg:		case Intrinsic::amdgcn_s_getreg:
case Intrinsic::amdgcn_s_memtime:		case Intrinsic::amdgcn_s_memtime:
case Intrinsic::amdgcn_s_memrealtime:		case Intrinsic::amdgcn_s_memrealtime:
▲ Show 20 Lines • Show All 232 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,335 Lines • ▼ Show 20 Lines	case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);		auto *M = cast<MemSDNode>(Op);
updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]);		updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]);

return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,		return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());		Op->getVTList(), Ops, VT, M->getMemOperand());
}		}
case Intrinsic::amdgcn_image_bvh_intersect_ray: {		case Intrinsic::amdgcn_image_bvh_intersect_ray: {
SDLoc DL(Op);
MemSDNode *M = cast<MemSDNode>(Op);		MemSDNode *M = cast<MemSDNode>(Op);
SDValue NodePtr = M->getOperand(2);		SDValue NodePtr = M->getOperand(2);
SDValue RayExtent = M->getOperand(3);		SDValue RayExtent = M->getOperand(3);
SDValue RayOrigin = M->getOperand(4);		SDValue RayOrigin = M->getOperand(4);
SDValue RayDir = M->getOperand(5);		SDValue RayDir = M->getOperand(5);
SDValue RayInvDir = M->getOperand(6);		SDValue RayInvDir = M->getOperand(6);
SDValue TDescr = M->getOperand(7);		SDValue TDescr = M->getOperand(7);

assert(NodePtr.getValueType() == MVT::i32 \|\|		assert(NodePtr.getValueType() == MVT::i32 \|\|
NodePtr.getValueType() == MVT::i64);		NodePtr.getValueType() == MVT::i64);
assert(RayDir.getValueType() == MVT::v4f16 \|\|		assert(RayDir.getValueType() == MVT::v4f16 \|\|
RayDir.getValueType() == MVT::v4f32);		RayDir.getValueType() == MVT::v4f32);

if (!Subtarget->hasGFX10_AEncoding()) {		if (!Subtarget->hasGFX10_AEncoding()) {
emitRemovedIntrinsicError(DAG, DL, Op.getValueType());		emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
return SDValue();		return SDValue();
}		}

bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;		const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
bool Is64 = NodePtr.getValueType() == MVT::i64;		const bool Is64 = NodePtr.getValueType() == MVT::i64;
unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa		const unsigned NumVAddrs = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
: AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa		const bool UseNSA =
: Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa		Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize();
: AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;		const unsigned Opcodes[2][2][2] = {
		{{AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa,
		AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa},
		{AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa,
		AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa}},
		{{AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa,
		AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa},
		{AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa,
		AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa}}};
		const unsigned Opcode = Opcodes[UseNSA][IsA16][Is64];

SmallVector<SDValue, 16> Ops;		SmallVector<SDValue, 16> Ops;

auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {		auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
SmallVector<SDValue, 3> Lanes;		SmallVector<SDValue, 3> Lanes;
DAG.ExtractVectorElements(Op, Lanes, 0, 3);		DAG.ExtractVectorElements(Op, Lanes, 0, 3);
if (Lanes[0].getValueSizeInBits() == 32) {		if (Lanes[0].getValueSizeInBits() == 32) {
for (unsigned I = 0; I < 3; ++I)		for (unsigned I = 0; I < 3; ++I)
Show All 23 Lines	if (Is64)
DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, 2);		DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, 2);
else		else
Ops.push_back(NodePtr);		Ops.push_back(NodePtr);

Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));		Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
packLanes(RayOrigin, true);		packLanes(RayOrigin, true);
packLanes(RayDir, true);		packLanes(RayDir, true);
packLanes(RayInvDir, false);		packLanes(RayInvDir, false);

		if (!UseNSA) {
		// Build a single vector containing all the operands so far prepared.
		if (NumVAddrs > 8) {
		foad [Done]: Same question as for GlobalISel: do we need to round up at all here? Rounding up to 8 certainly seems odd now that we have v5, v6, v7 classes.
		critson (author) [Done]: The BVH minimum size is 256 bits, so the new MIMG v5/v6/v7 classes are not relevant here. I have, however, rewritten this code to only do anything above 8 VGPRs.
		SDValue Undef = DAG.getUNDEF(MVT::i32);
		Ops.append(16 - Ops.size(), Undef);
		foad [Done]: Can we use undef instead of zero to avoid having to materialise a constant?
		foad [Not Done]: Nit: I think you can do `Ops.append(16 - Ops.size(), Undef)`.
		}
		assert(Ops.size() == 8 \|\| Ops.size() == 16);
		SDValue MergedOps = DAG.getBuildVector(
		Ops.size() == 16 ? MVT::v16i32 : MVT::v8i32, DL, Ops);
		Ops.clear();
		Ops.push_back(MergedOps);
		}

Ops.push_back(TDescr);		Ops.push_back(TDescr);
if (IsA16)		if (IsA16)
Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1));		Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1));
Ops.push_back(M->getChain());		Ops.push_back(M->getChain());

auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);		auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
MachineMemOperand *MemRef = M->getMemOperand();		MachineMemOperand *MemRef = M->getMemOperand();
DAG.setNodeMemRefs(NewNode, {MemRef});		DAG.setNodeMemRefs(NewNode, {MemRef});
▲ Show 20 Lines • Show All 4,857 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll

	Show All 9 Lines

	declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)			declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
	declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)			declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
	declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)			declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
	declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)			declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
	declare i32 @llvm.amdgcn.workitem.id.x()			declare i32 @llvm.amdgcn.workitem.id.x()

	define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {			define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
	; GCN-LABEL: image_bvh_intersect_ray:			; GFX1030-LABEL: image_bvh_intersect_ray:
	; GCN: ; %bb.0:			; GFX1030: ; %bb.0:
	; GCN-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3]			; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3]
	; GCN-NEXT: s_waitcnt vmcnt(0)			; GFX1030-NEXT: s_waitcnt vmcnt(0)
	; GCN-NEXT: ; return to shader part epilog			; GFX1030-NEXT: ; return to shader part epilog
				;
				; GFX1013-LABEL: image_bvh_intersect_ray:
				; GFX1013: ; %bb.0:
				; GFX1013-NEXT: v_mov_b32_e32 v5, v6
				; GFX1013-NEXT: v_mov_b32_e32 v6, v7
				; GFX1013-NEXT: v_mov_b32_e32 v7, v8
				; GFX1013-NEXT: v_mov_b32_e32 v8, v10
				; GFX1013-NEXT: v_mov_b32_e32 v9, v11
				; GFX1013-NEXT: v_mov_b32_e32 v10, v12
				; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
				; GFX1013-NEXT: s_waitcnt vmcnt(0)
				; GFX1013-NEXT: ; return to shader part epilog
	; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget			; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
	%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)			%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
	%r = bitcast <4 x i32> %v to <4 x float>			%r = bitcast <4 x i32> %v to <4 x float>
	ret <4 x float> %r			ret <4 x float> %r
	}			}

	define amdgpu_ps <4 x float> @image_bvh_intersect_ray_flat(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {			define amdgpu_ps <4 x float> @image_bvh_intersect_ray_flat(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
	; GCN-LABEL: image_bvh_intersect_ray_flat:			; GCN-LABEL: image_bvh_intersect_ray_flat:
	Show All 32 Lines
	; GCN-NEXT: s_waitcnt vmcnt(0)			; GCN-NEXT: s_waitcnt vmcnt(0)
	; GCN-NEXT: ; return to shader part epilog			; GCN-NEXT: ; return to shader part epilog
	%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)			%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
	%r = bitcast <4 x i32> %v to <4 x float>			%r = bitcast <4 x i32> %v to <4 x float>
	ret <4 x float> %r			ret <4 x float> %r
	}			}

	define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {			define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
	; GCN-LABEL: image_bvh64_intersect_ray:			; GFX1030-LABEL: image_bvh64_intersect_ray:
	; GCN: ; %bb.0:			; GFX1030: ; %bb.0:
	; GCN-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3]			; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3]
	; GCN-NEXT: s_waitcnt vmcnt(0)			; GFX1030-NEXT: s_waitcnt vmcnt(0)
	; GCN-NEXT: ; return to shader part epilog			; GFX1030-NEXT: ; return to shader part epilog
				;
				; GFX1013-LABEL: image_bvh64_intersect_ray:
				; GFX1013: ; %bb.0:
				; GFX1013-NEXT: v_mov_b32_e32 v6, v7
				; GFX1013-NEXT: v_mov_b32_e32 v7, v8
				; GFX1013-NEXT: v_mov_b32_e32 v8, v9
				; GFX1013-NEXT: v_mov_b32_e32 v9, v11
				; GFX1013-NEXT: v_mov_b32_e32 v10, v12
				; GFX1013-NEXT: v_mov_b32_e32 v11, v13
				; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
				; GFX1013-NEXT: s_waitcnt vmcnt(0)
				; GFX1013-NEXT: ; return to shader part epilog
	%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)			%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
	%r = bitcast <4 x i32> %v to <4 x float>			%r = bitcast <4 x i32> %v to <4 x float>
	ret <4 x float> %r			ret <4 x float> %r
	}			}

	define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_flat(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {			define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_flat(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
	; GCN-LABEL: image_bvh64_intersect_ray_flat:			; GCN-LABEL: image_bvh64_intersect_ray_flat:
	; GCN: ; %bb.0:			; GCN: ; %bb.0:
	Show All 32 Lines
	; GCN-NEXT: s_waitcnt vmcnt(0)			; GCN-NEXT: s_waitcnt vmcnt(0)
	; GCN-NEXT: ; return to shader part epilog			; GCN-NEXT: ; return to shader part epilog
	%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)			%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
	%r = bitcast <4 x i32> %v to <4 x float>			%r = bitcast <4 x i32> %v to <4 x float>
	ret <4 x float> %r			ret <4 x float> %r
	}			}

	define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {			define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
	; GCN-LABEL: image_bvh_intersect_ray_vgpr_descr:			; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr:
	; GCN: ; %bb.0:			; GFX1030: ; %bb.0:
	; GCN-NEXT: s_mov_b32 s1, exec_lo			; GFX1030-NEXT: s_mov_b32 s1, exec_lo
	; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1			; GFX1030-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
	; GCN-NEXT: v_readfirstlane_b32 s4, v14			; GFX1030-NEXT: v_readfirstlane_b32 s4, v14
	; GCN-NEXT: v_readfirstlane_b32 s5, v15			; GFX1030-NEXT: v_readfirstlane_b32 s5, v15
	; GCN-NEXT: v_readfirstlane_b32 s6, v16			; GFX1030-NEXT: v_readfirstlane_b32 s6, v16
	; GCN-NEXT: v_readfirstlane_b32 s7, v17			; GFX1030-NEXT: v_readfirstlane_b32 s7, v17
	; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]			; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]
	; GCN-NEXT: image_bvh_intersect_ray v[18:21], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[4:7]			; GFX1030-NEXT: image_bvh_intersect_ray v[18:21], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[4:7]
	; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]			; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
	; GCN-NEXT: s_and_b32 s0, s0, vcc_lo			; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo
	; GCN-NEXT: s_and_saveexec_b32 s0, s0			; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
	; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0			; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
	; GCN-NEXT: s_cbranch_execnz BB6_1			; GFX1030-NEXT: s_cbranch_execnz BB6_1
	; GCN-NEXT: ; %bb.2:			; GFX1030-NEXT: ; %bb.2:
	; GCN-NEXT: s_mov_b32 exec_lo, s1			; GFX1030-NEXT: s_mov_b32 exec_lo, s1
	; GCN-NEXT: s_waitcnt vmcnt(0)			; GFX1030-NEXT: s_waitcnt vmcnt(0)
	; GCN-NEXT: v_mov_b32_e32 v0, v18			; GFX1030-NEXT: v_mov_b32_e32 v0, v18
	; GCN-NEXT: v_mov_b32_e32 v1, v19			; GFX1030-NEXT: v_mov_b32_e32 v1, v19
	; GCN-NEXT: v_mov_b32_e32 v2, v20			; GFX1030-NEXT: v_mov_b32_e32 v2, v20
	; GCN-NEXT: v_mov_b32_e32 v3, v21			; GFX1030-NEXT: v_mov_b32_e32 v3, v21
	; GCN-NEXT: ; return to shader part epilog			; GFX1030-NEXT: ; return to shader part epilog
				;
				; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr:
				; GFX1013: ; %bb.0:
				; GFX1013-NEXT: v_mov_b32_e32 v5, v6
				; GFX1013-NEXT: v_mov_b32_e32 v6, v7
				; GFX1013-NEXT: v_mov_b32_e32 v7, v8
				; GFX1013-NEXT: v_mov_b32_e32 v8, v10
				; GFX1013-NEXT: v_mov_b32_e32 v9, v11
				; GFX1013-NEXT: v_mov_b32_e32 v10, v12
				; GFX1013-NEXT: s_mov_b32 s1, exec_lo
				; GFX1013-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
				; GFX1013-NEXT: v_readfirstlane_b32 s4, v14
				; GFX1013-NEXT: v_readfirstlane_b32 s5, v15
				; GFX1013-NEXT: v_readfirstlane_b32 s6, v16
				; GFX1013-NEXT: v_readfirstlane_b32 s7, v17
				; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]
				; GFX1013-NEXT: image_bvh_intersect_ray v[18:21], v[0:15], s[4:7]
				; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
				; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo
				; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
				; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
				; GFX1013-NEXT: s_cbranch_execnz BB6_1
				; GFX1013-NEXT: ; %bb.2:
				; GFX1013-NEXT: s_mov_b32 exec_lo, s1
				; GFX1013-NEXT: s_waitcnt vmcnt(0)
				; GFX1013-NEXT: v_mov_b32_e32 v0, v18
				; GFX1013-NEXT: v_mov_b32_e32 v1, v19
				; GFX1013-NEXT: v_mov_b32_e32 v2, v20
				; GFX1013-NEXT: v_mov_b32_e32 v3, v21
				; GFX1013-NEXT: ; return to shader part epilog
	%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)			%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
	%r = bitcast <4 x i32> %v to <4 x float>			%r = bitcast <4 x i32> %v to <4 x float>
	ret <4 x float> %r			ret <4 x float> %r
	}			}

	define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {			define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
	; GCN-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:			; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
	; GCN: ; %bb.0:			; GFX1030: ; %bb.0:
	; GCN-NEXT: s_mov_b32 s0, 0xffff			; GFX1030-NEXT: s_mov_b32 s0, 0xffff
	; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v6			; GFX1030-NEXT: v_lshrrev_b32_e32 v5, 16, v6
	; GCN-NEXT: v_and_b32_e32 v14, s0, v8			; GFX1030-NEXT: v_and_b32_e32 v14, s0, v8
	; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8			; GFX1030-NEXT: v_lshrrev_b32_e32 v8, 16, v8
	; GCN-NEXT: v_and_b32_e32 v15, s0, v9			; GFX1030-NEXT: v_and_b32_e32 v15, s0, v9
	; GCN-NEXT: s_mov_b32 s1, exec_lo			; GFX1030-NEXT: s_mov_b32 s1, exec_lo
	; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5			; GFX1030-NEXT: v_lshlrev_b32_e32 v5, 16, v5
	; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14			; GFX1030-NEXT: v_lshlrev_b32_e32 v14, 16, v14
	; GCN-NEXT: v_lshl_or_b32 v15, v15, 16, v8			; GFX1030-NEXT: v_lshl_or_b32 v15, v15, 16, v8
	; GCN-NEXT: v_and_or_b32 v9, v6, s0, v5			; GFX1030-NEXT: v_and_or_b32 v9, v6, s0, v5
	; GCN-NEXT: v_and_or_b32 v14, v7, s0, v14			; GFX1030-NEXT: v_and_or_b32 v14, v7, s0, v14
	; GCN-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1			; GFX1030-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1
	; GCN-NEXT: v_readfirstlane_b32 s4, v10			; GFX1030-NEXT: v_readfirstlane_b32 s4, v10
	; GCN-NEXT: v_readfirstlane_b32 s5, v11			; GFX1030-NEXT: v_readfirstlane_b32 s5, v11
	; GCN-NEXT: v_readfirstlane_b32 s6, v12			; GFX1030-NEXT: v_readfirstlane_b32 s6, v12
	; GCN-NEXT: v_readfirstlane_b32 s7, v13			; GFX1030-NEXT: v_readfirstlane_b32 s7, v13
	; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]			; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
	; GCN-NEXT: image_bvh_intersect_ray v[5:8], [v0, v1, v2, v3, v4, v9, v14, v15], s[4:7] a16			; GFX1030-NEXT: image_bvh_intersect_ray v[5:8], [v0, v1, v2, v3, v4, v9, v14, v15], s[4:7] a16
	; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]			; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
	; GCN-NEXT: s_and_b32 s0, s0, vcc_lo			; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo
	; GCN-NEXT: s_and_saveexec_b32 s0, s0			; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
	; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0			; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
	; GCN-NEXT: s_cbranch_execnz BB7_1			; GFX1030-NEXT: s_cbranch_execnz BB7_1
	; GCN-NEXT: ; %bb.2:			; GFX1030-NEXT: ; %bb.2:
	; GCN-NEXT: s_mov_b32 exec_lo, s1			; GFX1030-NEXT: s_mov_b32 exec_lo, s1
	; GCN-NEXT: s_waitcnt vmcnt(0)			; GFX1030-NEXT: s_waitcnt vmcnt(0)
	; GCN-NEXT: v_mov_b32_e32 v0, v5			; GFX1030-NEXT: v_mov_b32_e32 v0, v5
	; GCN-NEXT: v_mov_b32_e32 v1, v6			; GFX1030-NEXT: v_mov_b32_e32 v1, v6
	; GCN-NEXT: v_mov_b32_e32 v2, v7			; GFX1030-NEXT: v_mov_b32_e32 v2, v7
	; GCN-NEXT: v_mov_b32_e32 v3, v8			; GFX1030-NEXT: v_mov_b32_e32 v3, v8
	; GCN-NEXT: ; return to shader part epilog			; GFX1030-NEXT: ; return to shader part epilog
				;
				; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
				; GFX1013: ; %bb.0:
				; GFX1013-NEXT: s_mov_b32 s0, 0xffff
				; GFX1013-NEXT: v_lshrrev_b32_e32 v5, 16, v6
				; GFX1013-NEXT: v_and_b32_e32 v14, s0, v8
				; GFX1013-NEXT: v_lshrrev_b32_e32 v8, 16, v8
				; GFX1013-NEXT: v_and_b32_e32 v9, s0, v9
				; GFX1013-NEXT: s_mov_b32 s1, exec_lo
				; GFX1013-NEXT: v_lshlrev_b32_e32 v5, 16, v5
				; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14
				; GFX1013-NEXT: v_and_or_b32 v5, v6, s0, v5
				; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v14
				; GFX1013-NEXT: v_lshl_or_b32 v7, v9, 16, v8
				; GFX1013-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1
				; GFX1013-NEXT: v_readfirstlane_b32 s4, v10
				; GFX1013-NEXT: v_readfirstlane_b32 s5, v11
				; GFX1013-NEXT: v_readfirstlane_b32 s6, v12
				; GFX1013-NEXT: v_readfirstlane_b32 s7, v13
				; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
				; GFX1013-NEXT: image_bvh_intersect_ray v[14:17], v[0:7], s[4:7] a16
				; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
				; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo
				; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
				; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
				; GFX1013-NEXT: s_cbranch_execnz BB7_1
				; GFX1013-NEXT: ; %bb.2:
				; GFX1013-NEXT: s_mov_b32 exec_lo, s1
				; GFX1013-NEXT: s_waitcnt vmcnt(0)
				; GFX1013-NEXT: v_mov_b32_e32 v0, v14
				; GFX1013-NEXT: v_mov_b32_e32 v1, v15
				; GFX1013-NEXT: v_mov_b32_e32 v2, v16
				; GFX1013-NEXT: v_mov_b32_e32 v3, v17
				; GFX1013-NEXT: ; return to shader part epilog
	%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)			%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
	%r = bitcast <4 x i32> %v to <4 x float>			%r = bitcast <4 x i32> %v to <4 x float>
	ret <4 x float> %r			ret <4 x float> %r
	}			}

	define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {			define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
	; GCN-LABEL: image_bvh64_intersect_ray_vgpr_descr:			; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr:
	; GCN: ; %bb.0:			; GFX1030: ; %bb.0:
	; GCN-NEXT: s_mov_b32 s1, exec_lo			; GFX1030-NEXT: s_mov_b32 s1, exec_lo
	; GCN-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1			; GFX1030-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1
	; GCN-NEXT: v_readfirstlane_b32 s4, v15			; GFX1030-NEXT: v_readfirstlane_b32 s4, v15
	; GCN-NEXT: v_readfirstlane_b32 s5, v16			; GFX1030-NEXT: v_readfirstlane_b32 s5, v16
	; GCN-NEXT: v_readfirstlane_b32 s6, v17			; GFX1030-NEXT: v_readfirstlane_b32 s6, v17
	; GCN-NEXT: v_readfirstlane_b32 s7, v18			; GFX1030-NEXT: v_readfirstlane_b32 s7, v18
	; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]			; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]
	; GCN-NEXT: image_bvh64_intersect_ray v[19:22], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7]			; GFX1030-NEXT: image_bvh64_intersect_ray v[19:22], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7]
	; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]			; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
	; GCN-NEXT: s_and_b32 s0, s0, vcc_lo			; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo
	; GCN-NEXT: s_and_saveexec_b32 s0, s0			; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
	; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0			; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
	; GCN-NEXT: s_cbranch_execnz BB8_1			; GFX1030-NEXT: s_cbranch_execnz BB8_1
	; GCN-NEXT: ; %bb.2:			; GFX1030-NEXT: ; %bb.2:
	; GCN-NEXT: s_mov_b32 exec_lo, s1			; GFX1030-NEXT: s_mov_b32 exec_lo, s1
	; GCN-NEXT: s_waitcnt vmcnt(0)			; GFX1030-NEXT: s_waitcnt vmcnt(0)
	; GCN-NEXT: v_mov_b32_e32 v0, v19			; GFX1030-NEXT: v_mov_b32_e32 v0, v19
	; GCN-NEXT: v_mov_b32_e32 v1, v20			; GFX1030-NEXT: v_mov_b32_e32 v1, v20
	; GCN-NEXT: v_mov_b32_e32 v2, v21			; GFX1030-NEXT: v_mov_b32_e32 v2, v21
	; GCN-NEXT: v_mov_b32_e32 v3, v22			; GFX1030-NEXT: v_mov_b32_e32 v3, v22
	; GCN-NEXT: ; return to shader part epilog			; GFX1030-NEXT: ; return to shader part epilog
				;
				; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr:
				; GFX1013: ; %bb.0:
				; GFX1013-NEXT: v_mov_b32_e32 v6, v7
				; GFX1013-NEXT: v_mov_b32_e32 v7, v8
				; GFX1013-NEXT: v_mov_b32_e32 v8, v9
				; GFX1013-NEXT: v_mov_b32_e32 v9, v11
				; GFX1013-NEXT: v_mov_b32_e32 v10, v12
				; GFX1013-NEXT: v_mov_b32_e32 v11, v13
				; GFX1013-NEXT: s_mov_b32 s1, exec_lo
				; GFX1013-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1
				; GFX1013-NEXT: v_readfirstlane_b32 s4, v15
				; GFX1013-NEXT: v_readfirstlane_b32 s5, v16
				; GFX1013-NEXT: v_readfirstlane_b32 s6, v17
				; GFX1013-NEXT: v_readfirstlane_b32 s7, v18
				; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]
				; GFX1013-NEXT: image_bvh64_intersect_ray v[19:22], v[0:15], s[4:7]
				; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
				; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo
				; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
				; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
				; GFX1013-NEXT: s_cbranch_execnz BB8_1
				; GFX1013-NEXT: ; %bb.2:
				; GFX1013-NEXT: s_mov_b32 exec_lo, s1
				; GFX1013-NEXT: s_waitcnt vmcnt(0)
				; GFX1013-NEXT: v_mov_b32_e32 v0, v19
				; GFX1013-NEXT: v_mov_b32_e32 v1, v20
				; GFX1013-NEXT: v_mov_b32_e32 v2, v21
				; GFX1013-NEXT: v_mov_b32_e32 v3, v22
				; GFX1013-NEXT: ; return to shader part epilog
	%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)			%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
	%r = bitcast <4 x i32> %v to <4 x float>			%r = bitcast <4 x i32> %v to <4 x float>
	ret <4 x float> %r			ret <4 x float> %r
	}			}

	define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {			define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
	; GCN-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:			; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
	; GCN: ; %bb.0:			; GFX1030: ; %bb.0:
	; GCN-NEXT: s_mov_b32 s0, 0xffff			; GFX1030-NEXT: s_mov_b32 s0, 0xffff
	; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7			; GFX1030-NEXT: v_lshrrev_b32_e32 v6, 16, v7
	; GCN-NEXT: v_and_b32_e32 v15, s0, v9			; GFX1030-NEXT: v_and_b32_e32 v15, s0, v9
	; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9			; GFX1030-NEXT: v_lshrrev_b32_e32 v9, 16, v9
	; GCN-NEXT: v_and_b32_e32 v16, s0, v10			; GFX1030-NEXT: v_and_b32_e32 v16, s0, v10
	; GCN-NEXT: s_mov_b32 s1, exec_lo			; GFX1030-NEXT: s_mov_b32 s1, exec_lo
	; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6			; GFX1030-NEXT: v_lshlrev_b32_e32 v6, 16, v6
	; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15			; GFX1030-NEXT: v_lshlrev_b32_e32 v15, 16, v15
	; GCN-NEXT: v_lshl_or_b32 v16, v16, 16, v9			; GFX1030-NEXT: v_lshl_or_b32 v16, v16, 16, v9
	; GCN-NEXT: v_and_or_b32 v10, v7, s0, v6			; GFX1030-NEXT: v_and_or_b32 v10, v7, s0, v6
	; GCN-NEXT: v_and_or_b32 v15, v8, s0, v15			; GFX1030-NEXT: v_and_or_b32 v15, v8, s0, v15
	; GCN-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1			; GFX1030-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1
	; GCN-NEXT: v_readfirstlane_b32 s4, v11			; GFX1030-NEXT: v_readfirstlane_b32 s4, v11
	; GCN-NEXT: v_readfirstlane_b32 s5, v12			; GFX1030-NEXT: v_readfirstlane_b32 s5, v12
	; GCN-NEXT: v_readfirstlane_b32 s6, v13			; GFX1030-NEXT: v_readfirstlane_b32 s6, v13
	; GCN-NEXT: v_readfirstlane_b32 s7, v14			; GFX1030-NEXT: v_readfirstlane_b32 s7, v14
	; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]			; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
	; GCN-NEXT: image_bvh64_intersect_ray v[6:9], [v0, v1, v2, v3, v4, v5, v10, v15, v16], s[4:7] a16			; GFX1030-NEXT: image_bvh64_intersect_ray v[6:9], [v0, v1, v2, v3, v4, v5, v10, v15, v16], s[4:7] a16
	; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]			; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
	; GCN-NEXT: s_and_b32 s0, s0, vcc_lo			; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo
	; GCN-NEXT: s_and_saveexec_b32 s0, s0			; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
	; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0			; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
	; GCN-NEXT: s_cbranch_execnz BB9_1			; GFX1030-NEXT: s_cbranch_execnz BB9_1
	; GCN-NEXT: ; %bb.2:			; GFX1030-NEXT: ; %bb.2:
	; GCN-NEXT: s_mov_b32 exec_lo, s1			; GFX1030-NEXT: s_mov_b32 exec_lo, s1
	; GCN-NEXT: s_waitcnt vmcnt(0)			; GFX1030-NEXT: s_waitcnt vmcnt(0)
	; GCN-NEXT: v_mov_b32_e32 v0, v6			; GFX1030-NEXT: v_mov_b32_e32 v0, v6
	; GCN-NEXT: v_mov_b32_e32 v1, v7			; GFX1030-NEXT: v_mov_b32_e32 v1, v7
	; GCN-NEXT: v_mov_b32_e32 v2, v8			; GFX1030-NEXT: v_mov_b32_e32 v2, v8
	; GCN-NEXT: v_mov_b32_e32 v3, v9			; GFX1030-NEXT: v_mov_b32_e32 v3, v9
	; GCN-NEXT: ; return to shader part epilog			; GFX1030-NEXT: ; return to shader part epilog
				;
				; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
				; GFX1013: ; %bb.0:
				; GFX1013-NEXT: s_mov_b32 s0, 0xffff
				; GFX1013-NEXT: v_lshrrev_b32_e32 v6, 16, v7
				; GFX1013-NEXT: v_and_b32_e32 v15, s0, v9
				; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v9
				; GFX1013-NEXT: v_and_b32_e32 v10, s0, v10
				; GFX1013-NEXT: s_mov_b32 s1, exec_lo
				; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 16, v6
				; GFX1013-NEXT: v_lshlrev_b32_e32 v15, 16, v15
				; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v6
				; GFX1013-NEXT: v_and_or_b32 v7, v8, s0, v15
				; GFX1013-NEXT: v_lshl_or_b32 v8, v10, 16, v9
				; GFX1013-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1
				; GFX1013-NEXT: v_readfirstlane_b32 s4, v11
				; GFX1013-NEXT: v_readfirstlane_b32 s5, v12
				; GFX1013-NEXT: v_readfirstlane_b32 s6, v13
				; GFX1013-NEXT: v_readfirstlane_b32 s7, v14
				; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
				; GFX1013-NEXT: s_waitcnt vmcnt(0)
				; GFX1013-NEXT: image_bvh64_intersect_ray v[15:18], v[0:15], s[4:7] a16
				; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
				; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo
				; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
				; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
				; GFX1013-NEXT: s_cbranch_execnz BB9_1
				; GFX1013-NEXT: ; %bb.2:
				; GFX1013-NEXT: s_mov_b32 exec_lo, s1
				; GFX1013-NEXT: s_waitcnt vmcnt(0)
				; GFX1013-NEXT: v_mov_b32_e32 v0, v15
				; GFX1013-NEXT: v_mov_b32_e32 v1, v16
				; GFX1013-NEXT: v_mov_b32_e32 v2, v17
				; GFX1013-NEXT: v_mov_b32_e32 v3, v18
				; GFX1013-NEXT: ; return to shader part epilog
	%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)			%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
	%r = bitcast <4 x i32> %v to <4 x float>			%r = bitcast <4 x i32> %v to <4 x float>
	ret <4 x float> %r			ret <4 x float> %r
	}			}

	define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {			define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
	; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:			; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:
	; GFX1030: ; %bb.0:			; GFX1030: ; %bb.0:
	Show All 27 Lines
	; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]			; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
	; GFX1030-NEXT: s_endpgm			; GFX1030-NEXT: s_endpgm
	;			;
	; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:			; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
	; GFX1013: ; %bb.0:			; GFX1013: ; %bb.0:
	; GFX1013-NEXT: s_clause 0x1			; GFX1013-NEXT: s_clause 0x1
	; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX1013-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34			; GFX1013-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
	; GFX1013-NEXT: v_lshlrev_b32_e32 v4, 2, v0			; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
	; GFX1013-NEXT: v_mov_b32_e32 v6, 0			; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000
	; GFX1013-NEXT: v_mov_b32_e32 v7, 1.0			; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000
	; GFX1013-NEXT: v_mov_b32_e32 v8, 2.0			; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000
	; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40400000			; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000
	; GFX1013-NEXT: v_mov_b32_e32 v10, 4.0
	; GFX1013-NEXT: v_mov_b32_e32 v11, 0x40a00000
	; GFX1013-NEXT: v_mov_b32_e32 v12, 0x40c00000
	; GFX1013-NEXT: v_mov_b32_e32 v13, 0x40e00000
	; GFX1013-NEXT: v_mov_b32_e32 v14, 0x41000000
	; GFX1013-NEXT: s_waitcnt lgkmcnt(0)			; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
	; GFX1013-NEXT: v_mov_b32_e32 v0, s4			; GFX1013-NEXT: v_mov_b32_e32 v0, s4
	; GFX1013-NEXT: v_mov_b32_e32 v1, s5			; GFX1013-NEXT: v_mov_b32_e32 v1, s5
	; GFX1013-NEXT: v_mov_b32_e32 v2, s6			; GFX1013-NEXT: v_mov_b32_e32 v2, s6
	; GFX1013-NEXT: v_mov_b32_e32 v3, s7			; GFX1013-NEXT: v_mov_b32_e32 v3, s7
	; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4			; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
	; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo			; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
	; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4			; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
				; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0
	; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo			; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
	; GFX1013-NEXT: flat_load_dword v4, v[0:1]			; GFX1013-NEXT: flat_load_dword v0, v[4:5]
	; GFX1013-NEXT: flat_load_dword v5, v[2:3]			; GFX1013-NEXT: flat_load_dword v1, v[2:3]
				; GFX1013-NEXT: v_mov_b32_e32 v2, 0
				; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0
				; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
				; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000
	; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)			; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[4:19], s[8:11]			; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[8:11]
	; GFX1013-NEXT: s_waitcnt vmcnt(0)			; GFX1013-NEXT: s_waitcnt vmcnt(0)
	; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]			; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
	; GFX1013-NEXT: s_endpgm			; GFX1013-NEXT: s_endpgm
	%lid = tail call i32 @llvm.amdgcn.workitem.id.x()			%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
	%gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid			%gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
	%node_ptr = load i32, i32* %gep_node_ptr, align 4			%node_ptr = load i32, i32* %gep_node_ptr, align 4
	%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid			%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
	%ray_extent = load float, float* %gep_ray, align 4			%ray_extent = load float, float* %gep_ray, align 4
	▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines
	; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]			; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
	; GFX1030-NEXT: s_endpgm			; GFX1030-NEXT: s_endpgm
	;			;
	; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:			; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
	; GFX1013: ; %bb.0:			; GFX1013: ; %bb.0:
	; GFX1013-NEXT: s_clause 0x1			; GFX1013-NEXT: s_clause 0x1
	; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX1013-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34			; GFX1013-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
	; GFX1013-NEXT: v_lshlrev_b32_e32 v4, 2, v0			; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
	; GFX1013-NEXT: s_movk_i32 s1, 0x4400			; GFX1013-NEXT: s_movk_i32 s1, 0x4400
	; GFX1013-NEXT: s_movk_i32 s2, 0x4200			; GFX1013-NEXT: s_movk_i32 s2, 0x4200
	; GFX1013-NEXT: s_bfe_u32 s1, s1, 0x100000			; GFX1013-NEXT: s_bfe_u32 s1, s1, 0x100000
	; GFX1013-NEXT: s_movk_i32 s3, 0x4800			; GFX1013-NEXT: s_movk_i32 s3, 0x4800
	; GFX1013-NEXT: s_bfe_u32 s2, s2, 0x100000			; GFX1013-NEXT: s_bfe_u32 s2, s2, 0x100000
	; GFX1013-NEXT: s_lshl_b32 s1, s1, 16			; GFX1013-NEXT: s_lshl_b32 s1, s1, 16
	; GFX1013-NEXT: s_movk_i32 s0, 0x4500			; GFX1013-NEXT: s_movk_i32 s0, 0x4500
	; GFX1013-NEXT: s_bfe_u32 s3, s3, 0x100000			; GFX1013-NEXT: s_bfe_u32 s3, s3, 0x100000
	; GFX1013-NEXT: s_or_b32 s1, s2, s1			; GFX1013-NEXT: s_or_b32 s1, s2, s1
	; GFX1013-NEXT: s_bfe_u32 s0, s0, 0x100000			; GFX1013-NEXT: s_bfe_u32 s0, s0, 0x100000
	; GFX1013-NEXT: s_lshl_b32 s3, s3, 16			; GFX1013-NEXT: s_lshl_b32 s3, s3, 16
	; GFX1013-NEXT: v_mov_b32_e32 v6, 0
	; GFX1013-NEXT: v_mov_b32_e32 v7, 1.0
	; GFX1013-NEXT: v_mov_b32_e32 v8, 2.0
	; GFX1013-NEXT: v_mov_b32_e32 v9, s1
	; GFX1013-NEXT: s_waitcnt lgkmcnt(0)			; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
	; GFX1013-NEXT: v_mov_b32_e32 v0, s4			; GFX1013-NEXT: v_mov_b32_e32 v0, s4
	; GFX1013-NEXT: v_mov_b32_e32 v1, s5			; GFX1013-NEXT: v_mov_b32_e32 v1, s5
	; GFX1013-NEXT: v_mov_b32_e32 v2, s6			; GFX1013-NEXT: v_mov_b32_e32 v2, s6
	; GFX1013-NEXT: v_mov_b32_e32 v3, s7			; GFX1013-NEXT: v_mov_b32_e32 v3, s7
	; GFX1013-NEXT: s_movk_i32 s5, 0x4600			; GFX1013-NEXT: s_movk_i32 s5, 0x4600
	; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4			; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
	; GFX1013-NEXT: s_movk_i32 s4, 0x4700			; GFX1013-NEXT: s_movk_i32 s4, 0x4700
	; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo			; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
	; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4			; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
	; GFX1013-NEXT: s_bfe_u32 s2, s5, 0x100000			; GFX1013-NEXT: s_bfe_u32 s2, s5, 0x100000
	; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo			; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
	; GFX1013-NEXT: s_lshl_b32 s2, s2, 16			; GFX1013-NEXT: s_lshl_b32 s2, s2, 16
	; GFX1013-NEXT: s_bfe_u32 s4, s4, 0x100000			; GFX1013-NEXT: s_bfe_u32 s4, s4, 0x100000
	; GFX1013-NEXT: s_or_b32 s0, s0, s2			; GFX1013-NEXT: s_or_b32 s0, s0, s2
	; GFX1013-NEXT: flat_load_dword v4, v[0:1]			; GFX1013-NEXT: flat_load_dword v0, v[4:5]
	; GFX1013-NEXT: flat_load_dword v5, v[2:3]			; GFX1013-NEXT: flat_load_dword v1, v[2:3]
	; GFX1013-NEXT: s_or_b32 s2, s4, s3			; GFX1013-NEXT: s_or_b32 s2, s4, s3
	; GFX1013-NEXT: v_mov_b32_e32 v10, s0			; GFX1013-NEXT: v_mov_b32_e32 v2, 0
	; GFX1013-NEXT: v_mov_b32_e32 v11, s2			; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0
				; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
				; GFX1013-NEXT: v_mov_b32_e32 v5, s1
				; GFX1013-NEXT: v_mov_b32_e32 v6, s0
				; GFX1013-NEXT: v_mov_b32_e32 v7, s2
	; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)			; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[4:11], s[8:11] a16			; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
	; GFX1013-NEXT: s_waitcnt vmcnt(0)			; GFX1013-NEXT: s_waitcnt vmcnt(0)
	; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]			; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
	; GFX1013-NEXT: s_endpgm			; GFX1013-NEXT: s_endpgm
	%lid = tail call i32 @llvm.amdgcn.workitem.id.x()			%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
	%gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid			%gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
	%node_ptr = load i32, i32* %gep_node_ptr, align 4			%node_ptr = load i32, i32* %gep_node_ptr, align 4
	%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid			%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
	%ray_extent = load float, float* %gep_ray, align 4			%ray_extent = load float, float* %gep_ray, align 4
	▲ Show 20 Lines • Show All 193 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Disable NSA for BVH instructions when appropriate
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 363433

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Disable NSA for BVH instructions when appropriate
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 363433

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll

[AMDGPU] Disable NSA for BVH instructions when appropriate
ClosedPublic