This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Move v_readlane lane select from VGPR to SGPR
ClosedPublic

Authored by nhaehnle on Apr 21 2017, 2:21 AM.

Download Raw Diff

Details

Reviewers

Commits

rG5dea6451389b: AMDGPU: Move v_readlane lane select from VGPR to SGPR
rL301197: AMDGPU: Move v_readlane lane select from VGPR to SGPR

Summary

Fix a compiler bug when the lane select happens to end up in a VGPR.

Clarify the semantics of the corresponding intrinsic to match those of
the corresponding GLSL: the lane select must be uniform across a
wavefront; otherwise, results are undefined.

Diff Detail

Repository: rL LLVM

Event Timeline

nhaehnle created this revision. · Apr 21 2017, 2:21 AM

Herald added subscribers: t-tye, tpr, dstuttard and 3 others. · View Herald Transcript · Apr 21 2017, 2:21 AM

arsenm added inline comments. · Apr 21 2017, 11:57 AM

lib/Target/AMDGPU/SIInstrInfo.cpp
2639 ↗	(On Diff #96111)	Also mention the source is assumed to be uniform?
2642 ↗	(On Diff #96111)	SReg32_XM0
2643 ↗	(On Diff #96111)	const reference
test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
26 ↗	(On Diff #96111)	Can you add a GEP on workitem ID to ensure the scalar load optimization won't ever trigger on this?

Address review comments.

LGTM

This revision is now accepted and ready to land. · Apr 24 2017, 10:27 AM

Closed by commit rL301197: AMDGPU: Move v_readlane lane select from VGPR to SGPR (authored by nha). · Explain Why · Apr 24 2017, 10:30 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

include/

llvm/

IR/

IntrinsicsAMDGPU.td

2 lines

lib/

Target/

AMDGPU/

SIInstrInfo.cpp

13 lines

test/

CodeGen/

AMDGPU/

llvm.amdgcn.readlane.ll

17 lines

Diff 96422

llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td

	Show First 20 Lines • Show All 623 Lines • ▼ Show 20 Lines
	def int_amdgcn_fcmp :			def int_amdgcn_fcmp :
	Intrinsic<[llvm_i64_ty], [llvm_anyfloat_ty, LLVMMatchType<0>, llvm_i32_ty],			Intrinsic<[llvm_i64_ty], [llvm_anyfloat_ty, LLVMMatchType<0>, llvm_i32_ty],
	[IntrNoMem, IntrConvergent]>;			[IntrNoMem, IntrConvergent]>;

	def int_amdgcn_readfirstlane :			def int_amdgcn_readfirstlane :
	GCCBuiltin<"__builtin_amdgcn_readfirstlane">,			GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;			Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;

				// The lane argument must be uniform across the currently active threads of the
				// current wave. Otherwise, the result is undefined.
	def int_amdgcn_readlane :			def int_amdgcn_readlane :
	GCCBuiltin<"__builtin_amdgcn_readlane">,			GCCBuiltin<"__builtin_amdgcn_readlane">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;			Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// CI+ Intrinsics			// CI+ Intrinsics
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	▲ Show 20 Lines • Show All 80 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 2,634 Lines • ▼ Show 20 Lines	if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
legalizeOpWithMove(MI, Src0Idx);		legalizeOpWithMove(MI, Src0Idx);
}		}

// VOP2 src0 instructions support all operand types, so we don't need to check		// VOP2 src0 instructions support all operand types, so we don't need to check
// their legality. If src1 is already legal, we don't need to do anything.		// their legality. If src1 is already legal, we don't need to do anything.
if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))		if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
return;		return;

		// Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
		// lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
		// select is uniform.
		if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
		RI.isVGPR(MRI, Src1.getReg())) {
		unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
		const DebugLoc &DL = MI.getDebugLoc();
		BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
		.add(Src1);
		Src1.ChangeToRegister(Reg, false);
		return;
		}

// We do not use commuteInstruction here because it is too aggressive and will		// We do not use commuteInstruction here because it is too aggressive and will
// commute if it is possible. We only want to commute here if it improves		// commute if it is possible. We only want to commute here if it improves
// legality. This can be called a fairly large number of times so don't waste		// legality. This can be called a fairly large number of times so don't waste
// compile time pointlessly swapping and checking legality again.		// compile time pointlessly swapping and checking legality again.
if (HasImplicitSGPR || !MI.isCommutable()) {		if (HasImplicitSGPR || !MI.isCommutable()) {
legalizeOpWithMove(MI, Src1Idx);		legalizeOpWithMove(MI, Src1Idx);
return;		return;
}		}
▲ Show 20 Lines • Show All 1,300 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll

	Show All 13 Lines
	; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32			; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32
	; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}			; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}
	define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {			define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
	%readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1)			%readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1)
	store i32 %readlane, i32 addrspace(1)* %out, align 4			store i32 %readlane, i32 addrspace(1)* %out, align 4
	ret void			ret void
	}			}

				; CHECK-LABEL: {{^}}test_readlane_vregs:
				; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
				; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
				define amdgpu_kernel void @test_readlane_vregs(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #1 {
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%gep.in = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
				%args = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in
				%value = extractelement <2 x i32> %args, i32 0
				%lane = extractelement <2 x i32> %args, i32 1
				%readlane = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %lane)
				store i32 %readlane, i32 addrspace(1)* %out, align 4
				ret void
				}

	; TODO: m0 should be folded.			; TODO: m0 should be folded.
	; CHECK-LABEL: {{^}}test_readlane_m0_sreg:			; CHECK-LABEL: {{^}}test_readlane_m0_sreg:
	; CHECK: s_mov_b32 m0, -1			; CHECK: s_mov_b32 m0, -1
	; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0			; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
	; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]]			; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]]
	; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}			; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}
	define amdgpu_kernel void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {			define amdgpu_kernel void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
	%m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()			%m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
	%readlane = call i32 @llvm.amdgcn.readlane(i32 %m0, i32 %src1)			%readlane = call i32 @llvm.amdgcn.readlane(i32 %m0, i32 %src1)
	store i32 %readlane, i32 addrspace(1)* %out, align 4			store i32 %readlane, i32 addrspace(1)* %out, align 4
	ret void			ret void
	}			}

	; CHECK-LABEL: {{^}}test_readlane_imm:			; CHECK-LABEL: {{^}}test_readlane_imm:
	; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32			; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
	define amdgpu_kernel void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) #1 {			define amdgpu_kernel void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) #1 {
	%readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 32) #0			%readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 32) #0
	store i32 %readlane, i32 addrspace(1)* %out, align 4			store i32 %readlane, i32 addrspace(1)* %out, align 4
	ret void			ret void
	}			}

				declare i32 @llvm.amdgcn.workitem.id.x() #2

	attributes #0 = { nounwind readnone convergent }			attributes #0 = { nounwind readnone convergent }
	attributes #1 = { nounwind }			attributes #1 = { nounwind }
				attributes #2 = { nounwind readnone }