This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Move v_readlane lane select from VGPR to SGPR
ClosedPublic

Authored by nhaehnle on Apr 21 2017, 2:21 AM.

Download Raw Diff

Details

Reviewers

Commits

rG5dea6451389b: AMDGPU: Move v_readlane lane select from VGPR to SGPR
rL301197: AMDGPU: Move v_readlane lane select from VGPR to SGPR

Summary

Fix a compiler bug when the lane select happens to end up in a VGPR.

Clarify the semantics of the corresponding intrinsic to be those of
the corresponding GLSL: the lane select must be uniform across a
wavefront; otherwise, results are undefined.

Diff Detail

Build Status

Buildable 5740
Build 5740: arc lint + arc unit

Event Timeline

nhaehnle created this revision. · Apr 21 2017, 2:21 AM

Herald added subscribers: t-tye, tpr, dstuttard and 3 others. · View Herald Transcript · Apr 21 2017, 2:21 AM

arsenm added inline comments. · Apr 21 2017, 11:57 AM

lib/Target/AMDGPU/SIInstrInfo.cpp
2639	Also mention the source is assumed to be uniform?
2642	SReg32_XM0
2643	const reference
test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
26	Can you add a GEP on workitem ID to ensure the scalar load optimization won't ever trigger on this?

Address review comments.

LGTM

This revision is now accepted and ready to land. · Apr 24 2017, 10:27 AM

Closed by commit rL301197: AMDGPU: Move v_readlane lane select from VGPR to SGPR (authored by nha). · Explain Why · Apr 24 2017, 10:30 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

include/

llvm/

IR/

IntrinsicsAMDGPU.td

2 lines

lib/

Target/

AMDGPU/

SIInstrInfo.cpp

12 lines

test/

CodeGen/

AMDGPU/

llvm.amdgcn.readlane.ll

12 lines

Diff 96111

include/llvm/IR/IntrinsicsAMDGPU.td

	Show First 20 Lines • Show All 623 Lines • ▼ Show 20 Lines
	def int_amdgcn_fcmp :			def int_amdgcn_fcmp :
	Intrinsic<[llvm_i64_ty], [llvm_anyfloat_ty, LLVMMatchType<0>, llvm_i32_ty],			Intrinsic<[llvm_i64_ty], [llvm_anyfloat_ty, LLVMMatchType<0>, llvm_i32_ty],
	[IntrNoMem, IntrConvergent]>;			[IntrNoMem, IntrConvergent]>;

	def int_amdgcn_readfirstlane :			def int_amdgcn_readfirstlane :
	GCCBuiltin<"__builtin_amdgcn_readfirstlane">,			GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;			Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;

				// The lane argument must be uniform across the currently active threads of the
				// current wave. Otherwise, the result is undefined.
	def int_amdgcn_readlane :			def int_amdgcn_readlane :
	GCCBuiltin<"__builtin_amdgcn_readlane">,			GCCBuiltin<"__builtin_amdgcn_readlane">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;			Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// CI+ Intrinsics			// CI+ Intrinsics
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	▲ Show 20 Lines • Show All 80 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 2,629 Lines • ▼ Show 20 Lines	if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
legalizeOpWithMove(MI, Src0Idx);		legalizeOpWithMove(MI, Src0Idx);
}		}

// VOP2 src0 instructions support all operand types, so we don't need to check		// VOP2 src0 instructions support all operand types, so we don't need to check
// their legality. If src1 is already legal, we don't need to do anything.		// their legality. If src1 is already legal, we don't need to do anything.
if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))		if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
return;		return;

		// Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
		// lane select.
		[Inline comment by arsenm (unsubmitted, not done)]: Also mention the source is assumed to be uniform?
		if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
		RI.isVGPR(MRI, Src1.getReg())) {
		unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
		[Inline comment by arsenm (unsubmitted, not done)]: SReg32_XM0
		DebugLoc DL = MI.getDebugLoc();
		[Inline comment by arsenm (unsubmitted, not done)]: const reference
		BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
		.add(Src1);
		Src1.ChangeToRegister(Reg, false);
		return;
		}

// We do not use commuteInstruction here because it is too aggressive and will		// We do not use commuteInstruction here because it is too aggressive and will
// commute if it is possible. We only want to commute here if it improves		// commute if it is possible. We only want to commute here if it improves
// legality. This can be called a fairly large number of times so don't waste		// legality. This can be called a fairly large number of times so don't waste
// compile time pointlessly swapping and checking legality again.		// compile time pointlessly swapping and checking legality again.
if (HasImplicitSGPR \|\| !MI.isCommutable()) {		if (HasImplicitSGPR \|\| !MI.isCommutable()) {
legalizeOpWithMove(MI, Src1Idx);		legalizeOpWithMove(MI, Src1Idx);
return;		return;
}		}
▲ Show 20 Lines • Show All 1,301 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll

	Show All 13 Lines
	; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32			; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32
	; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}			; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}
	define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {			define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
	%readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1)			%readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1)
	store i32 %readlane, i32 addrspace(1)* %out, align 4			store i32 %readlane, i32 addrspace(1)* %out, align 4
	ret void			ret void
	}			}

				; CHECK-LABEL: {{^}}test_readlane_vregs:
				; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
				; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
				define amdgpu_kernel void @test_readlane_vregs(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #1 {
				%args = load <2 x i32>, <2 x i32> addrspace(1)* %in
				[Inline comment by arsenm (unsubmitted, not done)]: Can you add a GEP on workitem ID to ensure the scalar load optimization won't ever trigger on this?
				%value = extractelement <2 x i32> %args, i32 0
				%lane = extractelement <2 x i32> %args, i32 1
				%readlane = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %lane)
				store i32 %readlane, i32 addrspace(1)* %out, align 4
				ret void
				}

	; TODO: m0 should be folded.			; TODO: m0 should be folded.
	; CHECK-LABEL: {{^}}test_readlane_m0_sreg:			; CHECK-LABEL: {{^}}test_readlane_m0_sreg:
	; CHECK: s_mov_b32 m0, -1			; CHECK: s_mov_b32 m0, -1
	; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0			; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
	; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]]			; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]]
	; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}			; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}
	define amdgpu_kernel void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {			define amdgpu_kernel void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
	%m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()			%m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
	Show All 15 Lines