Potentially an sgpr to sgpr copy fold should also be possible. That is, however, trickier because we may end up with a wrong register class at the use due to xm0/xexec permutations.
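For context, here is a minimal, hypothetical MIR sketch (not taken from the patch; values, opcodes, and operand lists are illustrative) of the kind of fold being discussed: a COPY from an sgpr into a vgpr feeding a VALU instruction is erased and the sgpr is used directly, provided the operand's register class and the constant bus restriction allow it.

```
# Hypothetical sketch, not from the patch; operand lists abbreviated.
# Before SIFoldOperands: the VALU use reads the sgpr through a vgpr copy.
%0:sgpr_32 = S_MOV_B32 42
%1:vgpr_32 = COPY %0
%2:vgpr_32 = V_ADD_U32_e32 %1, %3, implicit $exec

# After folding: the copy is gone and the sgpr is used as src0 directly,
# as long as the operand's register class and the constant bus limit permit it.
%2:vgpr_32 = V_ADD_U32_e32 %0, %3, implicit $exec
```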
Details

Reviewers: arsenm, vpykhtin
Commits: rG61e7a61bdccf: [AMDGPU] Allow folding of sgpr to vgpr copy

Event Timeline
llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll, lines 79–81
This looks like it got worse?
llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll, lines 79–81
Yes, this is a regression specific to fma/mac. The register class after the folding mismatches the xm0/xexec operand definition of the fma src.
llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll, lines 79–81
I.e. we should refine how we use sgpr register classes instead of inhibiting folding.
llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll, lines 79–81
The fma src doesn't use xm0_xexec though? Can you add a testcase with this specific case? I think this should be easily avoidable.
llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll, lines 79–81
It is explicitly disabled in SIFoldOperands::foldOperand():

```cpp
// Don't fold subregister extracts into tied operands, only if it is a full
// copy since a subregister use tied to a full register def doesn't really
// make sense. e.g. don't fold:
//
// %1 = COPY %0:sub1
// %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
//
// into
// %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
  return;
```
Changed the run line to gfx1010; otherwise folding of the sgpr in the test does not happen because it would violate the constant bus restriction (pre-gfx10 targets allow only one constant-bus read per VALU instruction, while gfx10 allows two).
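For illustration only (the test's actual RUN lines are not quoted here and may differ), the kind of run line change meant is something like:

```
; Hypothetical RUN lines, not copied from the test.
; Before: a pre-gfx10 target, where a VALU instruction may read at most one
; sgpr/constant-bus operand, so the fold is rejected.
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
; After: gfx1010 allows two constant-bus reads per VALU instruction, so the
; folded sgpr operand stays legal and the fold fires.
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
```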