This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Force shrinking of add/sub even if the carry is used
ClosedPublic

Authored by arsenm on Aug 28 2018, 4:54 AM.

Download Raw Diff

Details

Reviewers

Summary

The original motivating example uses a 64-bit add, so the carry
out is used. Shrink the add anyway, and insert a copy from VCC into
the original carry-out register. This may also allow the instruction
that uses the carry to be shrunk. At worst, we are replacing a
mov that materializes the constant with a copy of VCC.

Diff Detail

Event Timeline

arsenm created this revision.Aug 28 2018, 4:54 AM

Herald added subscribers: t-tye, tpr, dstuttard and 5 others. · View Herald TranscriptAug 28 2018, 4:54 AM

arsenm added a child revision: D51350: CodeGen: Make computeRegisterLiveness search forward first.Aug 28 2018, 4:57 AM

I am not sure that a copy + e32 instruction is better than a single e64 instruction. In fact I think it is worse.

In D51347#1216006, @rampitec wrote:

I am not sure that a copy + e32 instruction is better than a single e64 instruction. In fact I think it is worse.

In practice the copy is always eliminated, since it's usually paired with a carry-in operation. The total cycle count is the same, with reduced code size.

In D51347#1216048, @arsenm wrote:

In D51347#1216006, @rampitec wrote:

I am not sure that a copy + e32 instruction is better than a single e64 instruction. In fact I think it is worse.

In practice the copy is always eliminated, since it's usually paired with a carry-in operation. The total cycle count is the same, with reduced code size.

Usually the add is shrunk, so it’s more code size neutral

LGTM

This revision is now accepted and ready to land.Aug 28 2018, 10:06 AM

r340862

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

SIFoldOperands.cpp

13 lines

test/

CodeGen/

AMDGPU/

fold-immediate-operand-shrink-with-carry.mir

35 lines

Diff 162829

lib/Target/AMDGPU/SIFoldOperands.cpp

Show First 20 Lines • Show All 202 Lines • ▼ Show 20 Lines	if (Fold.isImm()) {
}		}

if (Fold.needsShrink()) {		if (Fold.needsShrink()) {
MachineBasicBlock *MBB = MI->getParent();		MachineBasicBlock *MBB = MI->getParent();
auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);		auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
if (Liveness != MachineBasicBlock::LQR_Dead)		if (Liveness != MachineBasicBlock::LQR_Dead)
return false;		return false;

		MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
int Op32 = Fold.getShrinkOpcode();		int Op32 = Fold.getShrinkOpcode();
MachineOperand &Dst0 = MI->getOperand(0);		MachineOperand &Dst0 = MI->getOperand(0);
MachineOperand &Dst1 = MI->getOperand(1);		MachineOperand &Dst1 = MI->getOperand(1);
assert(Dst0.isDef() && Dst1.isDef());		assert(Dst0.isDef() && Dst1.isDef());

MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();		bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());

const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());		const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);		unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg());		const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg());
unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC);		unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC);

MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);		MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);

		if (HaveNonDbgCarryUse) {
		BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
		.addReg(AMDGPU::VCC, RegState::Kill);
		}

// Keep the old instruction around to avoid breaking iterators, but		// Keep the old instruction around to avoid breaking iterators, but
// replace the outputs with dummy registers.		// replace the outputs with dummy registers.
Dst0.setReg(NewReg0);		Dst0.setReg(NewReg0);
Dst1.setReg(NewReg1);		Dst1.setReg(NewReg1);

if (Fold.isCommuted())		if (Fold.isCommuted())
TII.commuteInstruction(*Inst32, false);		TII.commuteInstruction(*Inst32, false);
return true;		return true;
▲ Show 20 Lines • Show All 114 Lines • ▼ Show 20 Lines	if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
MachineOperand &OtherOp = MI->getOperand(OtherIdx);		MachineOperand &OtherOp = MI->getOperand(OtherIdx);
if (!OtherOp.isReg() \|\|		if (!OtherOp.isReg() \|\|
!TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))		!TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
return false;		return false;

const MachineOperand &SDst = MI->getOperand(1);		const MachineOperand &SDst = MI->getOperand(1);
assert(SDst.isDef());		assert(SDst.isDef());

// TODO: Handle cases with a used carry.
if (!MRI.use_nodbg_empty(SDst.getReg()))
return false;

int Op32 = AMDGPU::getVOPe32(Opc);		int Op32 = AMDGPU::getVOPe32(Opc);
FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,		FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
Op32));		Op32));
return true;		return true;
}		}

TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);		TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
return false;		return false;
▲ Show 20 Lines • Show All 752 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir

	# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py			# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
	# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination %s -o - \| FileCheck -check-prefix=GCN %s			# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination %s -o - \| FileCheck -check-prefix=GCN %s

	---			---

	# Uses a carry out in an instruction that can't be shrunk.			# Uses a carry out in an instruction that can't be shrunk.

	name: shrink_scalar_imm_vgpr_v_add_i32_e64_other_carry_out_use			name: shrink_scalar_imm_vgpr_v_add_i32_e64_other_carry_out_use
	tracksRegLiveness: true			tracksRegLiveness: true

	body: \|			body: \|
	bb.0:			bb.0:
	; GCN-LABEL: name: shrink_scalar_imm_vgpr_v_add_i32_e64_other_carry_out_use			; GCN-LABEL: name: shrink_scalar_imm_vgpr_v_add_i32_e64_other_carry_out_use
	; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345			; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345
	; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF			; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
	; GCN: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[DEF]], [[S_MOV_B32_]], implicit $exec			; GCN: [[V_ADD_I32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_I32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec
	; GCN: S_ENDPGM implicit [[V_ADD_I32_e64_1]]			; GCN: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY killed $vcc
				; GCN: S_ENDPGM implicit [[COPY]]
	%0:sreg_32_xm0 = S_MOV_B32 12345			%0:sreg_32_xm0 = S_MOV_B32 12345
	%1:vgpr_32 = IMPLICIT_DEF			%1:vgpr_32 = IMPLICIT_DEF
	%2:vgpr_32 = IMPLICIT_DEF			%2:vgpr_32 = IMPLICIT_DEF
	%3:vgpr_32 = IMPLICIT_DEF			%3:vgpr_32 = IMPLICIT_DEF

	%4:vgpr_32, %5:sreg_64_xexec = V_ADD_I32_e64 %0, %1, implicit $exec			%4:vgpr_32, %5:sreg_64_xexec = V_ADD_I32_e64 %0, %1, implicit $exec
	S_ENDPGM implicit %5			S_ENDPGM implicit %5

	...			...
	---			---

				name: shrink_scalar_imm_multi_use_with_used_carry
				tracksRegLiveness: true

				body: \|
				bb.0:
				; GCN-LABEL: name: shrink_scalar_imm_multi_use_with_used_carry
				; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345
				; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[S_MOV_B32_]], [[DEF]], implicit $exec
				; GCN: [[V_ADD_I32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[S_MOV_B32_]], [[DEF1]], implicit $exec
				; GCN: S_ENDPGM implicit [[V_ADD_I32_e64_1]], implicit [[V_ADD_I32_e64_2]]
				%0:sreg_32_xm0 = S_MOV_B32 12345
				%1:vgpr_32 = IMPLICIT_DEF
				%2:vgpr_32 = IMPLICIT_DEF
				%3:vgpr_32 = IMPLICIT_DEF
				%4:vgpr_32 = IMPLICIT_DEF

				%5:vgpr_32, %6:sreg_64_xexec = V_ADD_I32_e64 %0, %1, implicit $exec
				%7:vgpr_32, %8:sreg_64_xexec = V_ADD_I32_e64 %0, %2, implicit $exec
				S_ENDPGM implicit %6, implicit %7

				...
				---

	# TODO: Is it OK to leave the broken use around on the DBG_VALUE?			# TODO: Is it OK to leave the broken use around on the DBG_VALUE?

	name: shrink_scalar_imm_vgpr_v_add_i32_e64_dbg_only_carry_out_use			name: shrink_scalar_imm_vgpr_v_add_i32_e64_dbg_only_carry_out_use
	tracksRegLiveness: true			tracksRegLiveness: true

	body: \|			body: \|
	bb.0:			bb.0:
	; GCN-LABEL: name: shrink_scalar_imm_vgpr_v_add_i32_e64_dbg_only_carry_out_use			; GCN-LABEL: name: shrink_scalar_imm_vgpr_v_add_i32_e64_dbg_only_carry_out_use
	Show All 22 Lines

	body: \|			body: \|
	bb.0:			bb.0:
	; GCN-LABEL: name: shrink_scalar_imm_vgpr_v_add_i32_e64_carry_out_use			; GCN-LABEL: name: shrink_scalar_imm_vgpr_v_add_i32_e64_carry_out_use
	; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345			; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345
	; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF			; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
	; GCN: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF			; GCN: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
	; GCN: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF			; GCN: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
	; GCN: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[DEF]], [[S_MOV_B32_]], implicit $exec			; GCN: [[V_ADD_I32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_I32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec
	; GCN: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[DEF1]], [[DEF2]], [[V_ADD_I32_e64_1]], implicit $exec			; GCN: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY killed $vcc
				; GCN: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[DEF1]], [[DEF2]], [[COPY]], implicit $exec
	; GCN: S_ENDPGM implicit [[V_ADDC_U32_e64_]]			; GCN: S_ENDPGM implicit [[V_ADDC_U32_e64_]]
	%0:sreg_32_xm0 = S_MOV_B32 12345			%0:sreg_32_xm0 = S_MOV_B32 12345
	%1:vgpr_32 = IMPLICIT_DEF			%1:vgpr_32 = IMPLICIT_DEF
	%2:vgpr_32 = IMPLICIT_DEF			%2:vgpr_32 = IMPLICIT_DEF
	%3:vgpr_32 = IMPLICIT_DEF			%3:vgpr_32 = IMPLICIT_DEF

	%4:vgpr_32, %5:sreg_64_xexec = V_ADD_I32_e64 %0, %1, implicit $exec			%4:vgpr_32, %5:sreg_64_xexec = V_ADD_I32_e64 %0, %1, implicit $exec
	%6:vgpr_32, %7:sreg_64_xexec = V_ADDC_U32_e64 %2, %3, %5, implicit $exec			%6:vgpr_32, %7:sreg_64_xexec = V_ADDC_U32_e64 %2, %3, %5, implicit $exec
	S_ENDPGM implicit %6			S_ENDPGM implicit %6

	...			...