Diff 275370

llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp

Show First 20 Lines • Show All 355 Lines • ▼ Show 20 Lines	bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
assert(Imm->isImm());		assert(Imm->isImm());
return (Imm->getImm() & Mask) == Value;		return (Imm->getImm() & Mask) == Value;
}		}

bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {		bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);		assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);		LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);

auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);		auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
		rampitecUnsubmitted Not Done Reply Inline Actions I assume this exploits the fact we have no instructions which may have more than two uses and are dpp combinable at the same time? If so it deserves a comment. rampitec: I assume this exploits the fact we have no instructions which may have more than two uses and…
		vpykhtinAuthorUnsubmitted Done Reply Inline Actions I'm not sure what you mean - DPP extension can only be applied to the src0 operand. addUse is called in the order of def-use chain traversal: defusechain_iterator::operator++ assumes that all per-instruction uses come contiguously - so I decided to assume the same. vpykhtin: I'm not sure what you mean - DPP extension can only be applied to the src0 operand. addUse is…
		rampitecUnsubmitted Not Done Reply Inline Actions Hm... This looks like a hack to me. rampitec: Hm... This looks like a hack to me.
		vpykhtinAuthorUnsubmitted Done Reply Inline Actions It's a cornerstone for MachineRegisterInfo's use_instr_.. iterators, otherwise it wouldn't be cheap to walk. vpykhtin: It's a cornerstone for MachineRegisterInfo's use_instr_.. iterators, otherwise it wouldn't be…
assert(DstOpnd && DstOpnd->isReg());		assert(DstOpnd && DstOpnd->isReg());
auto DPPMovReg = DstOpnd->getReg();		auto DPPMovReg = DstOpnd->getReg();
if (DPPMovReg.isPhysical()) {		if (DPPMovReg.isPhysical()) {
LLVM_DEBUG(dbgs() << " failed: dpp move writes physreg\n");		LLVM_DEBUG(dbgs() << " failed: dpp move writes physreg\n");
return false;		return false;
}		}
if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {		if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"		LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
▲ Show 20 Lines • Show All 135 Lines • ▼ Show 20 Lines	if (TII->isVOP3(OrigOp)) {
LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n");		LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n");
break;		break;
}		}
} else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {		} else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n");		LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n");
break;		break;
}		}

		auto *Src0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0);
		auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
		if (Use != Src0 && !(Use == Src1 && OrigMI.isCommutable())) { // [1]
		LLVM_DEBUG(dbgs() << " failed: no suitable operands\n");
		break;
		}
		foadUnsubmitted Not Done Reply Inline Actions This doesn't seem right. Use could be Src2 or some other operand, couldn't it? foad: This doesn't seem right. Use could be Src2 or some other operand, couldn't it?
		vpykhtinAuthorUnsubmitted Done Reply Inline Actions I couldn't find an example for this, but OK, let's check this too. vpykhtin: I couldn't find an example for this, but OK, let's check this too.

		assert(Src0 && "Src1 without Src0?");
		if (Src1 && Src1->isIdenticalTo(*Src0)) {
		assert(Src1->isReg());
		foadUnsubmitted Not Done Reply Inline Actions Could use MachineOperand::isIdenticalTo ? foad: Could use MachineOperand::isIdenticalTo ?
		foadUnsubmitted Not Done Reply Inline Actions This looks technically OK now. I still think it would be cleaner to structure the code as: if (Use == Src0) { // do it } else if (Use == Src1 && commutable && Src0 not identical to Src1) { // commute and do it } else { // fail } But I'll leave that up to your judgement. foad: This looks technically OK now. I still think it would be cleaner to structure the code as: ```…
		vpykhtinAuthorUnsubmitted Done Reply Inline Actions I think currently we have better debug messages. vpykhtin: I think currently we have better debug messages.
		LLVM_DEBUG(
		dbgs()
		<< " " << OrigMI
		<< " failed: DPP register is used more than once per instruction\n");
		break;
		}

LLVM_DEBUG(dbgs() << " combining: " << OrigMI);		LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
if (Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {		if (Use == Src0) {
if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,		if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
OldOpndValue, CombBCZ)) {		OldOpndValue, CombBCZ)) {
DPPMIs.push_back(DPPInst);		DPPMIs.push_back(DPPInst);
Rollback = false;		Rollback = false;
}		}
} else if (OrigMI.isCommutable() &&		} else {
Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {		assert(Use == Src1 && OrigMI.isCommutable()); // by check [1]
		foadUnsubmitted Not Done Reply Inline Actions It would be much simpler to add a check here that src0 and src1 are not the same register. foad: It would be much simpler to add a check here that src0 and src1 are not the same register.
auto *BB = OrigMI.getParent();		auto *BB = OrigMI.getParent();
auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);		auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
BB->insert(OrigMI, NewMI);		BB->insert(OrigMI, NewMI);
if (TII->commuteInstruction(*NewMI)) {		if (TII->commuteInstruction(*NewMI)) {
LLVM_DEBUG(dbgs() << " commuted: " << *NewMI);		LLVM_DEBUG(dbgs() << " commuted: " << *NewMI);
if (auto DPPInst = createDPPInst(NewMI, MovMI, CombOldVGPR,		if (auto DPPInst = createDPPInst(NewMI, MovMI, CombOldVGPR,
OldOpndValue, CombBCZ)) {		OldOpndValue, CombBCZ)) {
DPPMIs.push_back(DPPInst);		DPPMIs.push_back(DPPInst);
Rollback = false;		Rollback = false;
}		}
} else		} else
LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n");		LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n");
NewMI->eraseFromParent();		NewMI->eraseFromParent();
} else		}
LLVM_DEBUG(dbgs() << " failed: no suitable operands\n");
if (Rollback)		if (Rollback)
break;		break;
OrigMIs.push_back(&OrigMI);		OrigMIs.push_back(&OrigMI);
}		}

Rollback \|= !Uses.empty();		Rollback \|= !Uses.empty();

for (auto MI : (Rollback? &DPPMIs : &OrigMIs))		for (auto MI : (Rollback? &DPPMIs : &OrigMIs))
▲ Show 20 Lines • Show All 43 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/dpp_combine.mir

Show First 20 Lines • Show All 827 Lines • ▼ Show 20 Lines	bb.0:
%1:vgpr_32 = COPY $vgpr1		%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = IMPLICIT_DEF		%2:vgpr_32 = IMPLICIT_DEF

%3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec		%3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
%4:vgpr_32 = nofpexcept nnan V_ADD_F32_e64 0, %3, 0, %0, 0, 0, implicit $mode, implicit $exec		%4:vgpr_32 = nofpexcept nnan V_ADD_F32_e64 0, %3, 0, %0, 0, 0, implicit $mode, implicit $exec
S_ENDPGM 0, implicit %4		S_ENDPGM 0, implicit %4

...		...

		# GCN-LABEL: name: dont_combine_more_than_one_operand
		# GCN: %3:vgpr_32 = V_MAX_F32_e64 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec
		name: dont_combine_more_than_one_operand
		tracksRegLiveness: true
		body: \|
		bb.0:
		liveins: $vgpr0, $vgpr1
		%0:vgpr_32 = COPY $vgpr0
		%1:vgpr_32 = COPY $vgpr1
		%2:vgpr_32 = V_MOV_B32_dpp %0, %1, 1, 15, 15, 1, implicit $exec
		%3:vgpr_32 = V_MAX_F32_e64 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec
		...

		# GCN-LABEL: name: dont_combine_more_than_one_operand_dpp_reg_sequence
		# GCN: %5:vgpr_32 = V_ADD_I32_e32 %4.sub0, %4.sub0, implicit-def $vcc, implicit $exec
		# GCN: %6:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %4.sub1, implicit-def $vcc, implicit $vcc, implicit $exec
		name: dont_combine_more_than_one_operand_dpp_reg_sequence
		tracksRegLiveness: true
		body: \|
		bb.0:
		liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
		%0:vreg_64 = COPY $vgpr0_vgpr1
		%1:vreg_64 = COPY $vgpr2_vgpr3
		%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
		%3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec
		%4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
		%5:vgpr_32 = V_ADD_I32_e32 %4.sub0, %4.sub0, implicit-def $vcc, implicit $exec
		%6:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %4.sub1, implicit-def $vcc, implicit $vcc, implicit $exec
		...

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Don't combine DPP if DPP register is used more than once per instruction
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 275370

llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp

llvm/test/CodeGen/AMDGPU/dpp_combine.mir

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Don't combine DPP if DPP register is used more than once per instruction ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 275370

llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp

llvm/test/CodeGen/AMDGPU/dpp_combine.mir

[AMDGPU] Don't combine DPP if DPP register is used more than once per instruction
ClosedPublic