This is an archive of the discontinued LLVM Phabricator instance.

[Kryo] Canonicalize commutative operands based on LSLFast
AbandonedPublic

Authored by haicheng on Apr 3 2016, 10:53 PM.

Details

Summary

LSLFast: a subtarget feature indicating the CPU handles a logical shift left of up to 3 places cheaply.

On Kryo, if a commutative instruction has an LSL on both operands and the LSL can be folded into the instruction's shifted-register form (e.g., add x0, x1, x2, lsl #3), then we should canonicalize the operands so that the operand with the smaller shift amount is the one folded into the instruction.

For example, rather than

lsl x1, x1, #1
add x0, x1, x2, lsl #4

we should prefer

lsl x2, x2, #4
add x0, x2, x1, lsl #1

as this saves a cycle on the add instruction.
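The canonicalization rule above can be sketched as follows. This is an illustrative model only, not LLVM API; the function name and return convention are invented for the example.

```python
def choose_folded_operand(shift_a, shift_b):
    """Model of the proposed canonicalization for a commutative
    instruction whose second operand can carry a folded LSL.

    Given the shift amounts on the two operands, return a pair
    (folded, separate): the smaller shift goes into the folded
    (shifted-register) position, the larger one is materialized
    with a standalone lsl instruction.
    """
    folded = min(shift_a, shift_b)
    separate = max(shift_a, shift_b)
    return folded, separate

# The example from the summary: operand shifts of 1 and 4.
# Folding lsl #1 keeps the add within the cheap-shift window,
# while lsl #4 is done by a separate instruction.
print(choose_folded_operand(1, 4))
```

Note the rule is symmetric in the two operands, which is why it only applies to commutative instructions.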

I will add commutative instructions after this patch is approved.

Diff Detail

Repository
rL LLVM

Event Timeline

haicheng updated this revision to Diff 52521.Apr 3 2016, 10:53 PM
haicheng retitled this revision from to [Kryo] Canonicalize commutative operands based on LSLFast.
haicheng updated this object.
haicheng added reviewers: mcrosier, gberry, mssimpso.
haicheng set the repository for this revision to rL LLVM.
mcrosier edited edge metadata.Apr 5 2016, 11:37 AM

Any particular reason this is being done in the MachineCombiner pass? My first thought would be to do this as a DAG combine.

haicheng updated this revision to Diff 52847.Apr 6 2016, 1:25 PM
haicheng edited edge metadata.

Reimplemented in DAG Combine.

mcrosier added inline comments.Apr 6 2016, 3:18 PM
lib/Target/AArch64/AArch64ISelLowering.cpp
8292 ↗(On Diff #52847)

I would further narrow this such that we only commute the operands when we know the shift is 3 or fewer. Otherwise, code will be perturbed for no real reason.
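The narrowing suggested here can be sketched as a predicate: only commute when the swap actually moves a shift into the fast (3-or-fewer) window, so already-cheap or unhelpable cases are left untouched. This is a hypothetical model of the review comment, not the actual patch logic.

```python
def should_commute(folded_shift, other_shift):
    """Sketch of the suggested restriction.

    folded_shift: shift amount currently in the folded
                  (shifted-register) position.
    other_shift:  shift amount on the other operand.

    Commute only when it helps: the other operand's shift fits
    the cheap window (<= 3) while the currently folded shift
    does not. Otherwise the code is left unperturbed.
    """
    return other_shift <= 3 < folded_shift

# lsl #4 folded, lsl #1 on the other operand: commuting helps.
# Both shifts small (or both large): no gain, so no swap.
print(should_commute(4, 1), should_commute(2, 1), should_commute(5, 4))
```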

haicheng updated this revision to Diff 52884.Apr 6 2016, 10:17 PM
haicheng marked an inline comment as done.

Restrict further.

t.p.northover added inline comments.
lib/Target/AArch64/AArch64ISelLowering.cpp
8292 ↗(On Diff #52884)

I'd be very tempted to turn this on for all CPUs (perhaps with a comment that Kryo is particularly keen on it). The optimal checks would be slightly different for others, but it's already a transformation that allows the formation of extra "ldr Dst, [xA, xB, lsl #N]" instructions for everyone.

mcrosier added inline comments.Apr 8 2016, 2:30 PM
lib/Target/AArch64/AArch64ISelLowering.cpp
8292 ↗(On Diff #52884)

I very much agree with you, Tim. If the optimization is applicable to all subtargets we should not guard this with isKryo.

haicheng abandoned this revision.Apr 10 2016, 11:56 PM

Moved to D18949, which supports Add/Xor/Or/And, includes llvm-commits for review, and enables this feature for all AArch64 back-ends.