This is an archive of the discontinued LLVM Phabricator instance.

[X86] Integer multiplication improvements
Needs ReviewPublic

Authored by mkuper on Feb 1 2015, 6:04 AM.

Download Raw Diff

Details

Reviewers

nadav
delena

Summary

This makes the x86-specific DAG combine for (imul -> shl + lea) more generic, allowing it to handle some cases it could not handle before (e.g. x * -81 or x * 1920).

Diff Detail

Event Timeline

mkuper updated this revision to Diff 19111.Feb 1 2015, 6:04 AM

mkuper retitled this revision from to [X86] Integer multiplication improvements.

mkuper updated this object.

mkuper edited the test plan for this revision. (Show Details)

mkuper added reviewers: nadav, delena.

mkuper added a subscriber: Unknown Object (MLST).

chandlerc added a subscriber: chandlerc.Feb 1 2015, 6:58 AM

chandlerc added inline comments.

lib/Target/X86/X86ISelLowering.cpp
23991–23992	s/in/an
24019–24023	This doesn't really make sense. We shouldn't be hard coding these things here. LEA is slower on Atom than on other X86 chips. IMUL can be anything from 6 to 14 cycles on some X86 chips. We have a schedule, we should consult it for the expected latency of these and use those to drive the limits. (And we should fix the schedule to be correct if it is currently wrong.) Also, I wouldn't use capitals here. This isn't a macro.
24025–24026	Just use the boolean?
24060–24061	It's pretty hostile to the DAG to create nodes unless they are absolutely going to be used. It's almost certainly better to do the math in a tight loop first to verify that we'll actually combine this node. That will also let you use early exit.

Thanks, Chandler!

lib/Target/X86/X86ISelLowering.cpp
24019–24023	Right, didn't think about Atom, I'll check whether it's worthwhile there, thanks! The "4" is just a conservative estimate - precisely because it can be anything in a fairly large range (can be below 6 - starts at 4 on HSW). I don't think we model that anywhere, so taking a number from the schedule didn't look like the right thing to do. In any case, I'll take a look at what the schedule currently is, and whether it makes sense.
24025–24026	I always thought adding a boolean looks odd, but sure.
24060–24061	Got it.

Revision Contents

Path

Size

lib/

Target/

X86/

	X86ISelLowering.cpp
	X86ISelLowering.cpp (revision 227715)

107 lines

test/

CodeGen/

X86/

	imul-lea-2.ll
	imul-lea-2.ll (revision 227715)

19 lines

	imul.ll
	imul.ll (revision 227715)

60 lines

Diff 19111

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 23,965 Lines • ▼ Show 20 Lines	case Intrinsic::x86_avx2_psra_d: {
// Replace this packed shift intrinsic with a target independent		// Replace this packed shift intrinsic with a target independent
// shift dag node.		// shift dag node.
SDValue Splat = DAG.getConstant(C, VT);		SDValue Splat = DAG.getConstant(C, VT);
return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);		return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
}		}
}		}
}		}

		static int getSmallDivisor(uint64_t Val) {
		// Note that order is important - 9 must come before 3.
		if (Val % 9 == 0)
		return 9;
		if (Val % 5 == 0)
		return 5;
		if (Val % 3 == 0)
		return 3;
		return 0;
		}

/// PerformMulCombine - Optimize a single multiply with constant into two		/// PerformMulCombine - Optimize a single multiply with constant into two
/// in order to implement it with two cheaper instructions, e.g.		/// in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.		/// LEA + SHL, LEA + LEA.
static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,		static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {		TargetLowering::DAGCombinerInfo &DCI) {

		// The sequences we want to produce are larger than in imul,
		// disable this for -Oz
		chandlercUnsubmitted Not Done Reply Inline Actions s/in/an chandlerc: s/in/an
		if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
		AttributeSet::FunctionIndex, Attribute::MinSize))
		return SDValue();

if (DCI.isBeforeLegalize() \|\| DCI.isCalledByLegalizer())		if (DCI.isBeforeLegalize() \|\| DCI.isCalledByLegalizer())
return SDValue();		return SDValue();

EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);
if (VT != MVT::i64 && VT != MVT::i32)		if (VT != MVT::i64 && VT != MVT::i32)
return SDValue();		return SDValue();

ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));		ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!C)		if (!C)
return SDValue();		return SDValue();
uint64_t MulAmt = C->getZExtValue();
if (isPowerOf2_64(MulAmt) \|\| MulAmt == 3 \|\| MulAmt == 5 \|\| MulAmt == 9)		bool IsNeg = C->getSExtValue() < 0;
		uint64_t MulAmt = IsNeg ? -C->getSExtValue() : C->getSExtValue();

		// This is handled by the target-independent DAGCombine
		if (!MulAmt \|\| isPowerOf2_64(MulAmt))
return SDValue();		return SDValue();

uint64_t MulAmt1 = 0;		unsigned ShiftAmt = countTrailingZeros(MulAmt);
uint64_t MulAmt2 = 0;		if (ShiftAmt)
if ((MulAmt % 9) == 0) {		MulAmt >>= ShiftAmt;
MulAmt1 = 9;
MulAmt2 = MulAmt / 9;		// How many steps will we have to perform to replace the
} else if ((MulAmt % 5) == 0) {		// MUL. We limit this to 3 steps, based on imul latency of 4
MulAmt1 = 5;		// (If the latency is equal, this is probably not a win)
MulAmt2 = MulAmt / 5;		unsigned Steps = 0;
} else if ((MulAmt % 3) == 0) {		const unsigned MAX_STEPS = 3;
		chandlercUnsubmitted Not Done Reply Inline Actions This doesn't really make sense. We shouldn't be hard coding these things here. LEA is slower on Atom than on other X86 chips. IMUL can be anything from 6 to 14 cycles on some X86 chips. We have a schedule, we should consult it for the expected latency of these and use those to drive the limits. (And we should fix the schedule to be correct if it is currently wrong.) Also, I wouldn't use capitals here. This isn't a macro. chandlerc: This doesn't really make sense. We shouldn't be hard coding these things here. - LEA is slower…
		mkuperAuthorUnsubmitted Not Done Reply Inline Actions Right, didn't think about Atom, I'll check whether it's worthwhile there, thanks! The "4" is just a conservative estimate - precisely because it can be anything in a fairly large range (can be below 6 - starts at 4 on HSW). I don't think we model that anywhere, so taking a number from the schedule didn't look like the right thing to do. In any case, I'll take a look at what the schedule currently is, and whether it makes sense. mkuper: Right, didn't think about Atom, I'll check whether it's worthwhile there, thanks! The "4" is…
MulAmt1 = 3;		// Having to negate or shift is a step
MulAmt2 = MulAmt / 3;		Steps += (IsNeg ? 1 : 0);
}		Steps += (ShiftAmt ? 1 : 0);
		chandlercUnsubmitted Not Done Reply Inline Actions Just use the boolean? chandlerc: Just use the boolean?
		mkuperAuthorUnsubmitted Not Done Reply Inline Actions I always thought adding a boolean looks odd, but sure. mkuper: I always thought adding a boolean looks odd, but sure.
if (MulAmt2 &&
(isPowerOf2_64(MulAmt2) \|\| MulAmt2 == 3 \|\| MulAmt2 == 5 \|\| MulAmt2 == 9)){		// Finalize multiplications by integers whose only factors are in
		// {3, 5, 9, 2 ^ C}
		SDValue NewMul = N->getOperand(0);
SDLoc DL(N);		SDLoc DL(N);

if (isPowerOf2_64(MulAmt2) &&		// First, we shift, then we multiply by the small constant, with the
!(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))		// expectation that the small constant multiplication will become a LEA.
// If second multiplifer is pow2, issue it first. We want the multiply by		// But if the original mul has one use, and that use is an ADD, then we
// 3, 5, or 9 to be folded into the addressing mode unless the lone use		// may get better code from the opposite order, since that ADD may be
// is an add.		// part of an addressing mode calculation.
std::swap(MulAmt1, MulAmt2);		bool MulFirst = N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD;

SDValue NewMul;
if (isPowerOf2_64(MulAmt1))
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(MulAmt1, VT));

if (isPowerOf2_64(MulAmt2))		if (ShiftAmt && !MulFirst)
NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,		NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
DAG.getConstant(Log2_64(MulAmt2), MVT::i8));		DAG.getConstant(ShiftAmt, MVT::i8));
else
		unsigned Val = getSmallDivisor(MulAmt);
		for (; Val && (Steps < MAX_STEPS) && (MulAmt != 1); ++Steps) {
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,		NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
DAG.getConstant(MulAmt2, VT));		DAG.getConstant(Val, VT));
		MulAmt /= Val;
		Val = getSmallDivisor(MulAmt);
		}

		if (ShiftAmt && MulFirst)
		NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
		DAG.getConstant(ShiftAmt, MVT::i8));

		if (IsNeg)
		NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), NewMul);

// Do not add new nodes to DAG combiner worklist.		// Do not add new nodes to DAG combiner worklist.
		if (MulAmt == 1)
DCI.CombineTo(N, NewMul, false);		DCI.CombineTo(N, NewMul, false);
		chandlercUnsubmitted Not Done Reply Inline Actions It's pretty hostile to the DAG to create nodes unless they are absolutely going to be used. It's almost certainly better to do the math in a tight loop first to verify that we'll actually combine this node. That will also let you use early exit. chandlerc: It's pretty hostile to the DAG to create nodes unless they are absolutely going to be used.
		mkuperAuthorUnsubmitted Not Done Reply Inline Actions Got it. mkuper: Got it.
}
return SDValue();		return SDValue();
}		}

static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {		static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);		SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);		SDValue N1 = N->getOperand(1);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);		ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
EVT VT = N0.getValueType();		EVT VT = N0.getValueType();
▲ Show 20 Lines • Show All 2,740 Lines • Show Last 20 Lines

test/CodeGen/X86/imul-lea-2.ll

	; RUN: llc < %s -march=x86-64 \| FileCheck %s

	; CHECK-NOT: imul

	define i64 @t1(i64 %a) nounwind readnone {
	entry:
	%0 = mul i64 %a, 81
	; CHECK: lea
	; CHECK: lea
	ret i64 %0
	}

	define i64 @t2(i64 %a) nounwind readnone {
	entry:
	%0 = mul i64 %a, 40
	; CHECK: shl
	; CHECK: lea
	ret i64 %0
	}

test/CodeGen/X86/imul.ll

Show First 20 Lines • Show All 63 Lines • ▼ Show 20 Lines	; X86-NEXT: sbbl
%mul = mul i64 %A, -4096		%mul = mul i64 %A, -4096
ret i64 %mul		ret i64 %mul
}		}

define i32 @mul3_32(i32 %A) {		define i32 @mul3_32(i32 %A) {
; X64-LABEL: mul3_32:		; X64-LABEL: mul3_32:
; X64: leal		; X64: leal
; X86-LABEL: mul3_32:		; X86-LABEL: mul3_32:
; But why?!		; X86: leal
; X86: imull
%mul = mul i32 %A, 3		%mul = mul i32 %A, 3
ret i32 %mul		ret i32 %mul
}		}

define i64 @mul3_64(i64 %A) {		define i64 @mul3_64(i64 %A) {
; X64-LABEL: mul3_64:		; X64-LABEL: mul3_64:
; X64: leaq		; X64: leaq
; X86-LABEL: mul3_64:		; X86-LABEL: mul3_64:
; X86: mull		; X86: leal
; X86-NEXT: imull		; X86-NEXT: movl
		; X86-NEXT: mull
		; X86-NEXT: addl
%mul = mul i64 %A, 3		%mul = mul i64 %A, 3
ret i64 %mul		ret i64 %mul
}		}

define i32 @mul40_32(i32 %A) {		define i32 @mul40_32(i32 %A) {
; X64-LABEL: mul40_32:		; X64-LABEL: mul40_32:
; X64: shll		; X64: shll
; X64-NEXT: leal		; X64-NEXT: leal
Show All 11 Lines
; X86-LABEL: mul40_64:		; X86-LABEL: mul40_64:
; X86: leal		; X86: leal
; X86-NEXT: movl		; X86-NEXT: movl
; X86-NEXT: mull		; X86-NEXT: mull
; X86-NEXT: leal		; X86-NEXT: leal
%mul = mul i64 %A, 40		%mul = mul i64 %A, 40
ret i64 %mul		ret i64 %mul
}		}

		define i32 @mul81_32(i32 %A) {
		; X64-LABEL: mul81_32:
		; X64: leal
		; X64-NEXT: leal
		; X86-LABEL: mul81_32:
		; X86: leal
		; X86-NEXT: leal
		%mul = mul i32 %A, 81
		ret i32 %mul
		}

		define i32 @mulmin81_32(i32 %A) {
		; X64-LABEL: mulmin81_32:
		; X64: leal
		; X64-NEXT: leal
		; X64-NEXT: negl
		; X86-LABEL: mulmin81_32:
		; X86: leal
		; X86-NEXT: leal
		; X86-NEXT: negl
		%mul = mul i32 %A, -81
		ret i32 %mul
		}

		define i32 @mul1920_32(i32 %A) {
		; X64-LABEL: mul1920_32:
		; X64: shll
		; X64-NEXT: leal
		; X64-NEXT: leal
		; X86-LABEL: mul1920_32:
		; X86: shll
		; X86-NEXT: leal
		; X86-NEXT: leal
		%mul = mul i32 %A, 1920
		ret i32 %mul
		}

		; These muls require too many instructions
		define i32 @negative(i32 %A) {
		; X64-LABEL: negative:
		; X64: imull $-1920
		; X64: imull $625
		; X86-LABEL: negative:
		; X86: imull $-1920
		; X86: imull $625
		%mul = mul i32 %A, -1920
		%mul2 = mul i32 %A, 625
		%f = add i32 %mul, %mul2
		ret i32 %f
		}
		No newline at end of file