This is an archive of the discontinued LLVM Phabricator instance.

; RUN: llc -mtriple=aarch64-apple-darwin            -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ISEL
; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-FAST

And pre-generate the check lines with update_llc_test_checks.

sdesmalen added inline comments.Oct 18 2021, 1:05 AM

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
12896	I'd suggest collapsing all conditions together into a single lambda, so that it makes the code structurally simpler (less control flow) and more readable (you can directly see the exact pattern being matched). It may also help handle the case where N0 is an ADD/SUB that does not match the conditions that follow, while N1 is just `Y + 1`; e.g. (X+Z)*(Y+1) isn't yet handled by your code.

auto IsAddSubWith1 = [](SDValue V) -> bool {
  unsigned Opc = V->getOpcode();
  if ((Opc == ISD::ADD || Opc == ISD::SUB) && V->hasOneUse()) {
    SDValue Opnd = Opc == ISD::ADD ? V->getOperand(1) : V->getOperand(0);
    if (auto C = dyn_cast<ConstantSDNode>(Opnd))
      return C->isOne();
  }
  return false;
};

if (IsAddSubWith1(N->getOperand(0))) {
  // Rewrite
}
if (IsAddSubWith1(N->getOperand(1))) {
  // Rewrite
}
12923	nit: this one can be replaced by N1.
12930	nit: this one can be replaced by N0.

DavidSpickett edited reviewers, added: DavidSpickett; removed: ARMDavidSpickett.Oct 18 2021, 1:18 AM

wwei updated this revision to Diff 380878.Oct 20 2021, 2:15 AM

wwei edited reviewers, added: david-arm; removed: DavidSpickett.

wwei added inline comments.Oct 20 2021, 2:19 AM

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
12896	Thank you for your suggestion! I have refactored the code.
12923	done
12930	This one can't be replaced
llvm/test/CodeGen/AArch64/madd-combiner.ll
1–3	updated, thanks

Harbormaster completed remote builds in B129679: Diff 380878.Oct 20 2021, 2:57 AM

Thanks for the changes!

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

12886

nit: unnecessary curly braces.

12896

If you change IsAddSubWith1 to also return the other operand, the code for rewriting becomes a bit simpler and easier to follow.

auto IsAddSubWith1 = [](SDValue V, SDValue &OtherOpnd) -> bool {
  ...
};

SDValue OtherOpnd;                                                
if (IsAddSubWith1(N0, OtherOpnd)) {
  SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, OtherOpnd);
  return DAG.getNode(N0->getOpcode(), DL, VT, N1, MulVal);
}

Hi @wwei, have you done any performance measurements with this change on hardware to see what effect it has on some benchmarks?

wwei updated this revision to Diff 381888.Oct 25 2021, 2:05 AM

wwei added inline comments.

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
12886	Braces removed
12896	Done.

In D111862#3075475, @david-arm wrote:

Hi @wwei, have you done any performance measurements with this change on hardware to see what effect it has on some benchmarks?

We found this optimization opportunity on some HPC workloads, but we haven’t tested the performance gain yet.

Harbormaster completed remote builds in B130389: Diff 381888.Oct 25 2021, 3:22 AM

LGTM! Looks like you've addressed all the review comments.

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
12882	nit: Could you rename this so it's more obvious, i.e. something like `AddSubOpc`?

This revision is now accepted and ready to land.Nov 2 2021, 4:45 AM

Closed by commit rGbf3784b882c4: [AArch64] Canonicalize X*(Y+1) or X*(1-Y) to madd/msub (authored by wwei). · Explain WhyNov 8 2021, 12:53 AM

This revision was automatically updated to reflect the committed changes.

wwei added a commit: rGbf3784b882c4: [AArch64] Canonicalize X*(Y+1) or X*(1-Y) to madd/msub.

Should we add ISD::FREEZE here? What if X/Y is undef for this pattern?
It looks like the transform is not correct unless X/Y is noundef in LLVM IR. I don't know whether SelectionDAG has a similar limitation.
https://alive2.llvm.org/ce/z/ciKKWy

Herald added a project: Restricted Project. · View Herald TranscriptFeb 16 2023, 12:53 AM

The pattern isn't undef-safe, but it's poison-safe. We haven't been going after transforms like that very aggressively lately, given https://discourse.llvm.org/t/rfc-load-instruction-uninitialized-memory-semantics/67481 . But yes, it should freeze the operand.

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64ISelLowering.cpp

48 lines

test/

CodeGen/

AArch64/

madd-combiner.ll

48 lines

Diff 379904

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 12,865 Lines • ▼ Show 20 Lines	static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {		const AArch64Subtarget *Subtarget) {

if (SDValue Ext = performMulVectorExtendCombine(N, DAG))		if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
return Ext;		return Ext;

if (DCI.isBeforeLegalizeOps())		if (DCI.isBeforeLegalizeOps())
return SDValue();		return SDValue();

		// Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
		// and in MachineCombiner pass, add+mul will be combined into madd.
		// Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
		SDLoc DL(N);
		EVT VT = N->getValueType(0);
		SDValue N0 = N->getOperand(0);
		SDValue N1 = N->getOperand(1);
		SDValue Op0;
		SDValue Op1;
		david-armUnsubmitted Not Done Reply Inline Actions nit: Could you rename this so it's more obvious, i.e. something like `AddSubOpc`? david-arm: nit: Could you rename this so it's more obvious, i.e. something like `AddSubOpc`?

		if (!isa<ConstantSDNode>(N0) && !isa<ConstantSDNode>(N1)) {
		if (N0->getOpcode() == ISD::ADD \|\| N0->getOpcode() == ISD::SUB) {
		Op0 = N0;
		sdesmalenUnsubmitted Not Done Reply Inline Actions nit: unnecessary curly braces. sdesmalen: nit: unnecessary curly braces.
		wweiAuthorUnsubmitted Done Reply Inline Actions Braces removed wwei: Braces removed
		Op1 = N1;
		} else if (N1->getOpcode() == ISD::ADD \|\| N1->getOpcode() == ISD::SUB) {
		Op0 = N1;
		Op1 = N0;
		}

		if (!Op0 \|\| !Op0->hasOneUse())
		return SDValue();

		if (Op0->getOpcode() == ISD::ADD &&
		sdesmalenUnsubmitted Not Done Reply Inline Actions I'd suggest collapsing all conditions together into a single lambda, so that it makes the code structurally simpler (less control flow) and more readable (you can directly see the exact pattern being matched). It may also help handle the case where the N0 is an ADD/SUB which does not match the conditions that follow, where N1 is just `Y + 1`. e.g. (X+Z)(Y+1), isn't yet handled by your code. auto IsAddSubWith1= [](SDValue V) -> bool { unsigned Opc = V->getOpcode(); if ((Opc == ISD::ADD \|\| Opc == ISD::SUB) ) && V->hasOneUse()) { SDValue Opnd = Opc == ISD::ADD ? V->getOperand(1) : V->getOperand(0); if (auto C = dyn_cast<ConstantSDNode>(Opnd)) return C->isOne(); } return false; } if (IsAddSubWith1(N->getOperand(0)) { // Rewrite } if (IsAddSubWith1(N->getOperand(1))) { // Rewrite } sdesmalen:* I'd suggest collapsing all conditions together into a single lambda, so that it makes the code…
		wweiAuthorUnsubmitted Done Reply Inline Actions Thank you for your suggestion! I have refactored the code. wwei: Thank you for your suggestion! I have refactored the code.
		sdesmalenUnsubmitted Not Done Reply Inline Actions If you change `IsAddSubWith1` also return the other operand, the code for rewriting becomes a bit simpler and easier to follow. auto IsAddSubWith1 = [](SDValue V, SDValue &OtherOpnd) -> bool { ... }; SDValue OtherOpnd; if (IsAddSubWith1(N0, OtherOpnd)) { SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, OtherOpnd); return DAG.getNode(N0->getOpcode(), DL, VT, N1, MulVal); } sdesmalen: If you change `IsAddSubWith1` also return the other operand, the code for rewriting becomes a…
		wweiAuthorUnsubmitted Done Reply Inline Actions Done. wwei: Done.
		isa<ConstantSDNode>(Op0->getOperand(1))) {
		ConstantSDNode *C = cast<ConstantSDNode>(Op0->getOperand(1));
		if (C && C->isOne()) {
		SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, Op1, Op0->getOperand(0));
		SDValue Res = DAG.getNode(ISD::ADD, DL, VT, MulVal, Op1);
		return Res;
		}
		}

		if (Op0->getOpcode() == ISD::SUB &&
		isa<ConstantSDNode>(Op0->getOperand(0))) {
		ConstantSDNode *C = cast<ConstantSDNode>(Op0->getOperand(0));
		if (C && C->isOne()) {
		SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, Op1, Op0->getOperand(1));
		SDValue Res = DAG.getNode(ISD::SUB, DL, VT, Op1, MulVal);
		return Res;
		}
		}

		return SDValue();
		}

// The below optimizations require a constant RHS.		// The below optimizations require a constant RHS.
if (!isa<ConstantSDNode>(N->getOperand(1)))		if (!isa<ConstantSDNode>(N->getOperand(1)))
return SDValue();		return SDValue();

SDValue N0 = N->getOperand(0);
ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));		ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
		sdesmalenUnsubmitted Not Done Reply Inline Actions nit: this one can be replaced by N1. sdesmalen: nit: this one can be replaced by N1.
		wweiAuthorUnsubmitted Done Reply Inline Actions done wwei: done
const APInt &ConstValue = C->getAPIntValue();		const APInt &ConstValue = C->getAPIntValue();

// Allow the scaling to be folded into the `cnt` instruction by preventing		// Allow the scaling to be folded into the `cnt` instruction by preventing
// the scaling to be obscured here. This makes it easier to pattern match.		// the scaling to be obscured here. This makes it easier to pattern match.
if (IsSVECntIntrinsic(N0) \|\|		if (IsSVECntIntrinsic(N0) \|\|
(N0->getOpcode() == ISD::TRUNCATE &&		(N0->getOpcode() == ISD::TRUNCATE &&
(IsSVECntIntrinsic(N0->getOperand(0)))))		(IsSVECntIntrinsic(N0->getOperand(0)))))
		sdesmalenUnsubmitted Not Done Reply Inline Actions nit: this one can be replaced by N0. sdesmalen: nit: this one can be replaced by N0.
		wweiAuthorUnsubmitted Done Reply Inline Actions This one can't be replaced wwei: This one can't be replaced
if (ConstValue.sge(1) && ConstValue.sle(16))		if (ConstValue.sge(1) && ConstValue.sle(16))
return SDValue();		return SDValue();

// Multiplication of a power of two plus/minus one can be done more		// Multiplication of a power of two plus/minus one can be done more
// cheaply as as shift+add/sub. For now, this is true unilaterally. If		// cheaply as as shift+add/sub. For now, this is true unilaterally. If
// future CPUs have a cheaper MADD instruction, this may need to be		// future CPUs have a cheaper MADD instruction, this may need to be
// gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and		// gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
// 64-bit is 5 cycles, so this is always a win.		// 64-bit is 5 cycles, so this is always a win.
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	if (ConstValue.isNonNegative()) {
} else if (CVNegMinus1.isPowerOf2()) {		} else if (CVNegMinus1.isPowerOf2()) {
ShiftAmt = CVNegMinus1.logBase2();		ShiftAmt = CVNegMinus1.logBase2();
AddSubOpc = ISD::ADD;		AddSubOpc = ISD::ADD;
NegateResult = true;		NegateResult = true;
} else		} else
return SDValue();		return SDValue();
}		}

SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,		SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
DAG.getConstant(ShiftAmt, DL, MVT::i64));		DAG.getConstant(ShiftAmt, DL, MVT::i64));

SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;		SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;		SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);		SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
assert(!(NegateResult && TrailingZeroes) &&		assert(!(NegateResult && TrailingZeroes) &&
"NegateResult and TrailingZeroes cannot both be true for now.");		"NegateResult and TrailingZeroes cannot both be true for now.");
▲ Show 20 Lines • Show All 6,105 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/madd-combiner.ll

; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s \| FileCheck %s		; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s \| FileCheck %s
; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s \| FileCheck %s		; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s \| FileCheck %s
		; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s \| FileCheck %s -check-prefixes=CHECK-MADD-MSUB
		dmgreenUnsubmitted Not Done Reply Inline Actions Can you change these check lines to: ; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=CHECK,CHECK-ISEL ; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=CHECK,CHECK-FAST And pre-generate the check lines with update_llc_test_checks. dmgreen: Can you change these check lines to: ``` ; RUN: llc -mtriple=aarch64-apple-darwin…
		wweiAuthorUnsubmitted Done Reply Inline Actions updated, thanks wwei: updated, thanks

; Test that we use the correct register class.		; Test that we use the correct register class.
define i32 @mul_add_imm(i32 %a, i32 %b) {		define i32 @mul_add_imm(i32 %a, i32 %b) {
; CHECK-LABEL: mul_add_imm		; CHECK-LABEL: mul_add_imm
; CHECK: orr [[REG:w[0-9]+]], wzr, #0x4		; CHECK: orr [[REG:w[0-9]+]], wzr, #0x4
; CHECK-NEXT: madd {{w[0-9]+}}, w0, w1, [[REG]]		; CHECK-NEXT: madd {{w[0-9]+}}, w0, w1, [[REG]]
%1 = mul i32 %a, %b		%1 = mul i32 %a, %b
%2 = add i32 %1, 4		%2 = add i32 %1, 4
Show All 19 Lines	for.body8:
%0 = mul i64 undef, -3		%0 = mul i64 undef, -3
%mul1971 = add i64 %0, -3		%mul1971 = add i64 %0, -3
%cmp7 = icmp slt i64 %mul1971, 1390451930000		%cmp7 = icmp slt i64 %mul1971, 1390451930000
br i1 %cmp7, label %for.body8, label %for.end20		br i1 %cmp7, label %for.body8, label %for.end20
for.end20:		for.end20:
ret void		ret void
}		}

		define i32 @add1_mul_val1(i32 %a, i32 %b) {
		; CHECK-LABEL: add1_mul_val1
		; CHECK-MADD-MSUB: madd w0, w1, w0, w1
		%1 = add i32 %a, 1
		%2 = mul i32 %1, %b
		ret i32 %2
		}

		define i32 @add1_mul_val2(i32 %a, i32 %b) {
		; CHECK-LABEL: add1_mul_val2
		; CHECK-MADD-MSUB: madd w0, w0, w1, w0
		%1 = add i32 %b, 1
		%2 = mul i32 %a, %1
		ret i32 %2
		}

		define i64 @add1_mul_val3(i64 %a, i64 %b) {
		; CHECK-LABEL: add1_mul_val3
		; CHECK-MADD-MSUB: madd x0, x0, x1, x0
		%1 = add i64 %b, 1
		%2 = mul i64 %a, %1
		ret i64 %2
		}

		define i32 @sub1_mul_val1(i32 %a, i32 %b) {
		; CHECK-LABEL: sub1_mul_val1
		; CHECK-MADD-MSUB: msub w0, w1, w0, w1
		%1 = sub i32 1, %a
		%2 = mul i32 %1, %b
		ret i32 %2
		}

		define i32 @sub1_mul_val2(i32 %a, i32 %b) {
		; CHECK-LABEL: sub1_mul_val2
		; CHECK-MADD-MSUB: msub w0, w0, w1, w0
		%1 = sub i32 1, %b
		%2 = mul i32 %a, %1
		ret i32 %2
		}

		define i64 @sub1_mul_val3(i64 %a, i64 %b) {
		; CHECK-LABEL: sub1_mul_val3
		; CHECK-MADD-MSUB: msub x0, x0, x1, x0
		%1 = sub i64 1, %b
		%2 = mul i64 %a, %1
		ret i64 %2
		}

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Canonicalize X*(Y+1) or X*(1-Y) to madd/msubClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 379904

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/test/CodeGen/AArch64/madd-combiner.ll

[AArch64] Canonicalize X*(Y+1) or X*(1-Y) to madd/msub
ClosedPublic