This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/CodeGen/SelectionDAG/
-
CodeGen/
-
SelectionDAG/
-
TargetLowering.cpp
-
test/CodeGen/
-
CodeGen/
-
AArch64/
-
aarch64-dup-ext.ll
-
mul-cmp.ll
-
X86/
1
mul-cmp.ll

Differential D141086

[SDAG] try to avoid multiply for X*Y==0
ClosedPublic

Authored by spatel on Jan 5 2023, 1:22 PM.

Download Raw Diff

Details

Reviewers

dmgreen
RKSimon
goldstein.w.n
nikic

Commits

rGbf82070ea465: [SDAG] try to avoid multiply for X*Y==0

Summary

Forking this off from D140850 -
https://alive2.llvm.org/ce/z/TgBeK_
https://alive2.llvm.org/ce/z/STVD7d

We could almost justify doing this in IR, but consideration for minsize compiles requires that we only try it in codegen -- the transform is not reversible.

In all other cases, avoiding multiply should be a win because a mul is more expensive than simple/parallelizable compares. AArch64 even has a trick (assuming that's the correct asm) to keep instruction count even for some types.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

spatel created this revision. Jan 5 2023, 1:22 PM

Herald added a project: Restricted Project. · View Herald Transcript · Jan 5 2023, 1:22 PM

Herald added subscribers: pengfei, hiraditya, mcrosier. · View Herald Transcript

spatel requested review of this revision. Jan 5 2023, 1:22 PM

Herald added a project: Restricted Project. · View Herald Transcript · Jan 5 2023, 1:22 PM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Harbormaster completed remote builds in B205981: Diff 486655. Jan 5 2023, 1:23 PM

goldstein.w.n added inline comments. Jan 5 2023, 2:02 PM

llvm/test/CodeGen/X86/mul-cmp.ll
111	Wow nice!

The proof does not match the transform you are making (https://alive2.llvm.org/ce/z/TgBeK_, aka drop noundef)

LGTM

This revision is now accepted and ready to land. Jan 6 2023, 1:14 AM

Sounds OK. Thanks

I think we can save an extra instruction on the i8/i16 cases that use a tst if we fold an and/or into a csel. I'll see if I can put together a patch.

spatel edited the summary of this revision. (Show Details) Jan 6 2023, 4:20 AM

spatel edited the summary of this revision. (Show Details)

In D141086#4030021, @lebedev.ri wrote:

The proof does not match the transform you are making (https://alive2.llvm.org/ce/z/TgBeK_, aka drop noundef)

Thanks - updated links in the description. I had adapted an example with i32 and that was timing out, so I tacked on noundef, but forgot to adjust it later.

This revision was landed with ongoing or failed builds. Jan 6 2023, 6:07 AM

Closed by commit rGbf82070ea465: [SDAG] try to avoid multiply for X*Y==0 (authored by spatel). · Explain Why

This revision was automatically updated to reflect the committed changes.

spatel added a commit: rGbf82070ea465: [SDAG] try to avoid multiply for X*Y==0.

spatel mentioned this in D141119: [AArch64] Fold And/Or into CSel if possible. Jan 6 2023, 6:44 AM

Revision Contents

Path

Size

llvm/

lib/

CodeGen/

SelectionDAG/

TargetLowering.cpp

20 lines

test/

CodeGen/

AArch64/

aarch64-dup-ext.ll

10 lines

mul-cmp.ll

40 lines

X86/

mul-cmp.ll

93 lines

Diff 486850

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,201 Lines • ▼ Show 20 Lines
/// unable to simplify it, return a null SDValue.		/// unable to simplify it, return a null SDValue.
SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,		SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
ISD::CondCode Cond, bool foldBooleans,		ISD::CondCode Cond, bool foldBooleans,
DAGCombinerInfo &DCI,		DAGCombinerInfo &DCI,
const SDLoc &dl) const {		const SDLoc &dl) const {
SelectionDAG &DAG = DCI.DAG;		SelectionDAG &DAG = DCI.DAG;
const DataLayout &Layout = DAG.getDataLayout();		const DataLayout &Layout = DAG.getDataLayout();
EVT OpVT = N0.getValueType();		EVT OpVT = N0.getValueType();
		AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();

// Constant fold or commute setcc.		// Constant fold or commute setcc.
if (SDValue Fold = DAG.FoldSetCC(VT, N0, N1, Cond, dl))		if (SDValue Fold = DAG.FoldSetCC(VT, N0, N1, Cond, dl))
return Fold;		return Fold;

bool N0ConstOrSplat =		bool N0ConstOrSplat =
isConstOrConstSplat(N0, /AllowUndefs/ false, /AllowTruncate/ true);		isConstOrConstSplat(N0, /AllowUndefs/ false, /AllowTruncate/ true);
bool N1ConstOrSplat =		bool N1ConstOrSplat =
Show All 28 Lines	SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,

if (auto *N1C = isConstOrConstSplat(N1)) {		if (auto *N1C = isConstOrConstSplat(N1)) {
const APInt &C1 = N1C->getAPIntValue();		const APInt &C1 = N1C->getAPIntValue();

// Optimize some CTPOP cases.		// Optimize some CTPOP cases.
if (SDValue V = simplifySetCCWithCTPOP(*this, VT, N0, C1, Cond, dl, DAG))		if (SDValue V = simplifySetCCWithCTPOP(*this, VT, N0, C1, Cond, dl, DAG))
return V;		return V;

		// For equality to 0 of a no-wrap multiply, decompose and test each op:
		// X * Y == 0 --> (X == 0) \|\| (Y == 0)
		// X * Y != 0 --> (X != 0) && (Y != 0)
		// TODO: This bails out if minsize is set, but if the target doesn't have a
		// single instruction multiply for this type, it would likely be
		// smaller to decompose.
		if (C1.isZero() && (Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) &&
		N0.getOpcode() == ISD::MUL && N0.hasOneUse() &&
		(N0->getFlags().hasNoUnsignedWrap() \|\|
		N0->getFlags().hasNoSignedWrap()) &&
		!Attr.hasFnAttr(Attribute::MinSize)) {
		SDValue IsXZero = DAG.getSetCC(dl, VT, N0.getOperand(0), N1, Cond);
		SDValue IsYZero = DAG.getSetCC(dl, VT, N0.getOperand(1), N1, Cond);
		unsigned LogicOp = Cond == ISD::SETEQ ? ISD::OR : ISD::AND;
		return DAG.getNode(LogicOp, dl, VT, IsXZero, IsYZero);
		}

// If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an		// If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an
// equality comparison, then we're just comparing whether X itself is		// equality comparison, then we're just comparing whether X itself is
// zero.		// zero.
if (N0.getOpcode() == ISD::SRL && (C1.isZero() \|\| C1.isOne()) &&		if (N0.getOpcode() == ISD::SRL && (C1.isZero() \|\| C1.isOne()) &&
N0.getOperand(0).getOpcode() == ISD::CTLZ &&		N0.getOperand(0).getOpcode() == ISD::CTLZ &&
isPowerOf2_32(N0.getScalarValueSizeInBits())) {		isPowerOf2_32(N0.getScalarValueSizeInBits())) {
if (ConstantSDNode *ShAmt = isConstOrConstSplat(N0.getOperand(1))) {		if (ConstantSDNode *ShAmt = isConstOrConstSplat(N0.getOperand(1))) {
if ((Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) &&		if ((Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) &&
▲ Show 20 Lines • Show All 773 Lines • ▼ Show 20 Lines	if ((Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) &&

if (SDValue V = foldSetCCWithAnd(VT, N0, N1, Cond, dl, DCI))		if (SDValue V = foldSetCCWithAnd(VT, N0, N1, Cond, dl, DCI))
return V;		return V;
}		}

// Fold remainder of division by a constant.		// Fold remainder of division by a constant.
if ((N0.getOpcode() == ISD::UREM \|\| N0.getOpcode() == ISD::SREM) &&		if ((N0.getOpcode() == ISD::UREM \|\| N0.getOpcode() == ISD::SREM) &&
N0.hasOneUse() && (Cond == ISD::SETEQ \|\| Cond == ISD::SETNE)) {		N0.hasOneUse() && (Cond == ISD::SETEQ \|\| Cond == ISD::SETNE)) {
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();

// When division is cheap or optimizing for minimum size,		// When division is cheap or optimizing for minimum size,
// fall through to DIVREM creation by skipping this fold.		// fall through to DIVREM creation by skipping this fold.
if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttr(Attribute::MinSize)) {		if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttr(Attribute::MinSize)) {
if (N0.getOpcode() == ISD::UREM) {		if (N0.getOpcode() == ISD::UREM) {
if (SDValue Folded = buildUREMEqFold(VT, N0, N1, Cond, DCI, dl))		if (SDValue Folded = buildUREMEqFold(VT, N0, N1, Cond, DCI, dl))
return Folded;		return Folded;
} else if (N0.getOpcode() == ISD::SREM) {		} else if (N0.getOpcode() == ISD::SREM) {
if (SDValue Folded = buildSREMEqFold(VT, N0, N1, Cond, DCI, dl))		if (SDValue Folded = buildSREMEqFold(VT, N0, N1, Cond, DCI, dl))
▲ Show 20 Lines • Show All 5,463 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll

	Show First 20 Lines • Show All 185 Lines • ▼ Show 20 Lines
	}			}

	define void @typei1_orig(i64 %a, ptr %p, ptr %q) {			define void @typei1_orig(i64 %a, ptr %p, ptr %q) {
	; CHECK-LABEL: typei1_orig:			; CHECK-LABEL: typei1_orig:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: cmp x0, #0			; CHECK-NEXT: cmp x0, #0
	; CHECK-NEXT: ldr q0, [x2]			; CHECK-NEXT: ldr q0, [x2]
	; CHECK-NEXT: cset w8, gt			; CHECK-NEXT: cset w8, gt
	; CHECK-NEXT: neg v0.8h, v0.8h			; CHECK-NEXT: movi v2.2d, #0000000000000000
	; CHECK-NEXT: dup v1.8h, w8
	; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
	; CHECK-NEXT: movi v1.2d, #0000000000000000
	; CHECK-NEXT: cmtst v0.8h, v0.8h, v0.8h			; CHECK-NEXT: cmtst v0.8h, v0.8h, v0.8h
				; CHECK-NEXT: dup v1.8h, w8
				; CHECK-NEXT: cmeq v1.8h, v1.8h, #0
				; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b
	; CHECK-NEXT: xtn v0.8b, v0.8h			; CHECK-NEXT: xtn v0.8b, v0.8h
	; CHECK-NEXT: mov v0.d[1], v1.d[0]			; CHECK-NEXT: mov v0.d[1], v2.d[0]
	; CHECK-NEXT: str q0, [x1]			; CHECK-NEXT: str q0, [x1]
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%tmp = xor <16 x i1> zeroinitializer, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>			%tmp = xor <16 x i1> zeroinitializer, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
	%tmp6 = load <8 x i16>, ptr %q, align 2			%tmp6 = load <8 x i16>, ptr %q, align 2
	%tmp7 = sub <8 x i16> zeroinitializer, %tmp6			%tmp7 = sub <8 x i16> zeroinitializer, %tmp6
	%tmp8 = shufflevector <8 x i16> %tmp7, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>			%tmp8 = shufflevector <8 x i16> %tmp7, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
	%tmp9 = icmp slt i64 0, %a			%tmp9 = icmp slt i64 0, %a
	%tmp10 = zext i1 %tmp9 to i16			%tmp10 = zext i1 %tmp9 to i16
	▲ Show 20 Lines • Show All 130 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/mul-cmp.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=aarch64-- \| FileCheck %s			; RUN: llc < %s -mtriple=aarch64-- \| FileCheck %s

				; With no-wrap:
				; (X * Y) == 0 --> (X == 0) \|\| (Y == 0)
				; (X * Y) != 0 --> (X != 0) && (Y != 0)

	define i1 @mul_nsw_eq0_i8(i8 %x, i8 %y) {			define i1 @mul_nsw_eq0_i8(i8 %x, i8 %y) {
	; CHECK-LABEL: mul_nsw_eq0_i8:			; CHECK-LABEL: mul_nsw_eq0_i8:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: mul w8, w0, w1			; CHECK-NEXT: tst w1, #0xff
	; CHECK-NEXT: tst w8, #0xff			; CHECK-NEXT: cset w8, eq
	; CHECK-NEXT: cset w0, eq			; CHECK-NEXT: tst w0, #0xff
				; CHECK-NEXT: cset w9, eq
				; CHECK-NEXT: orr w0, w9, w8
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%m = mul nsw i8 %x, %y			%m = mul nsw i8 %x, %y
	%r = icmp eq i8 %m, 0			%r = icmp eq i8 %m, 0
	ret i1 %r			ret i1 %r
	}			}

				; negative test - not valid if mul can overflow

	define i1 @mul_eq0_i8(i8 %x, i8 %y) {			define i1 @mul_eq0_i8(i8 %x, i8 %y) {
	; CHECK-LABEL: mul_eq0_i8:			; CHECK-LABEL: mul_eq0_i8:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: mul w8, w0, w1			; CHECK-NEXT: mul w8, w0, w1
	; CHECK-NEXT: tst w8, #0xff			; CHECK-NEXT: tst w8, #0xff
	; CHECK-NEXT: cset w0, eq			; CHECK-NEXT: cset w0, eq
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%m = mul i8 %x, %y			%m = mul i8 %x, %y
	%r = icmp eq i8 %m, 0			%r = icmp eq i8 %m, 0
	ret i1 %r			ret i1 %r
	}			}

				; negative test - don't try with minsize

	define i1 @mul_nsw_eq0_i8_size(i8 %x, i8 %y) minsize {			define i1 @mul_nsw_eq0_i8_size(i8 %x, i8 %y) minsize {
	; CHECK-LABEL: mul_nsw_eq0_i8_size:			; CHECK-LABEL: mul_nsw_eq0_i8_size:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: mul w8, w0, w1			; CHECK-NEXT: mul w8, w0, w1
	; CHECK-NEXT: tst w8, #0xff			; CHECK-NEXT: tst w8, #0xff
	; CHECK-NEXT: cset w0, eq			; CHECK-NEXT: cset w0, eq
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%m = mul nsw i8 %x, %y			%m = mul nsw i8 %x, %y
	%r = icmp eq i8 %m, 0			%r = icmp eq i8 %m, 0
	ret i1 %r			ret i1 %r
	}			}

	define i1 @mul_nsw_ne0_i16(i16 %x, i16 %y) {			define i1 @mul_nsw_ne0_i16(i16 %x, i16 %y) {
	; CHECK-LABEL: mul_nsw_ne0_i16:			; CHECK-LABEL: mul_nsw_ne0_i16:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: mul w8, w0, w1			; CHECK-NEXT: tst w1, #0xffff
	; CHECK-NEXT: tst w8, #0xffff			; CHECK-NEXT: cset w8, ne
	; CHECK-NEXT: cset w0, ne			; CHECK-NEXT: tst w0, #0xffff
				; CHECK-NEXT: cset w9, ne
				; CHECK-NEXT: and w0, w9, w8
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%m = mul nsw i16 %x, %y			%m = mul nsw i16 %x, %y
	%r = icmp ne i16 %m, 0			%r = icmp ne i16 %m, 0
	ret i1 %r			ret i1 %r
	}			}

	define i1 @mul_nuw_eq0_i32(i32 %x, i32 %y) {			define i1 @mul_nuw_eq0_i32(i32 %x, i32 %y) {
	; CHECK-LABEL: mul_nuw_eq0_i32:			; CHECK-LABEL: mul_nuw_eq0_i32:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: mul w8, w0, w1			; CHECK-NEXT: cmp w0, #0
	; CHECK-NEXT: cmp w8, #0			; CHECK-NEXT: ccmp w1, #0, #4, ne
	; CHECK-NEXT: cset w0, eq			; CHECK-NEXT: cset w0, eq
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%m = mul nuw i32 %x, %y			%m = mul nuw i32 %x, %y
	%r = icmp eq i32 %m, 0			%r = icmp eq i32 %m, 0
	ret i1 %r			ret i1 %r
	}			}

	define i1 @mul_nsw_nuw_ne0_i64(i64 %x, i64 %y) {			define i1 @mul_nsw_nuw_ne0_i64(i64 %x, i64 %y) {
	; CHECK-LABEL: mul_nsw_nuw_ne0_i64:			; CHECK-LABEL: mul_nsw_nuw_ne0_i64:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: mul x8, x0, x1			; CHECK-NEXT: cmp x0, #0
	; CHECK-NEXT: cmp x8, #0			; CHECK-NEXT: ccmp x1, #0, #4, ne
	; CHECK-NEXT: cset w0, ne			; CHECK-NEXT: cset w0, ne
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%m = mul nsw nuw i64 %x, %y			%m = mul nsw nuw i64 %x, %y
	%r = icmp ne i64 %m, 0			%r = icmp ne i64 %m, 0
	ret i1 %r			ret i1 %r
	}			}

	define <16 x i1> @mul_nuw_eq0_v16i8(<16 x i8> %x, <16 x i8> %y) {			define <16 x i1> @mul_nuw_eq0_v16i8(<16 x i8> %x, <16 x i8> %y) {
	; CHECK-LABEL: mul_nuw_eq0_v16i8:			; CHECK-LABEL: mul_nuw_eq0_v16i8:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: mul v0.16b, v0.16b, v1.16b			; CHECK-NEXT: cmeq v1.16b, v1.16b, #0
	; CHECK-NEXT: cmeq v0.16b, v0.16b, #0			; CHECK-NEXT: cmeq v0.16b, v0.16b, #0
				; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%m = mul nuw <16 x i8> %x, %y			%m = mul nuw <16 x i8> %x, %y
	%r = icmp eq <16 x i8> %m, zeroinitializer			%r = icmp eq <16 x i8> %m, zeroinitializer
	ret <16 x i1> %r			ret <16 x i1> %r
	}			}

	define <4 x i1> @mul_nsw_ne0_v4i32(<4 x i32> %x, <4 x i32> %y) {			define <4 x i1> @mul_nsw_ne0_v4i32(<4 x i32> %x, <4 x i32> %y) {
	; CHECK-LABEL: mul_nsw_ne0_v4i32:			; CHECK-LABEL: mul_nsw_ne0_v4i32:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
	; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s			; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s
				; CHECK-NEXT: cmeq v1.4s, v1.4s, #0
				; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b
	; CHECK-NEXT: xtn v0.4h, v0.4s			; CHECK-NEXT: xtn v0.4h, v0.4s
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%m = mul nsw <4 x i32> %x, %y			%m = mul nsw <4 x i32> %x, %y
	%r = icmp ne <4 x i32> %m, zeroinitializer			%r = icmp ne <4 x i32> %m, zeroinitializer
	ret <4 x i1> %r			ret <4 x i1> %r
	}			}

				; negative test - don't try with minsize

	define <4 x i1> @mul_nsw_ne0_v4i32_size(<4 x i32> %x, <4 x i32> %y) minsize {			define <4 x i1> @mul_nsw_ne0_v4i32_size(<4 x i32> %x, <4 x i32> %y) minsize {
	; CHECK-LABEL: mul_nsw_ne0_v4i32_size:			; CHECK-LABEL: mul_nsw_ne0_v4i32_size:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s			; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
	; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s			; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s
	; CHECK-NEXT: xtn v0.4h, v0.4s			; CHECK-NEXT: xtn v0.4h, v0.4s
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%m = mul nsw <4 x i32> %x, %y			%m = mul nsw <4 x i32> %x, %y
	%r = icmp ne <4 x i32> %m, zeroinitializer			%r = icmp ne <4 x i32> %m, zeroinitializer
	ret <4 x i1> %r			ret <4 x i1> %r
	}			}

llvm/test/CodeGen/X86/mul-cmp.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-- -mattr=sse \| FileCheck %s --check-prefixes=CHECK,SSE			; RUN: llc < %s -mtriple=x86_64-- -mattr=sse \| FileCheck %s --check-prefixes=CHECK,SSE
	; RUN: llc < %s -mtriple=x86_64-- -mattr=avx \| FileCheck %s --check-prefixes=CHECK,AVX			; RUN: llc < %s -mtriple=x86_64-- -mattr=avx \| FileCheck %s --check-prefixes=CHECK,AVX

				; With no-wrap:
				; (X * Y) == 0 --> (X == 0) \|\| (Y == 0)
				; (X * Y) != 0 --> (X != 0) && (Y != 0)

	define i1 @mul_nsw_eq0_i8(i8 %x, i8 %y) {			define i1 @mul_nsw_eq0_i8(i8 %x, i8 %y) {
	; CHECK-LABEL: mul_nsw_eq0_i8:			; CHECK-LABEL: mul_nsw_eq0_i8:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: movl %edi, %eax			; CHECK-NEXT: testb %sil, %sil
	; CHECK-NEXT: # kill: def $al killed $al killed $eax			; CHECK-NEXT: sete %cl
	; CHECK-NEXT: mulb %sil			; CHECK-NEXT: testb %dil, %dil
	; CHECK-NEXT: testb %al, %al
	; CHECK-NEXT: sete %al			; CHECK-NEXT: sete %al
				; CHECK-NEXT: orb %cl, %al
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%m = mul nsw i8 %x, %y			%m = mul nsw i8 %x, %y
	%r = icmp eq i8 %m, 0			%r = icmp eq i8 %m, 0
	ret i1 %r			ret i1 %r
	}			}

				; negative test - not valid if mul can overflow

	define i1 @mul_eq0_i8(i8 %x, i8 %y) {			define i1 @mul_eq0_i8(i8 %x, i8 %y) {
	; CHECK-LABEL: mul_eq0_i8:			; CHECK-LABEL: mul_eq0_i8:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: movl %edi, %eax			; CHECK-NEXT: movl %edi, %eax
	; CHECK-NEXT: # kill: def $al killed $al killed $eax			; CHECK-NEXT: # kill: def $al killed $al killed $eax
	; CHECK-NEXT: mulb %sil			; CHECK-NEXT: mulb %sil
	; CHECK-NEXT: testb %al, %al			; CHECK-NEXT: testb %al, %al
	; CHECK-NEXT: sete %al			; CHECK-NEXT: sete %al
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%m = mul i8 %x, %y			%m = mul i8 %x, %y
	%r = icmp eq i8 %m, 0			%r = icmp eq i8 %m, 0
	ret i1 %r			ret i1 %r
	}			}

				; negative test - don't try with minsize

	define i1 @mul_nsw_eq0_i8_size(i8 %x, i8 %y) minsize {			define i1 @mul_nsw_eq0_i8_size(i8 %x, i8 %y) minsize {
	; CHECK-LABEL: mul_nsw_eq0_i8_size:			; CHECK-LABEL: mul_nsw_eq0_i8_size:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: movl %edi, %eax			; CHECK-NEXT: movl %edi, %eax
	; CHECK-NEXT: # kill: def $al killed $al killed $eax			; CHECK-NEXT: # kill: def $al killed $al killed $eax
	; CHECK-NEXT: mulb %sil			; CHECK-NEXT: mulb %sil
	; CHECK-NEXT: testb %al, %al			; CHECK-NEXT: testb %al, %al
	; CHECK-NEXT: sete %al			; CHECK-NEXT: sete %al
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%m = mul nsw i8 %x, %y			%m = mul nsw i8 %x, %y
	%r = icmp eq i8 %m, 0			%r = icmp eq i8 %m, 0
	ret i1 %r			ret i1 %r
	}			}

	define i1 @mul_nsw_ne0_i16(i16 %x, i16 %y) {			define i1 @mul_nsw_ne0_i16(i16 %x, i16 %y) {
	; CHECK-LABEL: mul_nsw_ne0_i16:			; CHECK-LABEL: mul_nsw_ne0_i16:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: imull %esi, %edi			; CHECK-NEXT: testw %si, %si
				; CHECK-NEXT: setne %cl
	; CHECK-NEXT: testw %di, %di			; CHECK-NEXT: testw %di, %di
	; CHECK-NEXT: setne %al			; CHECK-NEXT: setne %al
				; CHECK-NEXT: andb %cl, %al
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%m = mul nsw i16 %x, %y			%m = mul nsw i16 %x, %y
	%r = icmp ne i16 %m, 0			%r = icmp ne i16 %m, 0
	ret i1 %r			ret i1 %r
	}			}

	define i1 @mul_nuw_eq0_i32(i32 %x, i32 %y) {			define i1 @mul_nuw_eq0_i32(i32 %x, i32 %y) {
	; CHECK-LABEL: mul_nuw_eq0_i32:			; CHECK-LABEL: mul_nuw_eq0_i32:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: imull %esi, %edi			; CHECK-NEXT: testl %esi, %esi
				; CHECK-NEXT: sete %cl
	; CHECK-NEXT: testl %edi, %edi			; CHECK-NEXT: testl %edi, %edi
	; CHECK-NEXT: sete %al			; CHECK-NEXT: sete %al
				; CHECK-NEXT: orb %cl, %al
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%m = mul nuw i32 %x, %y			%m = mul nuw i32 %x, %y
	%r = icmp eq i32 %m, 0			%r = icmp eq i32 %m, 0
	ret i1 %r			ret i1 %r
	}			}

	define i1 @mul_nsw_nuw_ne0_i64(i64 %x, i64 %y) {			define i1 @mul_nsw_nuw_ne0_i64(i64 %x, i64 %y) {
	; CHECK-LABEL: mul_nsw_nuw_ne0_i64:			; CHECK-LABEL: mul_nsw_nuw_ne0_i64:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: imulq %rsi, %rdi			; CHECK-NEXT: testq %rsi, %rsi
				; CHECK-NEXT: setne %cl
	; CHECK-NEXT: testq %rdi, %rdi			; CHECK-NEXT: testq %rdi, %rdi
	; CHECK-NEXT: setne %al			; CHECK-NEXT: setne %al
				; CHECK-NEXT: andb %cl, %al
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%m = mul nsw nuw i64 %x, %y			%m = mul nsw nuw i64 %x, %y
	%r = icmp ne i64 %m, 0			%r = icmp ne i64 %m, 0
	ret i1 %r			ret i1 %r
	}			}

	define <16 x i1> @mul_nuw_eq0_v16i8(<16 x i8> %x, <16 x i8> %y) {			define <16 x i1> @mul_nuw_eq0_v16i8(<16 x i8> %x, <16 x i8> %y) {
	; SSE-LABEL: mul_nuw_eq0_v16i8:			; SSE-LABEL: mul_nuw_eq0_v16i8:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movdqa %xmm1, %xmm2			; SSE-NEXT: pxor %xmm2, %xmm2
	; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]			; SSE-NEXT: pcmpeqb %xmm2, %xmm1
	; SSE-NEXT: movdqa %xmm0, %xmm3			; SSE-NEXT: pcmpeqb %xmm2, %xmm0
	; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]			; SSE-NEXT: por %xmm1, %xmm0
	; SSE-NEXT: pmullw %xmm2, %xmm3
	; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
	; SSE-NEXT: pand %xmm2, %xmm3
	; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
	; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
	; SSE-NEXT: pmullw %xmm1, %xmm0
	; SSE-NEXT: pand %xmm2, %xmm0
	; SSE-NEXT: packuswb %xmm3, %xmm0
	; SSE-NEXT: pxor %xmm1, %xmm1
	; SSE-NEXT: pcmpeqb %xmm1, %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: mul_nuw_eq0_v16i8:			; AVX-LABEL: mul_nuw_eq0_v16i8:
	; AVX: # %bb.0:			; AVX: # %bb.0:
	; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]			; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]			; AVX-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
	; AVX-NEXT: vpmullw %xmm2, %xmm3, %xmm2			; AVX-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
	; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]			; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
				[Inline comment] goldstein.w.n: Wow nice!
	; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2
	; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
	; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
	; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
	; AVX-NEXT: vpand %xmm3, %xmm0, %xmm0
	; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
	; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%m = mul nuw <16 x i8> %x, %y			%m = mul nuw <16 x i8> %x, %y
	%r = icmp eq <16 x i8> %m, zeroinitializer			%r = icmp eq <16 x i8> %m, zeroinitializer
	ret <16 x i1> %r			ret <16 x i1> %r
	}			}

	define <4 x i1> @mul_nsw_ne0_v4i32(<4 x i32> %x, <4 x i32> %y) {			define <4 x i1> @mul_nsw_ne0_v4i32(<4 x i32> %x, <4 x i32> %y) {
	; SSE-LABEL: mul_nsw_ne0_v4i32:			; SSE-LABEL: mul_nsw_ne0_v4i32:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]			; SSE-NEXT: pxor %xmm2, %xmm2
	; SSE-NEXT: pmuludq %xmm1, %xmm0			; SSE-NEXT: pcmpeqd %xmm2, %xmm0
	; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]			; SSE-NEXT: pcmpeqd %xmm2, %xmm1
	; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]			; SSE-NEXT: pcmpeqd %xmm2, %xmm2
	; SSE-NEXT: pmuludq %xmm2, %xmm1			; SSE-NEXT: pxor %xmm1, %xmm2
	; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]			; SSE-NEXT: pandn %xmm2, %xmm0
	; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	; SSE-NEXT: pxor %xmm1, %xmm1
	; SSE-NEXT: pcmpeqd %xmm0, %xmm1
	; SSE-NEXT: pcmpeqd %xmm0, %xmm0
	; SSE-NEXT: pxor %xmm1, %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: mul_nsw_ne0_v4i32:			; AVX-LABEL: mul_nsw_ne0_v4i32:
	; AVX: # %bb.0:			; AVX: # %bb.0:
	; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0			; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1			; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
	; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0			; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
	; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1			; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
	; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0			; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
				; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%m = mul nsw <4 x i32> %x, %y			%m = mul nsw <4 x i32> %x, %y
	%r = icmp ne <4 x i32> %m, zeroinitializer			%r = icmp ne <4 x i32> %m, zeroinitializer
	ret <4 x i1> %r			ret <4 x i1> %r
	}			}

				; negative test - don't try with minsize
				; TODO: SSE would be much smaller if decomposed.

	define <4 x i1> @mul_nsw_ne0_v4i32_size(<4 x i32> %x, <4 x i32> %y) minsize {			define <4 x i1> @mul_nsw_ne0_v4i32_size(<4 x i32> %x, <4 x i32> %y) minsize {
	; SSE-LABEL: mul_nsw_ne0_v4i32_size:			; SSE-LABEL: mul_nsw_ne0_v4i32_size:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]			; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
	; SSE-NEXT: pmuludq %xmm1, %xmm0			; SSE-NEXT: pmuludq %xmm1, %xmm0
	; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]			; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
	; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]			; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
	; SSE-NEXT: pmuludq %xmm2, %xmm1			; SSE-NEXT: pmuludq %xmm2, %xmm1
	Show All 20 Lines