diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4207,6 +4207,7 @@
   SelectionDAG &DAG = DCI.DAG;
   const DataLayout &Layout = DAG.getDataLayout();
   EVT OpVT = N0.getValueType();
+  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
 
   // Constant fold or commute setcc.
   if (SDValue Fold = DAG.FoldSetCC(VT, N0, N1, Cond, dl))
@@ -4251,6 +4252,23 @@
     if (SDValue V = simplifySetCCWithCTPOP(*this, VT, N0, C1, Cond, dl, DAG))
       return V;
 
+    // For equality to 0 of a no-wrap multiply, decompose and test each op:
+    // X * Y == 0 --> (X == 0) || (Y == 0)
+    // X * Y != 0 --> (X != 0) && (Y != 0)
+    // TODO: This bails out if minsize is set, but if the target doesn't have a
+    //       single instruction multiply for this type, it would likely be
+    //       smaller to decompose.
+    if (C1.isZero() && (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+        N0.getOpcode() == ISD::MUL && N0.hasOneUse() &&
+        (N0->getFlags().hasNoUnsignedWrap() ||
+         N0->getFlags().hasNoSignedWrap()) &&
+        !Attr.hasFnAttr(Attribute::MinSize)) {
+      SDValue IsXZero = DAG.getSetCC(dl, VT, N0.getOperand(0), N1, Cond);
+      SDValue IsYZero = DAG.getSetCC(dl, VT, N0.getOperand(1), N1, Cond);
+      unsigned LogicOp = Cond == ISD::SETEQ ? ISD::OR : ISD::AND;
+      return DAG.getNode(LogicOp, dl, VT, IsXZero, IsYZero);
+    }
+
     // If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an
     // equality comparison, then we're just comparing whether X itself is
     // zero.
@@ -5040,8 +5058,6 @@
   // Fold remainder of division by a constant.
   if ((N0.getOpcode() == ISD::UREM || N0.getOpcode() == ISD::SREM) &&
       N0.hasOneUse() && (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
-    AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
-
     // When division is cheap or optimizing for minimum size,
     // fall through to DIVREM creation by skipping this fold.
     if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttr(Attribute::MinSize)) {
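Why the fold requires the no-wrap flags: with a wrapping multiply, a product of
two nonzero operands can still be zero, so the decomposed compare would give the
wrong answer. A minimal standalone C++ sketch of the counterexample
(illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // In 8-bit arithmetic, 16 * 16 = 256 wraps to 0 even though neither
      // operand is zero, so X*Y == 0 does not imply (X == 0 || Y == 0).
      std::uint8_t x = 16, y = 16;
      std::uint8_t product = static_cast<std::uint8_t>(x * y);
      assert(product == 0);     // the wrapped product is zero
      assert(x != 0 && y != 0); // yet the decomposed test would say "nonzero"
      return 0;
    }

This is the case the mul_eq0_i8 tests below lock down: without nsw/nuw, the
multiply must stay.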
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -191,13 +191,13 @@
 ; CHECK-NEXT:    cmp x0, #0
 ; CHECK-NEXT:    ldr q0, [x2]
 ; CHECK-NEXT:    cset w8, gt
-; CHECK-NEXT:    neg v0.8h, v0.8h
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    cmtst v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    cmeq v1.8h, v1.8h, #0
+; CHECK-NEXT:    bic v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    mov v0.d[1], v2.d[0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
   %tmp = xor <16 x i1> zeroinitializer,
diff --git a/llvm/test/CodeGen/AArch64/mul-cmp.ll b/llvm/test/CodeGen/AArch64/mul-cmp.ll
--- a/llvm/test/CodeGen/AArch64/mul-cmp.ll
+++ b/llvm/test/CodeGen/AArch64/mul-cmp.ll
@@ -1,18 +1,26 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
 
+; With no-wrap:
+; (X * Y) == 0 --> (X == 0) || (Y == 0)
+; (X * Y) != 0 --> (X != 0) && (Y != 0)
+
 define i1 @mul_nsw_eq0_i8(i8 %x, i8 %y) {
 ; CHECK-LABEL: mul_nsw_eq0_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w8, w0, w1
-; CHECK-NEXT:    tst w8, #0xff
-; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    tst w1, #0xff
+; CHECK-NEXT:    cset w8, eq
+; CHECK-NEXT:    tst w0, #0xff
+; CHECK-NEXT:    cset w9, eq
+; CHECK-NEXT:    orr w0, w9, w8
 ; CHECK-NEXT:    ret
   %m = mul nsw i8 %x, %y
   %r = icmp eq i8 %m, 0
   ret i1 %r
 }
 
+; negative test - not valid if mul can overflow
+
 define i1 @mul_eq0_i8(i8 %x, i8 %y) {
 ; CHECK-LABEL: mul_eq0_i8:
 ; CHECK:       // %bb.0:
@@ -25,6 +33,8 @@
   ret i1 %r
 }
 
+; negative test - don't try with minsize
+
 define i1 @mul_nsw_eq0_i8_size(i8 %x, i8 %y) minsize {
 ; CHECK-LABEL: mul_nsw_eq0_i8_size:
 ; CHECK:       // %bb.0:
@@ -40,9 +50,11 @@
 define i1 @mul_nsw_ne0_i16(i16 %x, i16 %y) {
 ; CHECK-LABEL: mul_nsw_ne0_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w8, w0, w1
-; CHECK-NEXT:    tst w8, #0xffff
-; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    tst w1, #0xffff
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    tst w0, #0xffff
+; CHECK-NEXT:    cset w9, ne
+; CHECK-NEXT:    and w0, w9, w8
 ; CHECK-NEXT:    ret
   %m = mul nsw i16 %x, %y
   %r = icmp ne i16 %m, 0
@@ -52,8 +64,8 @@
 define i1 @mul_nuw_eq0_i32(i32 %x, i32 %y) {
 ; CHECK-LABEL: mul_nuw_eq0_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w8, w0, w1
-; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    ccmp w1, #0, #4, ne
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %m = mul nuw i32 %x, %y
@@ -64,8 +76,8 @@
 define i1 @mul_nsw_nuw_ne0_i64(i64 %x, i64 %y) {
 ; CHECK-LABEL: mul_nsw_nuw_ne0_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul x8, x0, x1
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    cmp x0, #0
+; CHECK-NEXT:    ccmp x1, #0, #4, ne
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %m = mul nsw nuw i64 %x, %y
@@ -76,8 +88,9 @@
 define <16 x i1> @mul_nuw_eq0_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: mul_nuw_eq0_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    cmeq v1.16b, v1.16b, #0
 ; CHECK-NEXT:    cmeq v0.16b, v0.16b, #0
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %m = mul nuw <16 x i8> %x, %y
   %r = icmp eq <16 x i8> %m, zeroinitializer
@@ -87,8 +100,9 @@
 define <4 x i1> @mul_nsw_ne0_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: mul_nsw_ne0_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    cmtst v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    cmeq v1.4s, v1.4s, #0
+; CHECK-NEXT:    bic v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
   %m = mul nsw <4 x i32> %x, %y
@@ -96,6 +110,8 @@
   ret <4 x i1> %r
 }
 
+; negative test - don't try with minsize
+
 define <4 x i1> @mul_nsw_ne0_v4i32_size(<4 x i32> %x, <4 x i32> %y) minsize {
 ; CHECK-LABEL: mul_nsw_ne0_v4i32_size:
 ; CHECK:       // %bb.0:
diff --git a/llvm/test/CodeGen/X86/mul-cmp.ll b/llvm/test/CodeGen/X86/mul-cmp.ll
--- a/llvm/test/CodeGen/X86/mul-cmp.ll
+++ b/llvm/test/CodeGen/X86/mul-cmp.ll
@@ -2,20 +2,26 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=sse | FileCheck %s --check-prefixes=CHECK,SSE
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=avx | FileCheck %s --check-prefixes=CHECK,AVX
 
+; With no-wrap:
+; (X * Y) == 0 --> (X == 0) || (Y == 0)
+; (X * Y) != 0 --> (X != 0) && (Y != 0)
+
 define i1 @mul_nsw_eq0_i8(i8 %x, i8 %y) {
 ; CHECK-LABEL: mul_nsw_eq0_i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    mulb %sil
-; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    testb %sil, %sil
+; CHECK-NEXT:    sete %cl
+; CHECK-NEXT:    testb %dil, %dil
 ; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    orb %cl, %al
 ; CHECK-NEXT:    retq
   %m = mul nsw i8 %x, %y
   %r = icmp eq i8 %m, 0
   ret i1 %r
 }
 
+; negative test - not valid if mul can overflow
+
 define i1 @mul_eq0_i8(i8 %x, i8 %y) {
 ; CHECK-LABEL: mul_eq0_i8:
 ; CHECK:       # %bb.0:
@@ -30,6 +36,8 @@
   ret i1 %r
 }
 
+; negative test - don't try with minsize
+
 define i1 @mul_nsw_eq0_i8_size(i8 %x, i8 %y) minsize {
 ; CHECK-LABEL: mul_nsw_eq0_i8_size:
 ; CHECK:       # %bb.0:
@@ -47,9 +55,11 @@
 define i1 @mul_nsw_ne0_i16(i16 %x, i16 %y) {
 ; CHECK-LABEL: mul_nsw_ne0_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    imull %esi, %edi
+; CHECK-NEXT:    testw %si, %si
+; CHECK-NEXT:    setne %cl
 ; CHECK-NEXT:    testw %di, %di
 ; CHECK-NEXT:    setne %al
+; CHECK-NEXT:    andb %cl, %al
 ; CHECK-NEXT:    retq
   %m = mul nsw i16 %x, %y
   %r = icmp ne i16 %m, 0
@@ -59,9 +69,11 @@
 define i1 @mul_nuw_eq0_i32(i32 %x, i32 %y) {
 ; CHECK-LABEL: mul_nuw_eq0_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    imull %esi, %edi
+; CHECK-NEXT:    testl %esi, %esi
+; CHECK-NEXT:    sete %cl
 ; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    orb %cl, %al
 ; CHECK-NEXT:    retq
   %m = mul nuw i32 %x, %y
   %r = icmp eq i32 %m, 0
@@ -71,9 +83,11 @@
 define i1 @mul_nsw_nuw_ne0_i64(i64 %x, i64 %y) {
 ; CHECK-LABEL: mul_nsw_nuw_ne0_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    imulq %rsi, %rdi
+; CHECK-NEXT:    testq %rsi, %rsi
+; CHECK-NEXT:    setne %cl
 ; CHECK-NEXT:    testq %rdi, %rdi
 ; CHECK-NEXT:    setne %al
+; CHECK-NEXT:    andb %cl, %al
 ; CHECK-NEXT:    retq
   %m = mul nsw nuw i64 %x, %y
   %r = icmp ne i64 %m, 0
@@ -83,36 +97,18 @@
 define <16 x i1> @mul_nuw_eq0_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; SSE-LABEL: mul_nuw_eq0_v16i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE-NEXT:    pmullw %xmm2, %xmm3
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT:    pand %xmm2, %xmm3
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT:    pmullw %xmm1, %xmm0
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    packuswb %xmm3, %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
+; SSE-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: mul_nuw_eq0_v16i8:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %m = mul nuw <16 x i8> %x, %y
   %r = icmp eq <16 x i8> %m, zeroinitializer
@@ -122,32 +118,31 @@
 define <4 x i1> @mul_nsw_ne0_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; SSE-LABEL: mul_nsw_ne0_v4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE-NEXT:    pmuludq %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT:    pmuludq %xmm2, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm2
+; SSE-NEXT:    pandn %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: mul_nsw_ne0_v4i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %m = mul nsw <4 x i32> %x, %y
   %r = icmp ne <4 x i32> %m, zeroinitializer
   ret <4 x i1> %r
 }
 
+; negative test - don't try with minsize
+; TODO: SSE would be much smaller if decomposed.
+
 define <4 x i1> @mul_nsw_ne0_v4i32_size(<4 x i32> %x, <4 x i32> %y) minsize {
 ; SSE-LABEL: mul_nsw_ne0_v4i32_size:
 ; SSE:       # %bb.0:
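For reference, this is the kind of source-level pattern that produces the
no-wrap multiply in the first place: Clang tags signed integer multiplication
with nsw because signed overflow is undefined behavior. A hypothetical example
(the function name is illustrative, and this assumes the mul+icmp shape
survives the middle end unchanged):

    // x * y == 0 compiles to 'mul nsw i32' + 'icmp eq i32 %m, 0', which this
    // patch selects as two compares joined by a logic op (e.g. cmp+ccmp+cset
    // on AArch64, as in mul_nuw_eq0_i32 above) instead of mul+cmp.
    bool product_is_zero(int x, int y) {
      return x * y == 0;
    }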