Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2694,6 +2694,17 @@
        isCondCodeLegal(SwappedCC, N0.getSimpleValueType())))
     return DAG.getSetCC(dl, VT, N1, N0, SwappedCC);
 
+  // If we have a subtract with the same 2 non-constant operands as this setcc
+  // -- but in reverse order -- then try to commute the operands of this setcc
+  // to match. A matching pair of setcc (cmp) and sub may be combined into 1
+  // instruction on some targets.
+  if (!isConstOrConstSplat(N0) && !isConstOrConstSplat(N1) &&
+      (DCI.isBeforeLegalizeOps() ||
+       isCondCodeLegal(SwappedCC, N0.getSimpleValueType())) &&
+      DAG.getNodeIfExists(ISD::SUB, DAG.getVTList(OpVT), { N1, N0 } ) &&
+      !DAG.getNodeIfExists(ISD::SUB, DAG.getVTList(OpVT), { N0, N1 } ))
+    return DAG.getSetCC(dl, VT, N1, N0, SwappedCC);
+
   if (auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
     const APInt &C1 = N1C->getAPIntValue();
 
Index: llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -1037,15 +1037,14 @@
 ; ENABLE-NEXT:    lsl w8, w0, w1
 ; ENABLE-NEXT:    lsl w9, w1, w0
 ; ENABLE-NEXT:    lsr w10, w0, w1
-; ENABLE-NEXT:    lsr w11, w1, w0
-; ENABLE-NEXT:    add w12, w1, w0
-; ENABLE-NEXT:    sub w13, w1, w0
-; ENABLE-NEXT:    cmp w0, w1
-; ENABLE-NEXT:    add w17, w8, w9
-; ENABLE-NEXT:    sub w16, w9, w10
-; ENABLE-NEXT:    add w15, w10, w11
-; ENABLE-NEXT:    add w14, w11, w12
-; ENABLE-NEXT:    b.ge LBB14_2
+; ENABLE-NEXT:    lsr w12, w1, w0
+; ENABLE-NEXT:    add w15, w1, w0
+; ENABLE-NEXT:    subs w17, w1, w0
+; ENABLE-NEXT:    sub w11, w9, w10
+; ENABLE-NEXT:    add w16, w8, w9
+; ENABLE-NEXT:    add w13, w10, w12
+; ENABLE-NEXT:    add w14, w12, w15
+; ENABLE-NEXT:    b.le LBB14_2
 ; ENABLE-NEXT: ; %bb.1: ; %true
 ; ENABLE-NEXT:    str w0, [sp]
 ; ENABLE-NEXT: ; InlineAsm Start
@@ -1055,12 +1054,12 @@
 ; ENABLE-NEXT:    str w8, [x2]
 ; ENABLE-NEXT:    str w9, [x3]
 ; ENABLE-NEXT:    str w10, [x4]
-; ENABLE-NEXT:    str w11, [x5]
-; ENABLE-NEXT:    str w12, [x6]
-; ENABLE-NEXT:    str w13, [x7]
+; ENABLE-NEXT:    str w12, [x5]
+; ENABLE-NEXT:    str w15, [x6]
+; ENABLE-NEXT:    str w17, [x7]
 ; ENABLE-NEXT:    stp w0, w1, [x2, #4]
-; ENABLE-NEXT:    stp w17, w16, [x2, #12]
-; ENABLE-NEXT:    stp w15, w14, [x2, #20]
+; ENABLE-NEXT:    stp w16, w11, [x2, #12]
+; ENABLE-NEXT:    stp w13, w14, [x2, #20]
 ; ENABLE-NEXT:    sub sp, x29, #80 ; =80
 ; ENABLE-NEXT:    ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
 ; ENABLE-NEXT:    ldp x20, x19, [sp, #64] ; 16-byte Folded Reload
@@ -1097,15 +1096,14 @@
 ; DISABLE-NEXT:    lsl w8, w0, w1
 ; DISABLE-NEXT:    lsl w9, w1, w0
 ; DISABLE-NEXT:    lsr w10, w0, w1
-; DISABLE-NEXT:    lsr w11, w1, w0
-; DISABLE-NEXT:    add w12, w1, w0
-; DISABLE-NEXT:    sub w13, w1, w0
-; DISABLE-NEXT:    cmp w0, w1
-; DISABLE-NEXT:    add w17, w8, w9
-; DISABLE-NEXT:    sub w16, w9, w10
-; DISABLE-NEXT:    add w15, w10, w11
-; DISABLE-NEXT:    add w14, w11, w12
-; DISABLE-NEXT:    b.ge LBB14_2
+; DISABLE-NEXT:    lsr w12, w1, w0
+; DISABLE-NEXT:    add w15, w1, w0
+; DISABLE-NEXT:    subs w17, w1, w0
+; DISABLE-NEXT:    sub w11, w9, w10
+; DISABLE-NEXT:    add w16, w8, w9
+; DISABLE-NEXT:    add w13, w10, w12
+; DISABLE-NEXT:    add w14, w12, w15
+; DISABLE-NEXT:    b.le LBB14_2
 ; DISABLE-NEXT: ; %bb.1: ; %true
 ; DISABLE-NEXT:    str w0, [sp]
 ; DISABLE-NEXT: ; InlineAsm Start
@@ -1115,12 +1113,12 @@
 ; DISABLE-NEXT:    str w8, [x2]
 ; DISABLE-NEXT:    str w9, [x3]
 ; DISABLE-NEXT:    str w10, [x4]
-; DISABLE-NEXT:    str w11, [x5]
-; DISABLE-NEXT:    str w12, [x6]
-; DISABLE-NEXT:    str w13, [x7]
+; DISABLE-NEXT:    str w12, [x5]
+; DISABLE-NEXT:    str w15, [x6]
+; DISABLE-NEXT:    str w17, [x7]
 ; DISABLE-NEXT:    stp w0, w1, [x2, #4]
-; DISABLE-NEXT:    stp w17, w16, [x2, #12]
-; DISABLE-NEXT:    stp w15, w14, [x2, #20]
+; DISABLE-NEXT:    stp w16, w11, [x2, #12]
+; DISABLE-NEXT:    stp w13, w14, [x2, #20]
 ; DISABLE-NEXT:    sub sp, x29, #80 ; =80
 ; DISABLE-NEXT:    ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
 ; DISABLE-NEXT:    ldp x20, x19, [sp, #64] ; 16-byte Folded Reload
Index: llvm/test/CodeGen/AArch64/cgp-usubo.ll
===================================================================
--- llvm/test/CodeGen/AArch64/cgp-usubo.ll
+++ llvm/test/CodeGen/AArch64/cgp-usubo.ll
@@ -21,11 +21,9 @@
 define i1 @usubo_ugt_i32(i32 %x, i32 %y, i32* %p) nounwind {
 ; CHECK-LABEL: usubo_ugt_i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT:    cmp w1, w0
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    sub w9, w0, w1
-; CHECK-NEXT:    mov w0, w8
-; CHECK-NEXT:    str w9, [x2]
+; CHECK-NEXT:    subs w8, w0, w1
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    str w8, [x2]
 ; CHECK-NEXT:    ret
   %ov = icmp ugt i32 %y, %x
   %s = sub i32 %x, %y
Index: llvm/test/CodeGen/Lanai/sub-cmp-peephole.ll
===================================================================
--- llvm/test/CodeGen/Lanai/sub-cmp-peephole.ll
+++ llvm/test/CodeGen/Lanai/sub-cmp-peephole.ll
@@ -25,7 +25,7 @@
 ; CHECK-NEXT:    add %sp, 0x8, %fp
 ; CHECK-NEXT:    sub %sp, 0x8, %sp
 ; CHECK-NEXT:    sub.f %r7, %r6, %r3
-; CHECK-NEXT:    sel.lt %r3, %r0, %rv
+; CHECK-NEXT:    sel.gt %r3, %r0, %rv
 ; CHECK-NEXT:    ld -4[%fp], %pc ! return
 ; CHECK-NEXT:    add %fp, 0x0, %sp
 ; CHECK-NEXT:    ld -8[%fp], %fp
@@ -59,7 +59,7 @@
 ; CHECK-NEXT:    add %sp, 0x8, %fp
 ; CHECK-NEXT:    sub %sp, 0x8, %sp
 ; CHECK-NEXT:    sub.f %r7, %r6, %r3
-; CHECK-NEXT:    sel.ult %r3, %r0, %rv
+; CHECK-NEXT:    sel.ugt %r3, %r0, %rv
 ; CHECK-NEXT:    ld -4[%fp], %pc ! return
 ; CHECK-NEXT:    add %fp, 0x0, %sp
 ; CHECK-NEXT:    ld -8[%fp], %fp
@@ -75,11 +75,11 @@
 ; CHECK: ! %bb.0: ! %entry
 ; CHECK-NEXT:    st %fp, [--%sp]
 ; CHECK-NEXT:    add %sp, 0x8, %fp
-; CHECK-NEXT:    sub %sp, 0x8, %sp
-; CHECK-NEXT:    sub.f %r7, %r6, %r0
+; CHECK-NEXT:    sub.f %r6, %r7, %rv
 ; CHECK-NEXT:    bne .LBB4_2
-; CHECK-NEXT:    sub %r6, %r7, %rv
+; CHECK-NEXT:    sub %sp, 0x8, %sp
 ; CHECK-NEXT:  .LBB4_1: ! %if.then
+; CHECK-NEXT:    sub.f %r7, %r6, %r0
 ; CHECK-NEXT:    sel.gt %rv, %r6, %rv
 ; CHECK-NEXT:  .LBB4_2: ! %if.else
 ; CHECK-NEXT:    ld -4[%fp], %pc ! return
Index: llvm/test/CodeGen/X86/jump_sign.ll
===================================================================
--- llvm/test/CodeGen/X86/jump_sign.ll
+++ llvm/test/CodeGen/X86/jump_sign.ll
@@ -48,11 +48,10 @@
 define i32 @func_h(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: func_h:
 ; CHECK: # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    subl %ecx, %eax
-; CHECK-NEXT:    cmovlel %edx, %eax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cmovlel %ecx, %eax
 ; CHECK-NEXT:    retl
   %cmp = icmp slt i32 %b, %a
   %sub = sub nsw i32 %a, %b
@@ -91,11 +90,10 @@
 define i32 @func_k(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: func_k:
 ; CHECK: # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    subl %ecx, %eax
-; CHECK-NEXT:    cmovbel %edx, %eax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cmovbel %ecx, %eax
 ; CHECK-NEXT:    retl
   %cmp = icmp ult i32 %b, %a
   %sub = sub i32 %a, %b
@@ -108,10 +106,9 @@
 ; CHECK-LABEL: func_l:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    subl %ecx, %eax
-; CHECK-NEXT:    cmovlel %edx, %eax
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cmovlel %ecx, %eax
 ; CHECK-NEXT:    retl
   %cmp = icmp slt i32 %b, %a
   %sub = sub nsw i32 %a, %b
@@ -139,16 +136,14 @@
 ; CHECK-LABEL: func_l2:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    subl %edx, %ecx
-; CHECK-NEXT:    cmpl %eax, %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    subl %edx, %eax
 ; CHECK-NEXT:    jne .LBB8_2
 ; CHECK-NEXT:  # %bb.1: # %if.then
-; CHECK-NEXT:    cmovgl %ecx, %eax
-; CHECK-NEXT:    retl
+; CHECK-NEXT:    cmpl %ecx, %edx
+; CHECK-NEXT:    cmovlel %ecx, %eax
 ; CHECK-NEXT:  .LBB8_2: # %if.else
-; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    retl
   %cmp = icmp eq i32 %b, %a
   %sub = sub nsw i32 %a, %b
@@ -166,9 +161,8 @@
 define i32 @func_l3(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: func_l3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    subl %ecx, %eax
+; CHECK-NEXT:    subl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    jge .LBB9_2
 ; CHECK-NEXT:  # %bb.1: # %if.then
 ; CHECK-NEXT:    retl
@@ -192,11 +186,10 @@
 define i32 @func_l4(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: func_l4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    subl %ecx, %eax
-; CHECK-NEXT:    cmovll %edx, %eax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cmovll %ecx, %eax
 ; CHECK-NEXT:    retl
   %cmp = icmp sgt i32 %b, %a
   %sub = sub i32 %a, %b
Index: llvm/test/CodeGen/X86/psubus.ll
===================================================================
--- llvm/test/CodeGen/X86/psubus.ll
+++ llvm/test/CodeGen/X86/psubus.ll
@@ -1037,14 +1037,14 @@
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    pmaxud %xmm0, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pminud %xmm1, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
 ; SSE41-NEXT:    pxor %xmm5, %xmm4
-; SSE41-NEXT:    movdqa %xmm2, %xmm6
-; SSE41-NEXT:    pmaxud %xmm3, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm6
+; SSE41-NEXT:    movdqa %xmm3, %xmm6
+; SSE41-NEXT:    pminud %xmm2, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
 ; SSE41-NEXT:    pxor %xmm5, %xmm6
 ; SSE41-NEXT:    packssdw %xmm6, %xmm4
 ; SSE41-NEXT:    psubd %xmm2, %xmm3
@@ -1061,15 +1061,15 @@
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpmaxud %xmm2, %xmm4, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm0, %xmm5
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm1
-; AVX1-NEXT:    vpandn %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpandn %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
@@ -1080,8 +1080,8 @@
 ; AVX2-LABEL: test16:
 ; AVX2: # %bb.0: # %vector.ph
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vpmaxud %ymm0, %ymm1, %ymm2
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
@@ -1093,7 +1093,7 @@
 ; AVX512-LABEL: test16:
 ; AVX512: # %bb.0: # %vector.ph
 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT:    vpcmpltud %ymm0, %ymm1, %k1
+; AVX512-NEXT:    vpcmpnleud %ymm1, %ymm0, %k1
 ; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vzeroupper
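
Note on the TargetLowering.cpp hunk: when SimplifySetCC sees a compare of two non-constant values and the DAG already contains a subtract of the same operands in the opposite order (but no same-order subtract), it commutes the compare and swaps the condition code so the cmp and sub share one operand order; a target can then fold the pair into a single flag-setting subtract, which is why the cgp-usubo.ll output above collapses cmp+cset+sub into subs+cset. The sketch below is a minimal standalone simulation of that decision, not LLVM code: Graph, Node, findExisting, and the value-number operands are hypothetical stand-ins for SelectionDAG, SDNode, and SelectionDAG::getNodeIfExists.

// Standalone sketch (hypothetical names, not LLVM's API).
#include <cstdio>
#include <map>
#include <tuple>

enum Opcode { Sub, SetCC };
enum CondCode { LT, GT }; // swapped pair: (x LT y) == (y GT x)

struct Node {
  Opcode Op;
  int L, R;    // operand value numbers
  CondCode CC; // only meaningful for SetCC nodes
};

struct Graph {
  // CSE map: at most one node per (opcode, lhs, rhs), mimicking the DAG's
  // folding set, so a reversed-operand sub is found by plain key lookup.
  std::map<std::tuple<Opcode, int, int>, Node> Nodes;

  Node *findExisting(Opcode Op, int L, int R) {
    auto It = Nodes.find({Op, L, R});
    return It == Nodes.end() ? nullptr : &It->second;
  }
  Node *create(Opcode Op, int L, int R, CondCode CC = LT) {
    return &Nodes.emplace(std::make_tuple(Op, L, R), Node{Op, L, R, CC})
                .first->second;
  }
};

// Mirrors the new early-out in SimplifySetCC: commute the compare only if a
// reversed-order sub exists and a same-order sub does not.
Node *simplifySetCC(Graph &G, int N0, int N1, CondCode CC) {
  if (G.findExisting(Sub, N1, N0) && !G.findExisting(Sub, N0, N1))
    return G.create(SetCC, N1, N0, CC == LT ? GT : LT);
  return G.create(SetCC, N0, N1, CC);
}

int main() {
  Graph G;
  int A = 0, B = 1;    // two non-constant values
  G.create(Sub, B, A); // the program already computes b - a
  Node *Cmp = simplifySetCC(G, A, B, LT); // request a < b
  // Prints "setcc v1, v0, GT": the compare now matches the sub's operand
  // order, so a target can fold the pair (e.g. AArch64 subs).
  std::printf("setcc v%d, v%d, %s\n", Cmp->L, Cmp->R,
              Cmp->CC == GT ? "GT" : "LT");
}

Under this model, the Lanai and X86 test diffs above are the same transform seen through different instruction selectors: the compare reuses the subtract's operand order, and only the condition code flips (lt/gt, ult/ugt, b.ge/b.le).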