Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -21006,6 +21006,60 @@
   return SDValue();
 }
 
+/// Check whether Cond is an AND/OR of two SETCCs off of the same EFLAGS.
+///
+/// Cond is either the AND/OR node itself, or an (X86cmp <and/or>, 0) wrapping
+/// it.  Patterns matched include:
+///   (X86or (X86setcc) (X86setcc))
+///   (X86cmp (and (X86setcc) (X86setcc)), 0)
+///
+/// On success, returns true and sets:
+///   - CC0 / CC1: the condition codes of the first / second SETCC operand;
+///   - Flags: the flags value consumed by both SETCCs;
+///   - isAnd: true for an AND of the SETCCs, false for an OR.
+/// On failure, returns false; the outputs must not be relied upon.
+static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
+                                           X86::CondCode &CC1, SDValue &Flags,
+                                           bool &isAnd) {
+  // Look through a comparison against zero to the value being tested.
+  if (Cond->getOpcode() == X86ISD::CMP) {
+    ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
+    if (!CondOp1C || !CondOp1C->isNullValue())
+      return false;
+
+    Cond = Cond->getOperand(0);
+  }
+
+  // Always assign isAnd, so that the result does not depend on how the caller
+  // initialized it.
+  switch (Cond->getOpcode()) {
+  default:
+    return false;
+  case ISD::AND:
+  case X86ISD::AND:
+    isAnd = true;
+    break;
+  case ISD::OR:
+  case X86ISD::OR:
+    isAnd = false;
+    break;
+  }
+
+  SDValue SetCC0 = Cond->getOperand(0);
+  SDValue SetCC1 = Cond->getOperand(1);
+
+  // Make sure we have SETCC nodes, using the same flags value.
+  if (SetCC0.getOpcode() != X86ISD::SETCC ||
+      SetCC1.getOpcode() != X86ISD::SETCC ||
+      SetCC0->getOperand(1) != SetCC1->getOperand(1))
+    return false;
+
+  CC0 = static_cast<X86::CondCode>(SetCC0->getConstantOperandVal(0));
+  CC1 = static_cast<X86::CondCode>(SetCC1->getConstantOperandVal(0));
+  Flags = SetCC0->getOperand(1);
+  return true;
+}
+
 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. 
X86::COND_NE), CONDVAL] static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -21175,6 +21217,44 @@ } } + // Fold and/or of setcc's to double CMOV: + // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) + // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) + // + // This combine lets us generate: + // cmovcc1 (jcc1 if we don't have CMOV) + // cmovcc2 (same) + // instead of: + // setcc1 + // setcc2 + // and/or + // cmovne (jne if we don't have CMOV) + // When we can't use the CMOV instruction, it might increase branch + // mispredicts. + // When we can use CMOV, or when there is no mispredict, this improves + // throughput and reduces register pressure. + // + if (CC == X86::COND_NE) { + SDValue Flags; + X86::CondCode CC0, CC1; + bool isAndSetCC = false; + if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { + if (isAndSetCC) { + std::swap(FalseOp, TrueOp); + CC0 = X86::GetOppositeBranchCondition(CC0); + CC1 = X86::GetOppositeBranchCondition(CC1); + } + + SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, MVT::i8), + Flags}; + SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps); + SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, MVT::i8), Flags}; + SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1)); + return CMOV; + } + } + return SDValue(); } Index: test/CodeGen/X86/cmovcmov.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/cmovcmov.ll @@ -0,0 +1,264 @@ +; RUN: llc < %s -asm-verbose=false -mtriple=x86_64-unknown-linux | FileCheck %s --check-prefix=CHECK --check-prefix=CMOV +; RUN: llc < %s -asm-verbose=false -mtriple=i686-unknown-linux | FileCheck %s --check-prefix=CHECK --check-prefix=NOCMOV + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; Test 2xCMOV patterns exposed 
after legalization. +; One way to do that is with (select (fcmp une/oeq)), which gets +; legalized to setp/setne. + +; CHECK-LABEL: test_select_fcmp_oeq_i32: + +; CMOV-NEXT: ucomiss %xmm1, %xmm0 +; CMOV-NEXT: cmovnel %esi, %edi +; CMOV-NEXT: cmovpl %esi, %edi +; CMOV-NEXT: movl %edi, %eax +; CMOV-NEXT: retq + +; NOCMOV-NEXT: flds 8(%esp) +; NOCMOV-NEXT: flds 4(%esp) +; NOCMOV-NEXT: fucompp +; NOCMOV-NEXT: fnstsw %ax +; NOCMOV-NEXT: sahf +; NOCMOV-NEXT: leal 16(%esp), %eax +; NOCMOV-NEXT: movl %eax, %ecx +; NOCMOV-NEXT: jne [[TBB1:.LBB[0-9_]+]] +; NOCMOV-NEXT: leal 12(%esp), %ecx +; NOCMOV-NEXT: [[TBB1]]: +; NOCMOV-NEXT: jp [[TBB2:.LBB[0-9_]+]] +; NOCMOV-NEXT: movl %ecx, %eax +; NOCMOV-NEXT: [[TBB2]]: +; NOCMOV-NEXT: movl (%eax), %eax +; NOCMOV-NEXT: retl +define i32 @test_select_fcmp_oeq_i32(float %a, float %b, i32 %c, i32 %d) #0 { +entry: + %cmp = fcmp oeq float %a, %b + %r = select i1 %cmp, i32 %c, i32 %d + ret i32 %r +} + +; CHECK-LABEL: test_select_fcmp_oeq_i64: + +; CMOV-NEXT: ucomiss %xmm1, %xmm0 +; CMOV-NEXT: cmovneq %rsi, %rdi +; CMOV-NEXT: cmovpq %rsi, %rdi +; CMOV-NEXT: movq %rdi, %rax +; CMOV-NEXT: retq + +; NOCMOV-NEXT: flds 8(%esp) +; NOCMOV-NEXT: flds 4(%esp) +; NOCMOV-NEXT: fucompp +; NOCMOV-NEXT: fnstsw %ax +; NOCMOV-NEXT: sahf +; NOCMOV-NEXT: leal 20(%esp), %ecx +; NOCMOV-NEXT: movl %ecx, %eax +; NOCMOV-NEXT: jne [[TBB1:.LBB[0-9_]+]] +; NOCMOV-NEXT: leal 12(%esp), %eax +; NOCMOV-NEXT: [[TBB1]]: +; NOCMOV-NEXT: jp [[TBB2:.LBB[0-9_]+]] +; NOCMOV-NEXT: movl %eax, %ecx +; NOCMOV-NEXT: [[TBB2]]: +; NOCMOV-NEXT: movl (%ecx), %eax +; NOCMOV-NEXT: orl $4, %ecx +; NOCMOV-NEXT: movl (%ecx), %edx +; NOCMOV-NEXT: retl +define i64 @test_select_fcmp_oeq_i64(float %a, float %b, i64 %c, i64 %d) #0 { +entry: + %cmp = fcmp oeq float %a, %b + %r = select i1 %cmp, i64 %c, i64 %d + ret i64 %r +} + +; CHECK-LABEL: test_select_fcmp_une_i64: + +; CMOV-NEXT: ucomiss %xmm1, %xmm0 +; CMOV-NEXT: cmovneq %rdi, %rsi +; CMOV-NEXT: cmovpq %rdi, %rsi +; CMOV-NEXT: movq %rsi, %rax 
+; CMOV-NEXT: retq + +; NOCMOV-NEXT: flds 8(%esp) +; NOCMOV-NEXT: flds 4(%esp) +; NOCMOV-NEXT: fucompp +; NOCMOV-NEXT: fnstsw %ax +; NOCMOV-NEXT: sahf +; NOCMOV-NEXT: leal 12(%esp), %ecx +; NOCMOV-NEXT: movl %ecx, %eax +; NOCMOV-NEXT: jne [[TBB1:.LBB[0-9_]+]] +; NOCMOV-NEXT: leal 20(%esp), %eax +; NOCMOV-NEXT: [[TBB1]]: +; NOCMOV-NEXT: jp [[TBB2:.LBB[0-9_]+]] +; NOCMOV-NEXT: movl %eax, %ecx +; NOCMOV-NEXT: [[TBB2]]: +; NOCMOV-NEXT: movl (%ecx), %eax +; NOCMOV-NEXT: orl $4, %ecx +; NOCMOV-NEXT: movl (%ecx), %edx +; NOCMOV-NEXT: retl +define i64 @test_select_fcmp_une_i64(float %a, float %b, i64 %c, i64 %d) #0 { +entry: + %cmp = fcmp une float %a, %b + %r = select i1 %cmp, i64 %c, i64 %d + ret i64 %r +} + +; CHECK-LABEL: test_select_fcmp_oeq_f64: + +; CMOV-NEXT: ucomiss %xmm1, %xmm0 +; CMOV-NEXT: movaps %xmm3, %xmm0 +; CMOV-NEXT: jne [[TBB1:.LBB[0-9_]+]] +; CMOV-NEXT: movaps %xmm2, %xmm0 +; CMOV-NEXT: [[TBB1]]: +; CMOV-NEXT: jp [[TBB2:.LBB[0-9_]+]] +; CMOV-NEXT: movaps %xmm0, %xmm3 +; CMOV-NEXT: [[TBB2]]: +; CMOV-NEXT: movaps %xmm3, %xmm0 +; CMOV-NEXT: retq + +; NOCMOV-NEXT: flds 8(%esp) +; NOCMOV-NEXT: flds 4(%esp) +; NOCMOV-NEXT: fucompp +; NOCMOV-NEXT: fnstsw %ax +; NOCMOV-NEXT: sahf +; NOCMOV-NEXT: leal 20(%esp), %eax +; NOCMOV-NEXT: movl %eax, %ecx +; NOCMOV-NEXT: jne [[TBB1:.LBB[0-9_]+]] +; NOCMOV-NEXT: leal 12(%esp), %ecx +; NOCMOV-NEXT: [[TBB1]]: +; NOCMOV-NEXT: jp [[TBB2:.LBB[0-9_]+]] +; NOCMOV-NEXT: movl %ecx, %eax +; NOCMOV-NEXT: [[TBB2]]: +; NOCMOV-NEXT: fldl (%eax) +; NOCMOV-NEXT: retl +define double @test_select_fcmp_oeq_f64(float %a, float %b, double %c, double %d) #0 { +entry: + %cmp = fcmp oeq float %a, %b + %r = select i1 %cmp, double %c, double %d + ret double %r +} + +; CHECK-LABEL: test_select_fcmp_oeq_v4i32: + +; CMOV-NEXT: ucomiss %xmm1, %xmm0 +; CMOV-NEXT: movaps %xmm3, %xmm0 +; CMOV-NEXT: jne [[TBB1:.LBB[0-9_]+]] +; CMOV-NEXT: movaps %xmm2, %xmm0 +; CMOV-NEXT: [[TBB1]]: +; CMOV-NEXT: jp [[TBB2:.LBB[0-9_]+]] +; CMOV-NEXT: movaps %xmm0, %xmm3 +; 
CMOV-NEXT: [[TBB2]]: +; CMOV-NEXT: movaps %xmm3, %xmm0 +; CMOV-NEXT: retq + +; NOCMOV-NEXT: pushl %ebx +; NOCMOV-NEXT: pushl %edi +; NOCMOV-NEXT: pushl %esi +; NOCMOV-NEXT: flds 24(%esp) +; NOCMOV-NEXT: flds 20(%esp) +; NOCMOV-NEXT: fucompp +; NOCMOV-NEXT: fnstsw %ax +; NOCMOV-NEXT: sahf +; NOCMOV-NEXT: leal 44(%esp), %eax +; NOCMOV-NEXT: movl %eax, %ecx +; NOCMOV-NEXT: jne [[TBB1:.LBB[0-9_]+]] +; NOCMOV-NEXT: leal 28(%esp), %ecx +; NOCMOV-NEXT: [[TBB1]]: +; NOCMOV-NEXT: jp [[TBB2:.LBB[0-9_]+]] +; NOCMOV-NEXT: movl %ecx, %eax +; NOCMOV-NEXT: [[TBB2]]: +; NOCMOV-NEXT: movl (%eax), %eax +; NOCMOV-NEXT: leal 48(%esp), %ecx +; NOCMOV-NEXT: movl %ecx, %edx +; NOCMOV-NEXT: jne [[TBB1:.LBB[0-9_]+]] +; NOCMOV-NEXT: leal 32(%esp), %edx +; NOCMOV-NEXT: [[TBB1]]: +; NOCMOV-NEXT: jp [[TBB2:.LBB[0-9_]+]] +; NOCMOV-NEXT: movl %edx, %ecx +; NOCMOV-NEXT: [[TBB2]]: +; NOCMOV-NEXT: movl (%ecx), %ecx +; NOCMOV-NEXT: leal 52(%esp), %edx +; NOCMOV-NEXT: movl %edx, %esi +; NOCMOV-NEXT: jne [[TBB1:.LBB[0-9_]+]] +; NOCMOV-NEXT: leal 36(%esp), %esi +; NOCMOV-NEXT: [[TBB1]]: +; NOCMOV-NEXT: jp [[TBB2:.LBB[0-9_]+]] +; NOCMOV-NEXT: movl %esi, %edx +; NOCMOV-NEXT: [[TBB2]]: +; NOCMOV-NEXT: movl (%edx), %edx +; NOCMOV-NEXT: leal 56(%esp), %esi +; NOCMOV-NEXT: movl %esi, %ebx +; NOCMOV-NEXT: jne [[TBB1:.LBB[0-9_]+]] +; NOCMOV-NEXT: leal 40(%esp), %ebx +; NOCMOV-NEXT: [[TBB1]]: +; NOCMOV-NEXT: movl 16(%esp), %edi +; NOCMOV-NEXT: jp [[TBB2:.LBB[0-9_]+]] +; NOCMOV-NEXT: movl %ebx, %esi +; NOCMOV-NEXT: [[TBB2]]: +; NOCMOV-NEXT: movl (%esi), %esi +; NOCMOV-NEXT: movl %esi, 12(%edi) +; NOCMOV-NEXT: movl %edx, 8(%edi) +; NOCMOV-NEXT: movl %ecx, 4(%edi) +; NOCMOV-NEXT: movl %eax, (%edi) +; NOCMOV-NEXT: popl %esi +; NOCMOV-NEXT: popl %edi +; NOCMOV-NEXT: popl %ebx +; NOCMOV-NEXT: retl $4 +define <4 x i32> @test_select_fcmp_oeq_v4i32(float %a, float %b, <4 x i32> %c, <4 x i32> %d) #0 { +entry: + %cmp = fcmp oeq float %a, %b + %r = select i1 %cmp, <4 x i32> %c, <4 x i32> %d + ret <4 x i32> %r +} + +; Also 
make sure we catch the original code-sequence of interest: + +; CMOV: [[ONE_F32_LCPI:.LCPI.*]]: +; CMOV-NEXT: .long 1065353216 + +; CHECK-LABEL: test_zext_fcmp_une: +; CMOV-NEXT: ucomiss %xmm1, %xmm0 +; CMOV-NEXT: movss [[ONE_F32_LCPI]](%rip), %xmm0 +; CMOV-NEXT: movaps %xmm0, %xmm1 +; CMOV-NEXT: jne [[TBB1:.LBB[0-9_]+]] +; CMOV-NEXT: xorps %xmm1, %xmm1 +; CMOV-NEXT: [[TBB1]]: +; CMOV-NEXT: jp [[TBB2:.LBB[0-9_]+]] +; CMOV-NEXT: movaps %xmm1, %xmm0 +; CMOV-NEXT: [[TBB2]]: +; CMOV-NEXT: retq + +; NOCMOV: jne +; NOCMOV: jp +define float @test_zext_fcmp_une(float %a, float %b) #0 { +entry: + %cmp = fcmp une float %a, %b + %conv1 = zext i1 %cmp to i32 + %conv2 = sitofp i32 %conv1 to float + ret float %conv2 +} + +; CMOV: [[ONE_F32_LCPI:.LCPI.*]]: +; CMOV-NEXT: .long 1065353216 + +; CHECK-LABEL: test_zext_fcmp_oeq: +; CMOV-NEXT: ucomiss %xmm1, %xmm0 +; CMOV-NEXT: xorps %xmm0, %xmm0 +; CMOV-NEXT: xorps %xmm1, %xmm1 +; CMOV-NEXT: jne [[TBB1:.LBB[0-9_]+]] +; CMOV-NEXT: movss [[ONE_F32_LCPI]](%rip), %xmm1 +; CMOV-NEXT: [[TBB1]]: +; CMOV-NEXT: jp [[TBB2:.LBB[0-9_]+]] +; CMOV-NEXT: movaps %xmm1, %xmm0 +; CMOV-NEXT: [[TBB2]]: +; CMOV-NEXT: retq + +; NOCMOV: jne +; NOCMOV: jp +define float @test_zext_fcmp_oeq(float %a, float %b) #0 { +entry: + %cmp = fcmp oeq float %a, %b + %conv1 = zext i1 %cmp to i32 + %conv2 = sitofp i32 %conv1 to float + ret float %conv2 +} + +attributes #0 = { nounwind } Index: test/CodeGen/X86/fast-isel-select-cmov2.ll =================================================================== --- test/CodeGen/X86/fast-isel-select-cmov2.ll +++ test/CodeGen/X86/fast-isel-select-cmov2.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1 | FileCheck %s 
--check-prefix=CHECK --check-prefix=FAST ; Test all the cmp predicates that can feed an integer conditional move. @@ -15,10 +15,13 @@ define i64 @select_fcmp_oeq_cmov(double %a, double %b, i64 %c, i64 %d) { ; CHECK-LABEL: select_fcmp_oeq_cmov ; CHECK: ucomisd %xmm1, %xmm0 -; CHECK-NEXT: setnp %al -; CHECK-NEXT: sete %cl -; CHECK-NEXT: testb %al, %cl -; CHECK-NEXT: cmoveq %rsi, %rdi +; SDAG-NEXT: cmovneq %rsi, %rdi +; SDAG-NEXT: cmovpq %rsi, %rdi +; SDAG-NEXT: movq %rdi, %rax +; FAST-NEXT: setnp %al +; FAST-NEXT: sete %cl +; FAST-NEXT: testb %al, %cl +; FAST-NEXT: cmoveq %rsi, %rdi %1 = fcmp oeq double %a, %b %2 = select i1 %1, i64 %c, i64 %d ret i64 %2 @@ -135,10 +138,13 @@ define i64 @select_fcmp_une_cmov(double %a, double %b, i64 %c, i64 %d) { ; CHECK-LABEL: select_fcmp_une_cmov ; CHECK: ucomisd %xmm1, %xmm0 -; CHECK-NEXT: setp %al -; CHECK-NEXT: setne %cl -; CHECK-NEXT: orb %al, %cl -; CHECK-NEXT: cmoveq %rsi, %rdi +; SDAG-NEXT: cmovneq %rdi, %rsi +; SDAG-NEXT: cmovpq %rdi, %rsi +; SDAG-NEXT: movq %rsi, %rax +; FAST-NEXT: setp %al +; FAST-NEXT: setne %cl +; FAST-NEXT: orb %al, %cl +; FAST-NEXT: cmoveq %rsi, %rdi %1 = fcmp une double %a, %b %2 = select i1 %1, i64 %c, i64 %d ret i64 %2