Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -120,6 +120,7 @@ /// flag operand produced by a CMP or TEST instruction. It also writes a /// flag result. CMOV, + CMOV2, /// X86 conditional branches. Operand 0 is the chain operand, operand 1 /// is the block to branch if condition is true, operand 2 is the Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -20128,6 +20128,7 @@ case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; case X86ISD::FSETCC: return "X86ISD::FSETCC"; case X86ISD::CMOV: return "X86ISD::CMOV"; + case X86ISD::CMOV2: return "X86ISD::CMOV2"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; @@ -21087,6 +21088,21 @@ // fallthrough --> copy0MBB MachineBasicBlock *thisMBB = BB; MachineFunction *F = BB->getParent(); + + // We also lower CMOV2 pseudo-instructions, which are equivalent to: + // (CMOV (CMOV F, T, cc1), T, cc2) + // to two successive branches instead of one. + bool isCMOV2 = MI->getNumOperands() == 6; + MachineBasicBlock *jcc1MBB = nullptr; + + // If MI is a CMOV2 instruction, we lower it to two successive branches to + // the same block. EFLAGS is used by both, so mark it as live in the second. + if (isCMOV2) { + jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, jcc1MBB); + jcc1MBB->addLiveIn(X86::EFLAGS); + } + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, copy0MBB); @@ -21107,7 +21123,19 @@ sinkMBB->transferSuccessorsAndUpdatePHIs(BB); // Add the true and fallthrough blocks as its successors. 
- BB->addSuccessor(copy0MBB); + if (isCMOV2) { + // The fallthrough block may be jcc1MBB, if MI is a CMOV2 instruction. + BB->addSuccessor(jcc1MBB); + + // In that case, jcc1MBB will itself fall through to copy0MBB, or + // jump to the sinkMBB. + jcc1MBB->addSuccessor(copy0MBB); + jcc1MBB->addSuccessor(sinkMBB); + } else { + BB->addSuccessor(copy0MBB); + } + + // The true block target of the first (or only) branch is always sinkMBB. BB->addSuccessor(sinkMBB); // Create the conditional branch instruction. @@ -21115,6 +21143,12 @@ X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); + if (isCMOV2) { + unsigned Opc2 = + X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(4).getImm()); + BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB); + } + // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB @@ -21123,10 +21157,16 @@ // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(X86::PHI), MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) - .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + MachineInstrBuilder MIB = + BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), + MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) + .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + + // If this was a CMOV2 instruction, the second Jcc provides the same incoming + // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). + if (isCMOV2) + MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB); MI->eraseFromParent(); // The pseudo instruction is gone now. 
return sinkMBB; @@ -21679,6 +21719,23 @@ case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: + case X86::CMOV2_GR8: + case X86::CMOV2_FR32: + case X86::CMOV2_FR64: + case X86::CMOV2_V4F32: + case X86::CMOV2_V2F64: + case X86::CMOV2_V2I64: + case X86::CMOV2_V8F32: + case X86::CMOV2_V4F64: + case X86::CMOV2_V4I64: + case X86::CMOV2_V16F32: + case X86::CMOV2_V8F64: + case X86::CMOV2_V8I64: + case X86::CMOV2_GR16: + case X86::CMOV2_GR32: + case X86::CMOV2_RFP32: + case X86::CMOV2_RFP64: + case X86::CMOV2_RFP80: return EmitLoweredSelect(MI, BB); case X86::FP32_TO_INT16_IN_MEM: @@ -23963,6 +24020,48 @@ return SDValue(); } +/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. +/// Match: +/// (X86or (X86setcc) (X86setcc)) +/// (X86cmp (and (X86setcc) (X86setcc)), 0) +static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, + X86::CondCode &CC1, SDValue &Flags, + bool &isAnd) { + SDValue SetCC0, SetCC1; + + if (Cond->getOpcode() == X86ISD::CMP) { + ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1)); + if (!CondOp1C || !CondOp1C->isNullValue()) + return false; + + Cond = Cond->getOperand(0); + } + + switch (Cond->getOpcode()) { + default: return false; + case ISD::AND: + case X86ISD::AND: + isAnd = true; + // fallthru + case ISD::OR: + case X86ISD::OR: + SetCC0 = Cond->getOperand(0); + SetCC1 = Cond->getOperand(1); + break; + } + + // Make sure we have SETCC nodes, using the same flags value. + if (SetCC0.getOpcode() != X86ISD::SETCC || + SetCC1.getOpcode() != X86ISD::SETCC || + SetCC0->getOperand(1) != SetCC1->getOperand(1)) + return false; + + CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); + CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); + Flags = SetCC0->getOperand(1); + return true; +} + /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. 
X86::COND_NE), CONDVAL] static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -23975,6 +24074,7 @@ SDValue FalseOp = N->getOperand(0); SDValue TrueOp = N->getOperand(1); + EVT VT = TrueOp.getValueType(); X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); SDValue Cond = N->getOperand(3); @@ -24132,6 +24232,58 @@ } } + // Fold and/or of setcc's to double CMOV: + // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) + // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) + // + // In practice, when we need to lower to control flow using a custom inserter, + // we represent the double CMOV using a special node, CMOV2: + // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV2 F, T, cc1, cc2) + // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV2 T, F, !cc1, !cc2) + // This enables the custom inserter to only need to insert one PHI, instead + // of two if it looked at two independent CMOVs. + // + // This combine lets us generate: + // cmovcc1 (jcc1 if we don't have CMOV) + // cmovcc2 (same) + // instead of: + // setcc1 + // setcc2 + // and/or + // cmovne (jne if we don't have CMOV) + // When we can't use the CMOV instruction, it might increase branch + // mispredicts. + // When we can use CMOV, or when there is no mispredict, this improves + // throughput and reduces register pressure. 
+ // + if (CC == X86::COND_NE) { + SDValue Flags; + X86::CondCode CC0, CC1; + bool isAndSetCC = false; + if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { + if (isAndSetCC) { + std::swap(FalseOp, TrueOp); + CC0 = X86::GetOppositeBranchCondition(CC0); + CC1 = X86::GetOppositeBranchCondition(CC1); + } + + SDValue CMOV; + if (Subtarget->hasCMov() && VT.isInteger() && !VT.isVector()) { + SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, MVT::i8), + Flags}; + SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps); + SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, MVT::i8), Flags}; + CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); + } else { + SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC0, MVT::i8), + DAG.getConstant(CC1, MVT::i8), Flags}; + CMOV = DAG.getNode(X86ISD::CMOV2, DL, N->getVTList(), Ops); + } + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1)); + return CMOV; + } + } + return SDValue(); } Index: lib/Target/X86/X86InstrCompiler.td =================================================================== --- lib/Target/X86/X86InstrCompiler.td +++ lib/Target/X86/X86InstrCompiler.td @@ -475,58 +475,55 @@ //===----------------------------------------------------------------------===// // Conditional Move Pseudo Instructions -// X86 doesn't have 8-bit conditional moves. Use a customInserter to -// emit control flow. An alternative to this is to mark i8 SELECT as Promote, -// however that requires promoting the operands, and can induce additional -// i8 register pressure. +// CMOV* - Used to implement the SELECT DAG operation. Expanded after +// instruction selection into a branch sequence. 
+multiclass CMOV_CMOV2<RegisterClass RC, ValueType VT> { + def CMOV#NAME : I<0, Pseudo, + (outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond), + "#CMOV"#NAME#" PSEUDO!", + [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond, + EFLAGS)))]>; + def CMOV2#NAME : I<0, Pseudo, + (outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond, i8imm:$cond2), + "#CMOV2"#NAME#" PSEUDO!", + [(set RC:$dst, (VT (X86cmov2 RC:$t, RC:$f, imm:$cond, + imm:$cond2, EFLAGS)))]>; +} + let usesCustomInserter = 1, Uses = [EFLAGS] in { -def CMOV_GR8 : I<0, Pseudo, - (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond), - "#CMOV_GR8 PSEUDO!", - [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2, - imm:$cond, EFLAGS))]>; + // X86 doesn't have 8-bit conditional moves. Use a customInserter to + // emit control flow. An alternative to this is to mark i8 SELECT as Promote, + // however that requires promoting the operands, and can induce additional + // i8 register pressure. + defm _GR8 : CMOV_CMOV2<GR8, i8>; -let Predicates = [NoCMov] in { -def CMOV_GR32 : I<0, Pseudo, - (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond), - "#CMOV_GR32* PSEUDO!", - [(set GR32:$dst, - (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>; -def CMOV_GR16 : I<0, Pseudo, - (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond), - "#CMOV_GR16* PSEUDO!", - [(set GR16:$dst, - (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>; -} // Predicates = [NoCMov] + let Predicates = [NoCMov] in { + defm _GR32 : CMOV_CMOV2<GR32, i32>; + defm _GR16 : CMOV_CMOV2<GR16, i16>; + } // Predicates = [NoCMov] -// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no -// SSE1. -let Predicates = [FPStackf32] in -def CMOV_RFP32 : I<0, Pseudo, - (outs RFP32:$dst), - (ins RFP32:$src1, RFP32:$src2, i8imm:$cond), - "#CMOV_RFP32 PSEUDO!", - [(set RFP32:$dst, - (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond, - EFLAGS))]>; -// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no -// SSE2. 
-let Predicates = [FPStackf64] in -def CMOV_RFP64 : I<0, Pseudo, - (outs RFP64:$dst), - (ins RFP64:$src1, RFP64:$src2, i8imm:$cond), - "#CMOV_RFP64 PSEUDO!", - [(set RFP64:$dst, - (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond, - EFLAGS))]>; -def CMOV_RFP80 : I<0, Pseudo, - (outs RFP80:$dst), - (ins RFP80:$src1, RFP80:$src2, i8imm:$cond), - "#CMOV_RFP80 PSEUDO!", - [(set RFP80:$dst, - (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond, - EFLAGS))]>; -} // UsesCustomInserter = 1, Uses = [EFLAGS] + // fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no + // SSE1/SSE2. + let Predicates = [FPStackf32] in + defm _RFP32 : CMOV_CMOV2<RFP32, f32>; + + let Predicates = [FPStackf64] in + defm _RFP64 : CMOV_CMOV2<RFP64, f64>; + + defm _RFP80 : CMOV_CMOV2<RFP80, f80>; + + defm _FR32 : CMOV_CMOV2<FR32, f32>; + defm _FR64 : CMOV_CMOV2<FR64, f64>; + defm _V4F32 : CMOV_CMOV2<VR128, v4f32>; + defm _V2F64 : CMOV_CMOV2<VR128, v2f64>; + defm _V2I64 : CMOV_CMOV2<VR128, v2i64>; + defm _V8F32 : CMOV_CMOV2<VR256, v8f32>; + defm _V4F64 : CMOV_CMOV2<VR256, v4f64>; + defm _V4I64 : CMOV_CMOV2<VR256, v4i64>; + defm _V8I64 : CMOV_CMOV2<VR512, v8i64>; + defm _V8F64 : CMOV_CMOV2<VR512, v8f64>; + defm _V16F32 : CMOV_CMOV2<VR512, v16f32>; +} // usesCustomInserter = 1, Uses = [EFLAGS] //===----------------------------------------------------------------------===// @@ -863,79 +860,6 @@ def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), "#ACQUIRE_MOV PSEUDO!", [(set GR64:$dst, (atomic_load_64 addr:$src))]>; -//===----------------------------------------------------------------------===// -// Conditional Move Pseudo Instructions. -//===----------------------------------------------------------------------===// - -// CMOV* - Used to implement the SSE SELECT DAG operation. Expanded after -// instruction selection into a branch sequence. 
-let Uses = [EFLAGS], usesCustomInserter = 1 in { - def CMOV_FR32 : I<0, Pseudo, - (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond), - "#CMOV_FR32 PSEUDO!", - [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond, - EFLAGS))]>; - def CMOV_FR64 : I<0, Pseudo, - (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond), - "#CMOV_FR64 PSEUDO!", - [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond, - EFLAGS))]>; - def CMOV_V4F32 : I<0, Pseudo, - (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), - "#CMOV_V4F32 PSEUDO!", - [(set VR128:$dst, - (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V2F64 : I<0, Pseudo, - (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), - "#CMOV_V2F64 PSEUDO!", - [(set VR128:$dst, - (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V2I64 : I<0, Pseudo, - (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), - "#CMOV_V2I64 PSEUDO!", - [(set VR128:$dst, - (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V8F32 : I<0, Pseudo, - (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), - "#CMOV_V8F32 PSEUDO!", - [(set VR256:$dst, - (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V4F64 : I<0, Pseudo, - (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), - "#CMOV_V4F64 PSEUDO!", - [(set VR256:$dst, - (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V4I64 : I<0, Pseudo, - (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), - "#CMOV_V4I64 PSEUDO!", - [(set VR256:$dst, - (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V8I64 : I<0, Pseudo, - (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), - "#CMOV_V8I64 PSEUDO!", - [(set VR512:$dst, - (v8i64 (X86cmov VR512:$t, VR512:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V8F64 : I<0, Pseudo, - (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), - "#CMOV_V8F64 PSEUDO!", - [(set VR512:$dst, - (v8f64 (X86cmov VR512:$t, VR512:$f, 
imm:$cond, - EFLAGS)))]>; - def CMOV_V16F32 : I<0, Pseudo, - (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), - "#CMOV_V16F32 PSEUDO!", - [(set VR512:$dst, - (v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond, - EFLAGS)))]>; -} - //===----------------------------------------------------------------------===// // DAG Pattern Matching Rules Index: lib/Target/X86/X86InstrInfo.td =================================================================== --- lib/Target/X86/X86InstrInfo.td +++ lib/Target/X86/X86InstrInfo.td @@ -30,6 +30,11 @@ [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; +def SDTX86Cmov2 : SDTypeProfile<1, 5, + [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, + SDTCisVT<3, i8>, SDTCisSameAs<3, 4>, + SDTCisVT<5, i32>]>; + // Unary and binary operator instructions that set EFLAGS as a side-effect. def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, [SDTCisSameAs<0, 2>, @@ -133,6 +138,7 @@ def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>; def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>; +def X86cmov2 : SDNode<"X86ISD::CMOV2", SDTX86Cmov2>; def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, [SDNPHasChain]>; def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>; Index: test/CodeGen/X86/cmovcmov.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/cmovcmov.ll @@ -0,0 +1,226 @@ +; RUN: llc < %s -asm-verbose=false -mtriple=x86_64-unknown-linux | FileCheck %s --check-prefix=CHECK --check-prefix=CMOV +; RUN: llc < %s -asm-verbose=false -mtriple=i686-unknown-linux | FileCheck %s --check-prefix=CHECK --check-prefix=NOCMOV + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; Test 2xCMOV patterns exposed after legalization. +; One way to do that is with (select (fcmp une/oeq)), which gets +; legalized to setp/setne. 
+ +; CHECK-LABEL: test_select_fcmp_oeq_i32: + +; CMOV-NEXT: ucomiss %xmm1, %xmm0 +; CMOV-NEXT: cmovnel %esi, %edi +; CMOV-NEXT: cmovpl %esi, %edi +; CMOV-NEXT: movl %edi, %eax +; CMOV-NEXT: retq + +; NOCMOV-NEXT: flds 8(%esp) +; NOCMOV-NEXT: flds 4(%esp) +; NOCMOV-NEXT: fucompp +; NOCMOV-NEXT: fnstsw %ax +; NOCMOV-NEXT: sahf +; NOCMOV-NEXT: leal 16(%esp), %eax +; NOCMOV-NEXT: jne [[TBB:.LBB[0-9_]+]] +; NOCMOV-NEXT: jp [[TBB]] +; NOCMOV-NEXT: leal 12(%esp), %eax +; NOCMOV-NEXT:[[TBB]]: +; NOCMOV-NEXT: movl (%eax), %eax +; NOCMOV-NEXT: retl +define i32 @test_select_fcmp_oeq_i32(float %a, float %b, i32 %c, i32 %d) #0 { +entry: + %cmp = fcmp oeq float %a, %b + %r = select i1 %cmp, i32 %c, i32 %d + ret i32 %r +} + +; CHECK-LABEL: test_select_fcmp_oeq_i64: + +; CMOV-NEXT: ucomiss %xmm1, %xmm0 +; CMOV-NEXT: cmovneq %rsi, %rdi +; CMOV-NEXT: cmovpq %rsi, %rdi +; CMOV-NEXT: movq %rdi, %rax +; CMOV-NEXT: retq + +; NOCMOV-NEXT: flds 8(%esp) +; NOCMOV-NEXT: flds 4(%esp) +; NOCMOV-NEXT: fucompp +; NOCMOV-NEXT: fnstsw %ax +; NOCMOV-NEXT: sahf +; NOCMOV-NEXT: leal 20(%esp), %ecx +; NOCMOV-NEXT: jne [[TBB:.LBB[0-9_]+]] +; NOCMOV-NEXT: jp [[TBB]] +; NOCMOV-NEXT: leal 12(%esp), %ecx +; NOCMOV-NEXT: [[TBB]]: +; NOCMOV-NEXT: movl (%ecx), %eax +; NOCMOV-NEXT: orl $4, %ecx +; NOCMOV-NEXT: movl (%ecx), %edx +; NOCMOV-NEXT: retl +define i64 @test_select_fcmp_oeq_i64(float %a, float %b, i64 %c, i64 %d) #0 { +entry: + %cmp = fcmp oeq float %a, %b + %r = select i1 %cmp, i64 %c, i64 %d + ret i64 %r +} + +; CHECK-LABEL: test_select_fcmp_une_i64: + +; CMOV-NEXT: ucomiss %xmm1, %xmm0 +; CMOV-NEXT: cmovneq %rdi, %rsi +; CMOV-NEXT: cmovpq %rdi, %rsi +; CMOV-NEXT: movq %rsi, %rax +; CMOV-NEXT: retq + +; NOCMOV-NEXT: flds 8(%esp) +; NOCMOV-NEXT: flds 4(%esp) +; NOCMOV-NEXT: fucompp +; NOCMOV-NEXT: fnstsw %ax +; NOCMOV-NEXT: sahf +; NOCMOV-NEXT: leal 12(%esp), %ecx +; NOCMOV-NEXT: jne [[TBB:.LBB[0-9_]+]] +; NOCMOV-NEXT: jp [[TBB]] +; NOCMOV-NEXT: leal 20(%esp), %ecx +; NOCMOV-NEXT: [[TBB]]: +; 
NOCMOV-NEXT: movl (%ecx), %eax +; NOCMOV-NEXT: orl $4, %ecx +; NOCMOV-NEXT: movl (%ecx), %edx +; NOCMOV-NEXT: retl +define i64 @test_select_fcmp_une_i64(float %a, float %b, i64 %c, i64 %d) #0 { +entry: + %cmp = fcmp une float %a, %b + %r = select i1 %cmp, i64 %c, i64 %d + ret i64 %r +} + +; CHECK-LABEL: test_select_fcmp_oeq_f64: + +; CMOV-NEXT: ucomiss %xmm1, %xmm0 +; CMOV-NEXT: jne [[TBB:.LBB[0-9_]+]] +; CMOV-NEXT: jp [[TBB]] +; CMOV-NEXT: movaps %xmm2, %xmm3 +; CMOV-NEXT: [[TBB]]: +; CMOV-NEXT: movaps %xmm3, %xmm0 +; CMOV-NEXT: retq + +; NOCMOV-NEXT: flds 8(%esp) +; NOCMOV-NEXT: flds 4(%esp) +; NOCMOV-NEXT: fucompp +; NOCMOV-NEXT: fnstsw %ax +; NOCMOV-NEXT: sahf +; NOCMOV-NEXT: leal 20(%esp), %eax +; NOCMOV-NEXT: jne [[TBB:.LBB[0-9_]+]] +; NOCMOV-NEXT: jp [[TBB]] +; NOCMOV-NEXT: leal 12(%esp), %eax +; NOCMOV-NEXT: [[TBB]]: +; NOCMOV-NEXT: fldl (%eax) +; NOCMOV-NEXT: retl +define double @test_select_fcmp_oeq_f64(float %a, float %b, double %c, double %d) #0 { +entry: + %cmp = fcmp oeq float %a, %b + %r = select i1 %cmp, double %c, double %d + ret double %r +} + +; CHECK-LABEL: test_select_fcmp_oeq_v4i32: + +; CMOV-NEXT: ucomiss %xmm1, %xmm0 +; CMOV-NEXT: jne [[TBB:.LBB[0-9_]+]] +; CMOV-NEXT: jp [[TBB]] +; CMOV-NEXT: movaps %xmm2, %xmm3 +; CMOV-NEXT: [[TBB]]: +; CMOV-NEXT: movaps %xmm3, %xmm0 +; CMOV-NEXT: retq + +; NOCMOV-NEXT: pushl %edi +; NOCMOV-NEXT: pushl %esi +; NOCMOV-NEXT: flds 20(%esp) +; NOCMOV-NEXT: flds 16(%esp) +; NOCMOV-NEXT: fucompp +; NOCMOV-NEXT: fnstsw %ax +; NOCMOV-NEXT: sahf +; NOCMOV-NEXT: leal 40(%esp), %eax +; NOCMOV-NEXT: jne [[TBB:.LBB[0-9_]+]] +; NOCMOV-NEXT: jp [[TBB]] +; NOCMOV-NEXT: leal 24(%esp), %eax +; NOCMOV-NEXT: [[TBB]]: +; NOCMOV-NEXT: movl (%eax), %eax +; NOCMOV-NEXT: leal 44(%esp), %ecx +; NOCMOV-NEXT: jne [[TBB:.LBB[0-9_]+]] +; NOCMOV-NEXT: jp [[TBB]] +; NOCMOV-NEXT: leal 28(%esp), %ecx +; NOCMOV-NEXT: [[TBB]]: +; NOCMOV-NEXT: movl (%ecx), %ecx +; NOCMOV-NEXT: leal 48(%esp), %esi +; NOCMOV-NEXT: jne [[TBB:.LBB[0-9_]+]] +; 
NOCMOV-NEXT: jp [[TBB]] +; NOCMOV-NEXT: leal 32(%esp), %esi +; NOCMOV-NEXT: [[TBB]]: +; NOCMOV-NEXT: movl 12(%esp), %edx +; NOCMOV-NEXT: movl (%esi), %esi +; NOCMOV-NEXT: leal 52(%esp), %edi +; NOCMOV-NEXT: jne [[TBB:.LBB[0-9_]+]] +; NOCMOV-NEXT: jp [[TBB]] +; NOCMOV-NEXT: leal 36(%esp), %edi +; NOCMOV-NEXT: [[TBB]]: +; NOCMOV-NEXT: movl (%edi), %edi +; NOCMOV-NEXT: movl %edi, 12(%edx) +; NOCMOV-NEXT: movl %esi, 8(%edx) +; NOCMOV-NEXT: movl %ecx, 4(%edx) +; NOCMOV-NEXT: movl %eax, (%edx) +; NOCMOV-NEXT: popl %esi +; NOCMOV-NEXT: popl %edi +; NOCMOV-NEXT: retl $4 +define <4 x i32> @test_select_fcmp_oeq_v4i32(float %a, float %b, <4 x i32> %c, <4 x i32> %d) #0 { +entry: + %cmp = fcmp oeq float %a, %b + %r = select i1 %cmp, <4 x i32> %c, <4 x i32> %d + ret <4 x i32> %r +} + +; Also make sure we catch the original code-sequence of interest: + +; CMOV: [[ONE_F32_LCPI:.LCPI.*]]: +; CMOV-NEXT: .long 1065353216 + +; CHECK-LABEL: test_zext_fcmp_une: +; CMOV-NEXT: ucomiss %xmm1, %xmm0 +; CMOV-NEXT: movss [[ONE_F32_LCPI]](%rip), %xmm0 +; CMOV-NEXT: jne [[TBB:.LBB[0-9_]+]] +; CMOV-NEXT: jp [[TBB]] +; CMOV-NEXT: xorps %xmm0, %xmm0 +; CMOV-NEXT: [[TBB]]: +; CMOV-NEXT: retq + +; NOCMOV: jne +; NOCMOV-NEXT: jp +define float @test_zext_fcmp_une(float %a, float %b) #0 { +entry: + %cmp = fcmp une float %a, %b + %conv1 = zext i1 %cmp to i32 + %conv2 = sitofp i32 %conv1 to float + ret float %conv2 +} + +; CMOV: [[ONE_F32_LCPI:.LCPI.*]]: +; CMOV-NEXT: .long 1065353216 + +; CHECK-LABEL: test_zext_fcmp_oeq: +; CMOV-NEXT: ucomiss %xmm1, %xmm0 +; CMOV-NEXT: xorps %xmm0, %xmm0 +; CMOV-NEXT: jne [[TBB:.LBB[0-9_]+]] +; CMOV-NEXT: jp [[TBB]] +; CMOV-NEXT: movss [[ONE_F32_LCPI]](%rip), %xmm0 +; CMOV-NEXT: [[TBB]]: +; CMOV-NEXT: retq + +; NOCMOV: jne +; NOCMOV-NEXT: jp +define float @test_zext_fcmp_oeq(float %a, float %b) #0 { +entry: + %cmp = fcmp oeq float %a, %b + %conv1 = zext i1 %cmp to i32 + %conv2 = sitofp i32 %conv1 to float + ret float %conv2 +} + +attributes #0 = { nounwind } Index: 
test/CodeGen/X86/fast-isel-select-cmov2.ll =================================================================== --- test/CodeGen/X86/fast-isel-select-cmov2.ll +++ test/CodeGen/X86/fast-isel-select-cmov2.ll @@ -15,10 +15,9 @@ define i64 @select_fcmp_oeq_cmov(double %a, double %b, i64 %c, i64 %d) { ; CHECK-LABEL: select_fcmp_oeq_cmov ; CHECK: ucomisd %xmm1, %xmm0 -; CHECK-NEXT: setnp %al -; CHECK-NEXT: sete %cl -; CHECK-NEXT: testb %al, %cl -; CHECK-NEXT: cmoveq %rsi, %rdi +; CHECK-NEXT: cmovneq %rsi, %rdi +; CHECK-NEXT: cmovpq %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax %1 = fcmp oeq double %a, %b %2 = select i1 %1, i64 %c, i64 %d ret i64 %2 @@ -135,10 +134,9 @@ define i64 @select_fcmp_une_cmov(double %a, double %b, i64 %c, i64 %d) { ; CHECK-LABEL: select_fcmp_une_cmov ; CHECK: ucomisd %xmm1, %xmm0 -; CHECK-NEXT: setp %al -; CHECK-NEXT: setne %cl -; CHECK-NEXT: orb %al, %cl -; CHECK-NEXT: cmoveq %rsi, %rdi +; CHECK-NEXT: cmovneq %rdi, %rsi +; CHECK-NEXT: cmovpq %rdi, %rsi +; CHECK-NEXT: movq %rsi, %rax %1 = fcmp une double %a, %b %2 = select i1 %1, i64 %c, i64 %d ret i64 %2