Index: llvm/trunk/lib/Target/X86/X86FastISel.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86FastISel.cpp
+++ llvm/trunk/lib/Target/X86/X86FastISel.cpp
@@ -1810,11 +1810,11 @@
   return true;
 }
 
-/// \brief Emit SSE instructions to lower the select.
+/// \brief Emit SSE or AVX instructions to lower the select.
 ///
 /// Try to use SSE1/SSE2 instructions to simulate a select without branches.
 /// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
-/// SSE instructions are available.
+/// SSE instructions are available. If AVX is available, try to use a VBLENDV.
 bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
   // Optimize conditions coming from a compare if both instructions are in the
   // same basic block (values defined in other basic blocks may not have
@@ -1850,19 +1850,17 @@
   if (NeedSwap)
     std::swap(CmpLHS, CmpRHS);
 
-  static unsigned OpcTable[2][2][4] = {
-    { { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr },
-      { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr } },
-    { { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr },
-      { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr } }
+  // Choose the SSE instruction sequence based on data type (float or double).
+  static unsigned OpcTable[2][4] = {
+    { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr },
+    { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr }
   };
 
-  bool HasAVX = Subtarget->hasAVX();
   unsigned *Opc = nullptr;
   switch (RetVT.SimpleTy) {
   default: return false;
-  case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break;
-  case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break;
+  case MVT::f32: Opc = &OpcTable[0][0]; break;
+  case MVT::f64: Opc = &OpcTable[1][0]; break;
   }
 
   const Value *LHS = I->getOperand(1);
@@ -1884,14 +1882,33 @@
     return false;
 
   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
-  unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
-                                     CmpRHSReg, CmpRHSIsKill, CC);
-  unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
-                                    LHSReg, LHSIsKill);
-  unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
-                                     RHSReg, RHSIsKill);
-  unsigned ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
-                                       AndReg, /*IsKill=*/true);
+  unsigned ResultReg;
+
+  if (Subtarget->hasAVX()) {
+    // If we have AVX, create 1 blendv instead of 3 logic instructions.
+    // Blendv was introduced with SSE 4.1, but the 2 register form implicitly
+    // uses XMM0 as the selection register. That may need just as many
+    // instructions as the AND/ANDN/OR sequence due to register moves, so
+    // don't bother.
+    unsigned CmpOpcode =
+      (RetVT.SimpleTy == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr;
+    unsigned BlendOpcode =
+      (RetVT.SimpleTy == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
+
+    unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill,
+                                       CmpRHSReg, CmpRHSIsKill, CC);
+    ResultReg = fastEmitInst_rrr(BlendOpcode, RC, RHSReg, RHSIsKill,
+                                 LHSReg, LHSIsKill, CmpReg, true);
+  } else {
+    unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
+                                       CmpRHSReg, CmpRHSIsKill, CC);
+    unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
+                                      LHSReg, LHSIsKill);
+    unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
+                                       RHSReg, RHSIsKill);
+    ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
+                                AndReg, /*IsKill=*/true);
+  }
   updateValueMap(I, ResultReg);
   return true;
 }
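
Both the FastISel and the DAG changes rely on the same equivalence: with a compare
mask that is all-ones or all-zeros, "(mask & c) | (~mask & d)" selects between c
and d, and a single variable blend does the same job. A minimal sketch of that
equivalence, written with intrinsics purely for illustration (the helper names are
invented; FastISel emits the machine instructions directly, and the exact register
assignments are the ones checked in the updated test below):

#include <immintrin.h>

/* Both helpers compute "mask-lane-set ? c : d" for the low float lane.
   Build with SSE4.1 or AVX enabled (e.g. -msse4.1 or -mavx) for blendv. */
static inline float select_via_logic(__m128 c, __m128 d, __m128 mask) {
  /* CMP/AND/ANDN/OR sequence: (mask & c) | (~mask & d)                  */
  return _mm_cvtss_f32(_mm_or_ps(_mm_and_ps(mask, c),
                                 _mm_andnot_ps(mask, d)));
}
static inline float select_via_blendv(__m128 c, __m128 d, __m128 mask) {
  /* One variable blend replaces the three logic instructions.           */
  return _mm_cvtss_f32(_mm_blendv_ps(d, c, mask));
}
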
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -13271,9 +13271,9 @@
   EVT VT = Op1.getValueType();
   SDValue CC;
 
-  // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
-  // are available. Otherwise fp cmovs get lowered into a less efficient branch
-  // sequence later on.
+  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
+  // are available or VBLENDV if AVX is available.
+  // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
   if (Cond.getOpcode() == ISD::SETCC &&
       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
@@ -13288,8 +13288,42 @@
         DAG.getConstant(SSECC, MVT::i8));
       return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
     }
+
     SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
                               DAG.getConstant(SSECC, MVT::i8));
+
+    // If we have AVX, we can use a variable vector select (VBLENDV) instead
+    // of 3 logic instructions for size savings and potentially speed.
+    // Unfortunately, there is no scalar form of VBLENDV.
+
+    // If either operand is a constant, don't try this. We can expect to
+    // optimize away at least one of the logic instructions later in that
+    // case, so that sequence would be faster than a variable blend.
+
+    // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
+    // uses XMM0 as the selection register. That may need just as many
+    // instructions as the AND/ANDN/OR sequence due to register moves, so
+    // don't bother.
+
+    if (Subtarget->hasAVX() &&
+        !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
+
+      // Convert to vectors, do a VSELECT, and convert back to scalar.
+      // All of the conversions should be optimized away.
+
+      EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
+      SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
+      SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
+      SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
+
+      EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
+      VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp);
+
+      SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
+
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+                         VSel, DAG.getIntPtrConstant(0));
+    }
     SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
     SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
     return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
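
Since there is no scalar form of VBLENDV, the lowering added above widens the
scalar operands so an ordinary vector VSELECT can be matched to VBLENDVPS/VBLENDVPD
and then pulls element 0 back out. A rough source-level analogue of that
wrap/blend/extract round trip, again with intrinsics and an invented function name
just for illustration (the patch builds SCALAR_TO_VECTOR, BITCAST, VSELECT and
EXTRACT_VECTOR_ELT nodes, and the extra conversions are expected to fold away):

#include <immintrin.h>

/* Requires SSE4.1 or AVX (e.g. -msse4.1 or -mavx) for _mm_blendv_pd.          */
static inline double select_olt_f64(double a, double b, double c, double d) {
  __m128d mask = _mm_cmplt_sd(_mm_set_sd(a), _mm_set_sd(b)); /* scalar -> vector compare */
  __m128d sel  = _mm_blendv_pd(_mm_set_sd(d),                /* false value              */
                               _mm_set_sd(c),                /* true value               */
                               mask);                        /* one vblendvpd            */
  return _mm_cvtsd_f64(sel);                                 /* vector -> scalar         */
}
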
Index: llvm/trunk/test/CodeGen/X86/fast-isel-select-sse.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/fast-isel-select-sse.ll
+++ llvm/trunk/test/CodeGen/X86/fast-isel-select-sse.ll
@@ -13,9 +13,7 @@
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_oeq_f32
 ; AVX: vcmpeqss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp oeq float %a, %b
 %2 = select i1 %1, float %c, float %d
 ret float %2
@@ -29,9 +27,7 @@
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_oeq_f64
 ; AVX: vcmpeqsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp oeq double %a, %b
 %2 = select i1 %1, double %c, double %d
 ret double %2
@@ -45,9 +41,7 @@
 ; CHECK-NEXT: orps %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_ogt_f32
 ; AVX: vcmpltss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp ogt float %a, %b
 %2 = select i1 %1, float %c, float %d
 ret float %2
@@ -61,9 +55,7 @@
 ; CHECK-NEXT: orpd %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_ogt_f64
 ; AVX: vcmpltsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp ogt double %a, %b
 %2 = select i1 %1, double %c, double %d
 ret double %2
@@ -77,9 +69,7 @@
 ; CHECK-NEXT: orps %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_oge_f32
 ; AVX: vcmpless %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp oge float %a, %b
 %2 = select i1 %1, float %c, float %d
 ret float %2
@@ -93,9 +83,7 @@
 ; CHECK-NEXT: orpd %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_oge_f64
 ; AVX: vcmplesd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp oge double %a, %b
 %2 = select i1 %1, double %c, double %d
 ret double %2
@@ -109,9 +97,7 @@
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_olt_f32
 ; AVX: vcmpltss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp olt float %a, %b
 %2 = select i1 %1, float %c, float %d
 ret float %2
@@ -125,9 +111,7 @@
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_olt_f64
 ; AVX: vcmpltsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp olt double %a, %b
 %2 = select i1 %1, double %c, double %d
 ret double %2
@@ -141,9 +125,7 @@
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_ole_f32
 ; AVX: vcmpless %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp ole float %a, %b
 %2 = select i1 %1, float %c, float %d
 ret float %2
@@ -157,9 +139,7 @@
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_ole_f64
 ; AVX: vcmplesd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp ole double %a, %b
 %2 = select i1 %1, double %c, double %d
 ret double %2
@@ -173,9 +153,7 @@
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_ord_f32
 ; AVX: vcmpordss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp ord float %a, %b
 %2 = select i1 %1, float %c, float %d
 ret float %2
@@ -189,9 +167,7 @@
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_ord_f64
 ; AVX: vcmpordsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp ord double %a, %b
 %2 = select i1 %1, double %c, double %d
 ret double %2
@@ -205,9 +181,7 @@
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_uno_f32
 ; AVX: vcmpunordss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp uno float %a, %b
 %2 = select i1 %1, float %c, float %d
 ret float %2
@@ -221,9 +195,7 @@
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_uno_f64
 ; AVX: vcmpunordsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp uno double %a, %b
 %2 = select i1 %1, double %c, double %d
 ret double %2
@@ -237,9 +209,7 @@
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_ugt_f32
 ; AVX: vcmpnless %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp ugt float %a, %b
 %2 = select i1 %1, float %c, float %d
 ret float %2
@@ -253,9 +223,7 @@
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_ugt_f64
 ; AVX: vcmpnlesd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp ugt double %a, %b
 %2 = select i1 %1, double %c, double %d
 ret double %2
@@ -269,9 +237,7 @@
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_uge_f32
 ; AVX: vcmpnltss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp uge float %a, %b
 %2 = select i1 %1, float %c, float %d
 ret float %2
@@ -285,9 +251,7 @@
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_uge_f64
 ; AVX: vcmpnltsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp uge double %a, %b
 %2 = select i1 %1, double %c, double %d
 ret double %2
@@ -301,9 +265,7 @@
 ; CHECK-NEXT: orps %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_ult_f32
 ; AVX: vcmpnless %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp ult float %a, %b
 %2 = select i1 %1, float %c, float %d
 ret float %2
@@ -317,9 +279,7 @@
 ; CHECK-NEXT: orpd %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_ult_f64
 ; AVX: vcmpnlesd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp ult double %a, %b
 %2 = select i1 %1, double %c, double %d
 ret double %2
@@ -333,9 +293,7 @@
 ; CHECK-NEXT: orps %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_ule_f32
 ; AVX: vcmpnltss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp ule float %a, %b
 %2 = select i1 %1, float %c, float %d
 ret float %2
@@ -349,9 +307,7 @@
 ; CHECK-NEXT: orpd %xmm2, %xmm1
 ; AVX-LABEL: select_fcmp_ule_f64
 ; AVX: vcmpnltsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp ule double %a, %b
 %2 = select i1 %1, double %c, double %d
 ret double %2
@@ -365,9 +321,7 @@
 ; CHECK-NEXT: orps %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_une_f32
 ; AVX: vcmpneqss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp une float %a, %b
 %2 = select i1 %1, float %c, float %d
 ret float %2
@@ -381,9 +335,7 @@
 ; CHECK-NEXT: orpd %xmm2, %xmm0
 ; AVX-LABEL: select_fcmp_une_f64
 ; AVX: vcmpneqsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
 %1 = fcmp une double %a, %b
 %2 = select i1 %1, double %c, double %d
 ret double %2