Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7446,6 +7446,17 @@
   return DAG.getConstant(Imm, MVT::i8);
 }
 
+/// \brief Helper to test for a load that can be folded with x86 shuffles.
+///
+/// This is particularly important because the set of instructions varies
+/// significantly based on whether the operand is a load or not.
+static bool isShuffleFoldableLoad(SDValue V) {
+  while (V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+
+  return ISD::isNON_EXTLoad(V.getNode());
+}
+
 /// \brief Try to emit a blend instruction for a shuffle.
 ///
 /// This doesn't do any checks for the availability of instructions for blending
@@ -7470,6 +7481,13 @@
   }
   switch (VT.SimpleTy) {
   case MVT::v2f64:
+    // Canonicalize this particular blend because it saves us a bunch of
+    // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+    if (BlendMask == 2 && !isShuffleFoldableLoad(V1)) {
+      std::swap(V1, V2);
+      BlendMask = 1;
+    }
+    // FALLTHROUGH
   case MVT::v4f32:
   case MVT::v4f64:
   case MVT::v8f32:
@@ -8203,17 +8221,6 @@
   return SDValue();
 }
 
-/// \brief Helper to test for a load that can be folded with x86 shuffles.
-///
-/// This is particularly important because the set of instructions varies
-/// significantly based on whether the operand is a load or not.
-static bool isShuffleFoldableLoad(SDValue V) {
-  while (V.getOpcode() == ISD::BITCAST)
-    V = V.getOperand(0);
-
-  return ISD::isNON_EXTLoad(V.getNode());
-}
-
 /// \brief Try to lower insertion of a single element into a zero vector.
 ///
 /// This is a common pattern that we have especially efficient patterns to lower
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -3161,8 +3161,7 @@
 // addss %xmm1, %xmm0
 
 // TODO: Some canonicalization in lowering would simplify the number of
-// patterns we have to try to match. In particular, the reversed order blends
-// seem unnecessary.
+// patterns we have to try to match.
 multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
   let Predicates = [UseSSE1] in {
     // extracted scalar math op with insert via movss
@@ -3263,16 +3262,9 @@
     def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
                        (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
               (!cast<Instruction>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
-
-    // vector math op with insert via blend (reversed order)
-    def : Pat<(v2f64 (X86Blendi
-                       (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                       (v2f64 VR128:$dst), (i8 2))),
-              (!cast<Instruction>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
   }
 
-  // Repeat everything for AVX and add one more pattern
-  // (the scalar + blend reversed order) for good measure.
+  // Repeat everything for AVX.
   let Predicates = [HasAVX] in {
     // extracted scalar math op with insert via movsd
     def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
@@ -3288,13 +3280,6 @@
               (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
                 (COPY_TO_REGCLASS FR64:$src, VR128))>;
 
-    // extracted scalar math op with insert via blend (reversed order)
-    def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector
-                       (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-                       FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-              (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
-                (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
     // vector math op with insert via movsd
     def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                        (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
@@ -3304,12 +3289,6 @@
     def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
                        (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
               (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
-
-    // vector math op with insert via blend (reversed order)
-    def : Pat<(v2f64 (X86Blendi
-                       (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-                       (v2f64 VR128:$dst), (i8 2))),
-              (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
   }
 }
 
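
For reference, the identity the new canonicalization in X86ISelLowering.cpp relies on: a two-lane blend with immediate 2 (0b10) takes lane 0 from its first operand and lane 1 from its second, which produces the same result as an immediate-1 (0b01) blend with the operands swapped. By always rewriting toward the mask-1 form, the .td files only need to match one operand order, which is why the reversed-order patterns above can be deleted. A minimal standalone sketch of that equivalence (plain C++, not LLVM code; the blend() helper and the test values are invented for illustration):

  #include <array>
  #include <cassert>

  using Vec2 = std::array<double, 2>;

  // Models a BLENDPD-style select: bit i of Mask set picks lane i from B,
  // clear picks lane i from A.
  static Vec2 blend(const Vec2 &A, const Vec2 &B, unsigned Mask) {
    Vec2 R;
    for (int i = 0; i < 2; ++i)
      R[i] = (Mask & (1u << i)) ? B[i] : A[i];
    return R;
  }

  int main() {
    Vec2 X = {1.0, 2.0}, Y = {3.0, 4.0};
    // Swapping the operands turns mask 2 (0b10) into mask 1 (0b01); both
    // sides evaluate to {1.0, 4.0}, so lowering only has to emit and
    // pattern-match the mask-1 form.
    assert(blend(X, Y, 2) == blend(Y, X, 1));
    return 0;
  }

The isShuffleFoldableLoad(V1) guard in the lowering change simply skips the swap when the first operand is a plain (non-extending) load, so the canonicalization does not disturb operand orders chosen for load folding.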