Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -9889,10 +9889,7 @@
     V1Mask[V2Index] = -1;
     if (!isNoopShuffleMask(V1Mask))
       return SDValue();
-    // This is essentially a special case blend operation, but if we have
-    // general purpose blend operations, they are always faster. Bail and let
-    // the rest of the lowering handle these as blends.
-    if (Subtarget.hasSSE41())
+    if (!VT.is128BitVector())
       return SDValue();
 
     // Otherwise, use MOVSD or MOVSS.
Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td
@@ -9732,23 +9732,11 @@
     (!cast<Instruction>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
      (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
 
-  // extracted scalar math op with insert via blend
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector
-        (Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))),
-        FR32X:$src))), (i8 1))),
-    (!cast<Instruction>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
-     (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
-
   // vector math op with insert via movss
   def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst),
          (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)))),
     (!cast<Instruction>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>;
 
-  // vector math op with insert via blend
-  def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst),
-        (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)), (i8 1))),
-    (!cast<Instruction>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>;
-
   // extracted masked scalar math op with insert via movss
   def : Pat<(X86Movss (v4f32 VR128X:$src1),
              (scalar_to_vector
@@ -9776,23 +9764,11 @@
     (!cast<Instruction>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst,
      (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
 
-  // extracted scalar math op with insert via blend
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector
-        (Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))),
-        FR64X:$src))), (i8 1))),
-    (!cast<Instruction>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst,
-     (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
-
   // vector math op with insert via movsd
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst),
         (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)))),
    (!cast<Instruction>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>;
 
-  // vector math op with insert via blend
-  def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst),
-        (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)), (i8 1))),
-    (!cast<Instruction>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>;
-
   // extracted masked scalar math op with insert via movss
   def : Pat<(X86Movsd (v2f64 VR128X:$src1),
              (scalar_to_vector
Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -2911,22 +2911,6 @@
       (!cast<Instruction>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
   }
 
-  // With SSE 4.1, blendi is preferred to movsd, so match that too.
-  let Predicates = [UseSSE41] in {
-    // extracted scalar math op with insert via blend
-    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-          (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
-          FR32:$src))), (i8 1))),
-      (!cast<Instruction>(OpcPrefix#SSrr_Int) v4f32:$dst,
-       (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
-    // vector math op with insert via blend
-    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
-          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
-      (!cast<Instruction>(OpcPrefix#SSrr_Int)v4f32:$dst, v4f32:$src)>;
-
-  }
-
   // Repeat everything for AVX.
   let Predicates = [UseAVX] in {
     // extracted scalar math op with insert via movss
@@ -2936,22 +2920,10 @@
       (!cast<Instruction>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
       (COPY_TO_REGCLASS FR32:$src, VR128))>;
 
-    // extracted scalar math op with insert via blend
-    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-          (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
-          FR32:$src))), (i8 1))),
-      (!cast<Instruction>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
-       (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
    // vector math op with insert via movss
    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
           (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
      (!cast<Instruction>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
-
-    // vector math op with insert via blend
-    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
-          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
-      (!cast<Instruction>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
   }
 }
 
@@ -2975,21 +2947,6 @@
       (!cast<Instruction>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
   }
 
-  // With SSE 4.1, blendi is preferred to movsd, so match those too.
-  let Predicates = [UseSSE41] in {
-    // extracted scalar math op with insert via blend
-    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
-          (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
-          FR64:$src))), (i8 1))),
-      (!cast<Instruction>(OpcPrefix#SDrr_Int) v2f64:$dst,
-       (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
-    // vector math op with insert via blend
-    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
-          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
-      (!cast<Instruction>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
-  }
-
   // Repeat everything for AVX.
   let Predicates = [UseAVX] in {
     // extracted scalar math op with insert via movsd
@@ -2999,22 +2956,10 @@
       (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
       (COPY_TO_REGCLASS FR64:$src, VR128))>;
 
-    // extracted scalar math op with insert via blend
-    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
-          (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
-          FR64:$src))), (i8 1))),
-      (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
-       (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
    // vector math op with insert via movsd
    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
           (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
      (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
-
-    // vector math op with insert via blend
-    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
-          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
-      (!cast<Instruction>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
   }
 }
 
@@ -3301,19 +3246,10 @@
       (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
   }
 
-  // With SSE 4.1, blendi is preferred to movs*, so match that too.
-  let Predicates = [UseSSE41] in {
-    def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
-              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
-  }
-
   // Repeat for AVX versions of the instructions.
   let Predicates = [HasAVX] in {
     def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
               (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
-
-    def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
-              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
   }
 }
 
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1220,8 +1220,8 @@
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
-; AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
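For readers skimming the patch, here is a rough LLVM IR sketch (not part of the change; the function names are invented for illustration) of the two shapes the affected code handles. The lowering change in X86ISelLowering.cpp means that a 128-bit low-lane insertion is now lowered with MOVSS/MOVSD even when SSE4.1 is available, instead of being deferred to the generic blend lowering, which is why the X86Blendi variants of the scalar-math patterns can be deleted and only the X86Movss/X86Movsd forms are kept. Exactly which instructions are selected still depends on the subtarget and surrounding code.

; Hypothetical examples, not taken from the patch or its tests.

; Scalar FP op whose result is inserted back into the low lane: the shape the
; "extracted scalar math op with insert via movss" patterns are written for
; (for fadd this is intended to select the ADDSS intrinsic-form instruction).
define <4 x float> @scalar_add_into_low_lane(<4 x float> %v, float %s) {
  %lo  = extractelement <4 x float> %v, i32 0
  %add = fadd float %lo, %s
  %res = insertelement <4 x float> %v, float %add, i32 0
  ret <4 x float> %res
}

; Plain low-element insertion shuffle: with the X86ISelLowering.cpp change,
; 128-bit cases like this take the MOVSS/MOVSD path rather than bailing out
; to the SSE4.1 blend lowering.
define <4 x float> @insert_low_element(<4 x float> %a, <4 x float> %b) {
  %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %r
}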