Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -22490,6 +22490,7 @@
 }
 
 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+/// FIXME: This could be expanded to support 512-bit vectors as well.
 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget* Subtarget) {
@@ -22499,6 +22500,7 @@
   SDValue V2 = SVOp->getOperand(1);
   MVT VT = SVOp->getSimpleValueType(0);
   unsigned NumElems = VT.getVectorNumElements();
+  unsigned HalfNumElems = NumElems / 2;
 
   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
       V2.getOpcode() == ISD::CONCAT_VECTORS) {
@@ -22523,9 +22525,9 @@
     // To match the shuffle mask, the first half of the mask should
     // be exactly the first vector, and all the rest a splat with the
     // first element of the second one.
-    for (unsigned i = 0; i != NumElems/2; ++i)
+    for (unsigned i = 0; i != HalfNumElems; ++i)
       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
-          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
+          !isUndefOrEqual(SVOp->getMaskElt(i + HalfNumElems), NumElems))
         return SDValue();
 
     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
@@ -22569,7 +22571,7 @@
   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
   if (isShuffleHigh128VectorInsertLow(SVOp)) {
-    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
+    SDValue V = Extract128BitVector(V1, HalfNumElems, DAG, dl);
     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
     return DCI.CombineTo(N, InsV);
   }
 
@@ -22577,10 +22579,40 @@
   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
   if (isShuffleLow128VectorInsertHigh(SVOp)) {
     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, HalfNumElems, DAG, dl);
     return DCI.CombineTo(N, InsV);
   }
 
+  // vector_shuffle <x, x, x, x, u, u, u, u> or <x, x, u, u>
+  if (isUndefInRange(SVOp->getMask(), HalfNumElems, HalfNumElems)) {
+    // If the shuffle only uses the lower halves of the inputs,
+    // then extract them and perform the 'half' shuffle.
+    bool AllLowerHalf = true;
+    SmallVector<int, 8> HalfMask;
+    for (unsigned i = 0; i != HalfNumElems; ++i) {
+      int M = SVOp->getMaskElt(i);
+      if (M < 0) {
+        HalfMask.push_back(M);
+        continue;
+      }
+      AllLowerHalf &= (M % NumElems) < HalfNumElems;
+      if (M >= (int)NumElems) {
+        HalfMask.push_back((M % NumElems) + HalfNumElems);
+        continue;
+      }
+      HalfMask.push_back(M);
+    }
+
+    if (AllLowerHalf) {
+      MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElems);
+      SDValue Half1 = Extract128BitVector(V1, 0, DAG, dl);
+      SDValue Half2 = Extract128BitVector(V2, 0, DAG, dl);
+      SDValue V = DAG.getVectorShuffle(HalfVT, dl, Half1, Half2, HalfMask);
+      SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
+      return DCI.CombineTo(N, InsV);
+    }
+  }
+
   return SDValue();
 }
 
@@ -26349,7 +26381,7 @@
 
   // If we're negating a FMUL node on a target with FMA, then we can avoid the
   // use of a constant by performing (-0 - A*B) instead.
-  // FIXME: Check rounding control flags as well once it becomes available. 
+  // FIXME: Check rounding control flags as well once it becomes available.
   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
       Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
Index: test/CodeGen/X86/vector-shuffle-256-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3284,6 +3284,15 @@
   ret <16 x i16> %shuffle
 }
 
+define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
+; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i16> %shuffle
+}
+
 define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
 ; AVX1:       # BB#0:
Index: test/CodeGen/X86/vector-shuffle-256-v32.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -2013,6 +2013,15 @@
   ret <32 x i8> %shuffle
 }
 
+define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
+; ALL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <32 x i8> %shuffle
+}
+
 define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
 ; AVX1:       # BB#0:
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -491,7 +491,7 @@
 define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) {
 ; ALL-LABEL: shuffle_v4f64_11uu:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; ALL-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
   ret <4 x double> %shuffle
@@ -1135,20 +1135,10 @@
 }
 
 define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: shuffle_v4i64_11uu:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v4i64_11uu:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v4i64_11uu:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v4i64_11uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
   ret <4 x i64> %shuffle
 }
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -864,6 +864,15 @@
   ret <8 x float> %shuffle
 }
 
+define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_1111uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
 define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_5555uuuu:
 ; AVX1:       # BB#0:
@@ -1961,6 +1970,15 @@
   ret <8 x i32> %shuffle
 }
 
+define <8 x i32> @shuffle_v8i32_2222uuuu(<8 x i32> %a, <8 x i32> %b) {
+; ALL-LABEL: shuffle_v8i32_2222uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
 define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_44444444:
 ; AVX1:       # BB#0:
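
Note for reviewers: below is a minimal standalone sketch of the HalfMask remapping that the new lower-half combine in the X86ISelLowering.cpp hunk performs. It is illustrative only, not LLVM code: plain C++ with no LLVM dependencies, a made-up v8i32 width, and a made-up example mask, using -1 for undef lanes as SelectionDAG shuffle masks do. Undef lanes pass through, lanes referencing V2 are rebased onto the extracted lower half of V2 (operand 1 of the narrower shuffle), and the combine only fires when every referenced lane lies in the lower 128-bit half of its source.

#include <cstdio>
#include <vector>

int main() {
  const int NumElems = 8;                // e.g. v8i32 (made-up example width)
  const int HalfNumElems = NumElems / 2;

  // Example mask <1, 9, u, u, u, u, u, u>: lane 1 of V1, lane 1 of V2,
  // upper half entirely undef.
  std::vector<int> Mask = {1, 9, -1, -1, -1, -1, -1, -1};

  bool AllLowerHalf = true;
  std::vector<int> HalfMask;
  for (int i = 0; i != HalfNumElems; ++i) {
    int M = Mask[i];
    if (M < 0) {
      HalfMask.push_back(M);             // undef lanes stay undef
      continue;
    }
    // The combine only applies if every referenced lane sits in the
    // lower 128-bit half of its source vector.
    AllLowerHalf &= (M % NumElems) < HalfNumElems;
    if (M >= NumElems) {
      // V2 lanes: rebase to index the extracted half of V2, which
      // becomes operand 1 of the narrower shuffle.
      HalfMask.push_back((M % NumElems) + HalfNumElems);
      continue;
    }
    HalfMask.push_back(M);               // V1 lanes keep their index
  }

  std::printf("AllLowerHalf = %d, HalfMask =", (int)AllLowerHalf);
  for (int M : HalfMask)
    std::printf(" %d", M);
  std::printf("\n");
  return 0;
}

Running it prints "AllLowerHalf = 1, HalfMask = 1 5 -1 -1": the v8i32 shuffle <1, 9, u, u, u, u, u, u> becomes a v4i32 shuffle <1, 5, u, u> of the two extracted 128-bit halves, which is the same narrowing the updated tests above check for (ymm shuffles collapsing to single xmm instructions).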