Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10780,9 +10780,6 @@
       SDValue ExtVal = Extract.getOperand(1);
       unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue();
       if (Extract.getOperand(0) == VecIn1) {
-        if (ExtIndex > VT.getVectorNumElements())
-          return SDValue();
-
         Mask.push_back(ExtIndex);
         continue;
       }
@@ -10797,25 +10794,52 @@
 
     // We can't generate a shuffle node with mismatched input and output types.
     // Attempt to transform a single input vector to the correct type.
-    if ((VT != VecIn1.getValueType())) {
-      // We don't support shuffeling between TWO values of different types.
+    EVT VecIn1VT = VecIn1.getValueType();
+    if ((VT != VecIn1VT)) {
+      // We don't support shuffling between TWO values of different types.
       if (VecIn2.getNode())
         return SDValue();
+      
+      // If the input vector type has a different base type to the output
+      // vector type, bail out.
+      if (VecIn1VT.getVectorElementType() != VT.getVectorElementType())
+        return SDValue();
 
+      // See if this is a vector extraction from a larger vector.
+      // Ignore single element build vectors (just 1 mask element) because
+      // that's better handled as a scalar, not a vector.
+
+      // TODO: We should be able to allow an aribtrarily larger source vector to
+      // be extracted into a smaller vector, but this may cause silently wrong
+      // codegen in the x86 backend at least. For now, limit the transform to
+      // a simple upper/lower half-size extraction.
+      if (VecIn1VT.getSizeInBits() == (VT.getSizeInBits() * 2) &&
+          Mask.size() > 1) {
+        int StartIdx = Mask[0];
+        bool IsExtract = true;
+        // The mask must specify consecutive elements from the source vector.
+        for (int i = 0, e = Mask.size(); i < e; i++) {
+          if (Mask[i] != (i + StartIdx)) {
+            IsExtract = false;
+            break;
+          }
+        }
+        if (IsExtract)
+          // TODO: See comment above; we should be able to remove this check.
+          if (StartIdx == 0 || StartIdx == (signed) VT.getVectorNumElements()) {
+            SDValue VecIdx = DAG.getIntPtrConstant(StartIdx);
+            return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1, VecIdx);
+          }
+      }
+      
       // We only support widening of vectors which are half the size of the
       // output registers. For example XMM->YMM widening on X86 with AVX.
-      if (VecIn1.getValueType().getSizeInBits()*2 != VT.getSizeInBits())
-        return SDValue();
-
-      // If the input vector type has a different base type to the output
-      // vector type, bail out.
-      if (VecIn1.getValueType().getVectorElementType() !=
-          VT.getVectorElementType())
+      if (VecIn1VT.getSizeInBits() * 2 != VT.getSizeInBits())
         return SDValue();
 
       // Widen the input vector by adding undef values.
       VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
-                           VecIn1, DAG.getUNDEF(VecIn1.getValueType()));
+                           VecIn1, DAG.getUNDEF(VecIn1VT));
     }
 
     if (UsesZeroVector)
Index: test/CodeGen/X86/vec_extract-avx.ll
===================================================================
--- test/CodeGen/X86/vec_extract-avx.ll
+++ test/CodeGen/X86/vec_extract-avx.ll
@@ -0,0 +1,81 @@
+; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
+
+; When extracting multiple consecutive elements from a larger
+; vector into a smaller one, do it efficiently. We should use
+; an EXTRACT_SUBVECTOR node internally rather than a bunch of
+; single element extractions. 
+
+; Extracting the low elements only requires using the right kind of store.
+define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
+  %ext0 = extractelement <8 x float> %v, i32 0
+  %ext1 = extractelement <8 x float> %v, i32 1
+  %ext2 = extractelement <8 x float> %v, i32 2
+  %ext3 = extractelement <8 x float> %v, i32 3
+  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+  store <4 x float> %ins3, <4 x float>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: low_v8f32_to_v4f32
+; CHECK: vmovaps
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Extracting the high elements requires just one AVX instruction. 
+define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
+  %ext0 = extractelement <8 x float> %v, i32 4
+  %ext1 = extractelement <8 x float> %v, i32 5
+  %ext2 = extractelement <8 x float> %v, i32 6
+  %ext3 = extractelement <8 x float> %v, i32 7
+  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+  store <4 x float> %ins3, <4 x float>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: high_v8f32_to_v4f32
+; CHECK: vextractf128
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Make sure element type doesn't alter the codegen. Note that
+; if we were actually using the vector in this function and
+; have AVX2, we should generate vextracti128 (the int version).
+define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) {
+  %ext0 = extractelement <8 x i32> %v, i32 4
+  %ext1 = extractelement <8 x i32> %v, i32 5
+  %ext2 = extractelement <8 x i32> %v, i32 6
+  %ext3 = extractelement <8 x i32> %v, i32 7
+  %ins0 = insertelement <4 x i32> undef, i32 %ext0, i32 0
+  %ins1 = insertelement <4 x i32> %ins0, i32 %ext1, i32 1
+  %ins2 = insertelement <4 x i32> %ins1, i32 %ext2, i32 2
+  %ins3 = insertelement <4 x i32> %ins2, i32 %ext3, i32 3
+  store <4 x i32> %ins3, <4 x i32>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: high_v8i32_to_v4i32
+; CHECK: vextractf128
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Make sure that element size doesn't alter the codegen.
+define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) {
+  %ext0 = extractelement <4 x double> %v, i32 2
+  %ext1 = extractelement <4 x double> %v, i32 3
+  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+  store <2 x double> %ins1, <2 x double>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: high_v4f64_to_v2f64
+; CHECK: vextractf128
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+