Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -160,8 +160,51 @@
 /// we want. It need not be aligned to a 128-bit boundary. That makes
 /// lowering INSERT_VECTOR_ELT operations easier.
 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
-                                  SelectionDAG &DAG,SDLoc dl) {
+                                  SelectionDAG &DAG, SDLoc dl) {
   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
+
+  // For insertion into the zero index (low half) of a 256-bit vector, it is
+  // more efficient to generate a blend with immediate instead of an insert*128.
+  // We are still creating an INSERT_SUBVECTOR below with an undef node to
+  // extend the subvector to the size of the result vector. Make sure that
+  // we are not recursing on that node by checking for undef here.
+  if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
+      Result.getOpcode() != ISD::UNDEF) {
+    EVT ResultVT = Result.getValueType();
+    SDValue ZeroIndex = DAG.getIntPtrConstant(0);
+    SDValue Undef = DAG.getUNDEF(ResultVT);
+    SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
+                                 Vec, ZeroIndex);
+
+    // The blend instruction, and therefore its mask, depends on the data type.
+    MVT ScalarType = ResultVT.getScalarType().getSimpleVT();
+    if (ScalarType.isFloatingPoint()) {
+      // Choose either vblendps (float) or vblendpd (double).
+      unsigned ScalarSize = ScalarType.getSizeInBits();
+      assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
+      unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
+      SDValue Mask = DAG.getConstant(MaskVal, MVT::i8);
+      return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
+    }
+
+    const X86Subtarget &Subtarget =
+        static_cast<const X86Subtarget &>(DAG.getSubtarget());
+
+    // AVX2 is needed for 256-bit integer blend support.
+    // Integers must be cast to 32-bit because there is only vpblendd;
+    // vpblendw can't be used here because its mask is replicated per 128-bit lane.
+
+    // If we don't have AVX2, then cast to float. Using a wrong-domain blend
+    // is still more efficient than the wrong-domain vinsertf128 that would be
+    // created by InsertSubVector().
+    MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
+
+    SDValue Mask = DAG.getConstant(0x0f, MVT::i8);
+    Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256);
+    Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
+    return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256);
+  }
+
   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 }
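For reference (not part of the patch): the floating-point path above corresponds, at the intrinsics level, to the rewrite sketched below. This is a minimal sketch; the helper names insert_low_insertf128 and insert_low_blend are invented for illustration, and only the AVX intrinsics themselves are real. It shows why a low-lane insert can be expressed as a blend with immediate 0x0f (floats) or 0x03 (doubles) instead of a lane-crossing vinsertf128.

    // Sketch only; assumes AVX (compile with -mavx). Not taken from the patch.
    #include <immintrin.h>

    // Old lowering: insert the 128-bit value into the low lane of the 256-bit value.
    static __m256 insert_low_insertf128(__m256 v, __m128 lo) {
      return _mm256_insertf128_ps(v, lo, 0);        // vinsertf128 $0
    }

    // New lowering: widen 'lo' (upper half undefined) and blend its low four
    // floats over 'v'. Immediate 0x0f selects elements 0-3 from the second operand.
    static __m256 insert_low_blend(__m256 v, __m128 lo) {
      __m256 lo256 = _mm256_castps128_ps256(lo);    // no instruction emitted
      return _mm256_blend_ps(v, lo256, 0x0f);       // vblendps $15
    }

Both helpers produce the same value when the insert index is 0; the blend form never moves data across 128-bit lanes, which is why the new lowering prefers it.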
Index: llvm/trunk/test/CodeGen/X86/2012-04-26-sdglue.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ llvm/trunk/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -5,10 +5,10 @@
 ; It's hard to test for the ISEL condition because CodeGen optimizes
 ; away the bugpointed code. Just ensure the basics are still there.
 ;CHECK-LABEL: func:
-;CHECK: vpxor
-;CHECK: vinserti128
+;CHECK: vxorps
 ;CHECK: vpshufd
 ;CHECK: vpbroadcastd
+;CHECK: vinserti128
 ;CHECK: vmulps
 ;CHECK: vmulps
 ;CHECK: ret
Index: llvm/trunk/test/CodeGen/X86/avx-cast.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx-cast.ll
+++ llvm/trunk/test/CodeGen/X86/avx-cast.ll
@@ -1,51 +1,100 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+
+; Prefer a blend instruction to a vinsert128 instruction because blends
+; are simpler (no lane changes) and therefore will have equal or better
+; performance.
-; CHECK-LABEL: castA:
-; CHECK: vxorps
-; CHECK-NEXT: vinsertf128 $0
 define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castA:
+; AVX1: vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: castA:
+; AVX2: vxorps %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: retq
+
 entry:
   %shuffle.i = shufflevector <4 x float> %m, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x float> %shuffle.i
 }

-; CHECK-LABEL: castB:
-; CHECK: vxorps
-; CHECK-NEXT: vinsertf128 $0
 define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castB:
+; AVX1: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: castB:
+; AVX2: vxorpd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX2-NEXT: retq
+
 entry:
   %shuffle.i = shufflevector <2 x double> %m, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x double> %shuffle.i
 }

-; CHECK-LABEL: castC:
-; CHECK: vxorps
-; CHECK-NEXT: vinsertf128 $0
+; AVX2 is needed for integer types.
+
 define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castC:
+; AVX1: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: castC:
+; AVX2: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: retq
+
 entry:
   %shuffle.i = shufflevector <2 x i64> %m, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i64> %shuffle.i
 }

-; CHECK-LABEL: castD:
-; CHECK-NOT: vextractf128 $0
+; The next three tests don't need any shuffling. There may or may not be a
+; vzeroupper before the return, so just check for the absence of shuffles.
+
 define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castD:
+; AVX1-NOT: extract
+; AVX1-NOT: blend
+;
+; AVX2-LABEL: castD:
+; AVX2-NOT: extract
+; AVX2-NOT: blend
+
 entry:
   %shuffle.i = shufflevector <8 x float> %m, <8 x float> %m, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x float> %shuffle.i
 }

-; CHECK-LABEL: castE:
-; CHECK-NOT: vextractf128 $0
 define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castE:
+; AVX1-NOT: extract
+; AVX1-NOT: blend
+;
+; AVX2-LABEL: castE:
+; AVX2-NOT: extract
+; AVX2-NOT: blend
+
 entry:
   %shuffle.i = shufflevector <4 x i64> %m, <4 x i64> %m, <2 x i32> <i32 0, i32 1>
   ret <2 x i64> %shuffle.i
 }

-; CHECK-LABEL: castF:
-; CHECK-NOT: vextractf128 $0
 define <2 x double> @castF(<4 x double> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castF:
+; AVX1-NOT: extract
+; AVX1-NOT: blend
+;
+; AVX2-LABEL: castF:
+; AVX2-NOT: extract
+; AVX2-NOT: blend
+
 entry:
   %shuffle.i = shufflevector <4 x double> %m, <4 x double> %m, <2 x i32> <i32 0, i32 1>
   ret <2 x double> %shuffle.i
 }
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -652,12 +652,12 @@
 ;
 ; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-NEXT: movl $15, %eax
 ; AVX2-NEXT: vmovd %eax, %xmm1
 ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vinserti128 $0, %xmm1, %ymm2, %ymm1
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd $15, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
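For reference (not part of the patch): the integer cast case (castC in avx-cast.ll above) exercises the non-floating-point branch of Insert128BitVector. The sketch below is a rough intrinsics-level analogue of the two subtarget variants; the helper names zext_low_avx2 and zext_low_avx1 are invented for illustration, and the instructions ultimately selected are up to the compiler.

    // Sketch only; not taken from the patch. zext_low_avx2 assumes AVX2 (-mavx2),
    // zext_low_avx1 assumes only AVX (-mavx).
    #include <immintrin.h>

    // With AVX2 there is a 256-bit integer blend (vpblendd), so the low 128 bits
    // can be kept and the upper 128 bits zeroed directly in the integer domain.
    static __m256i zext_low_avx2(__m128i lo) {
      __m256i lo256 = _mm256_castsi128_si256(lo);           // upper half undefined
      return _mm256_blend_epi32(lo256, _mm256_setzero_si256(), 0xf0);
    }

    // Without AVX2 there is no 256-bit integer blend, so blend in the float
    // domain instead; a wrong-domain blend is still cheaper than the
    // lane-crossing vinsertf128 that InsertSubVector() would otherwise create.
    static __m256i zext_low_avx1(__m128i lo) {
      __m256 lo256 = _mm256_castps128_ps256(_mm_castsi128_ps(lo));
      __m256 res = _mm256_blend_ps(lo256, _mm256_setzero_ps(), 0xf0);
      return _mm256_castps_si256(res);
    }

This mirrors the CastVT selection in the lowering (MVT::v8i32 with AVX2, MVT::v8f32 otherwise) and the vpblendd/vblendps pair checked for castC.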