Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11877,6 +11877,98 @@
       return V;
   }
 
+  // If this shuffle's only non-undef input is a bitcasted shuffle, attempt to
+  // merge the two shuffles and suitably bitcast the inputs/output back to
+  // their original types.
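+  // i.e. shuffle(bitcast(shuffle(X, Y)), undef)
+  //        --> bitcast(shuffle(bitcast(X), bitcast(Y))).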
+  if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
+      N1.getOpcode() == ISD::UNDEF && Level < AfterLegalizeVectorOps &&
+      TLI.isTypeLegal(VT)) {
+
+    // Peek through any bitcasts, as long as each has only a single user.
+    SDValue BC0 = N0;
+    while (BC0.getOpcode() == ISD::BITCAST) {
+      if (!BC0.hasOneUse())
+        break;
+      BC0 = BC0.getOperand(0);
+    }
+
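+    // Helper to rescale a shuffle mask to a finer element granularity,
+    // e.g. scaling the mask <0, 3> by 2 gives <0, 1, 6, 7>.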
+    auto ScaleShuffleMask = [](ArrayRef<int> Mask, int Scale) {
+      if (Scale == 1)
+        return SmallVector<int, 8>(Mask.begin(), Mask.end());
+
+      SmallVector<int, 8> NewMask;
+      for (int M : Mask)
+        for (int s = 0; s != Scale; ++s)
+          NewMask.push_back(M < 0 ? -1 : Scale * M + s);
+      return NewMask;
+    };
+
+    if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
+      EVT SVT = VT.getScalarType();
+      EVT InnerVT = BC0->getValueType(0);
+      EVT InnerSVT = InnerVT.getScalarType();
+
+      // Pick whichever of the two vector types has the smaller scalar type.
+      EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT;
+      EVT ScaleSVT = ScaleVT.getScalarType();
+
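+      // Only merge if the rescaled type is legal and both scalar sizes are
+      // whole multiples of the common scalar size.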
+      if (TLI.isTypeLegal(ScaleVT) &&
+          0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&
+          0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) {
+
+        int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();
+        int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits();
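+        // e.g. for an inner v2f64 shuffle feeding an outer v4f32 shuffle,
+        // ScaleVT is v4f32 and InnerScale = 2, OuterScale = 1.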
+
+        // Scale the shuffle masks to the smaller scalar type.
+        ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0);
+        SmallVector<int, 8> InnerMask =
+            ScaleShuffleMask(InnerSVN->getMask(), InnerScale);
+        SmallVector<int, 8> OuterMask =
+            ScaleShuffleMask(SVN->getMask(), OuterScale);
+
+        // Merge the shuffle masks.
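+        // Each lane of the result indexes through both shuffles:
+        // NewMask[i] = InnerMask[OuterMask[i]] (propagating undef lanes).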
+        SmallVector<int, 8> NewMask;
+        for (int M : OuterMask)
+          NewMask.push_back(M < 0 ? -1 : InnerMask[M]);
+
+        // Test for shuffle mask legality over both commutations.
+        SDValue SV0 = BC0->getOperand(0);
+        SDValue SV1 = BC0->getOperand(1);
+        bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
+        if (!LegalMask) {
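+          // Commute the mask: rebase each index onto the opposite operand,
+          // so that swapping SV0 and SV1 gives an equivalent shuffle.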
+          for (int i = 0, e = (int)NewMask.size(); i != e; ++i) {
+            int Idx = NewMask[i];
+            if (Idx < 0)
+              continue;
+            NewMask[i] = Idx < e ? Idx + e : Idx - e;
+          }
+          std::swap(SV0, SV1);
+          LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
+        }
+
+        if (LegalMask) {
+          SV0 = DAG.getNode(ISD::BITCAST, SDLoc(N), ScaleVT, SV0);
+          SV1 = DAG.getNode(ISD::BITCAST, SDLoc(N), ScaleVT, SV1);
+          return DAG.getNode(
+              ISD::BITCAST, SDLoc(N), VT,
+              DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask));
+        }
+      }
+    }
+  }
+
   // Canonicalize shuffles according to rules:
   //  shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
   //  shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
Index: llvm/trunk/test/CodeGen/X86/2013-02-12-ShuffleToZext.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/2013-02-12-ShuffleToZext.ll
+++ llvm/trunk/test/CodeGen/X86/2013-02-12-ShuffleToZext.ll
@@ -1,14 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx -mtriple=x86_64-pc-win32 | FileCheck %s
-
-; CHECK: test
-; CHECK: vpmovzxwd
-; CHECK: vpmovzxwd
-define void @test(<4 x i64> %a, <4 x i16>* %buf) {
-  %ex1 = extractelement <4 x i64> %a, i32 0
-  %ex2 = extractelement <4 x i64> %a, i32 1
-  %x1 = bitcast i64 %ex1 to <4 x i16>
-  %x2 = bitcast i64 %ex2 to <4 x i16>
-  %Sh = shufflevector <4 x i16> %x1, <4 x i16> %x2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  store <4 x i16> %Sh, <4 x i16>* %buf, align 1
-  ret void
-}
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1336,3 +1336,22 @@
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 2, i32 3, i32 16, i32 undef, i32 6, i32 7, i32 16, i32 undef, i32 10, i32 11, i32 16, i32 undef, i32 14, i32 15, i32 16>
   ret <16 x i8> %shuffle
 }
+
+define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_bitcast_unpack:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_bitcast_unpack:
+; AVX:       # BB#0:
+; AVX-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT:    retq
+  %shuffle8  = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 23, i32 6, i32 22, i32 5, i32 21, i32 4, i32 20, i32 3, i32 19, i32 2, i32 18, i32 1, i32 17, i32 0, i32 16>
+  %bitcast32 = bitcast <16 x i8> %shuffle8 to <4 x float>
+  %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bitcast16 = bitcast <4 x float> %shuffle32 to <8 x i16>
+  %shuffle16 = shufflevector <8 x i16> %bitcast16, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %bitcast8  = bitcast <8 x i16> %shuffle16 to <16 x i8>
+  ret <16 x i8> %bitcast8
+}
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -810,6 +810,25 @@
   ret <2 x double> %shuffle
 }
 
+define <2 x double> @shuffle_v2f64_bitcast_1z(<2 x double> %a) {
+; SSE-LABEL: shuffle_v2f64_bitcast_1z:
+; SSE:       # BB#0:
+; SSE-NEXT:    xorpd %xmm1, %xmm1
+; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_bitcast_1z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; AVX-NEXT:    retq
+  %shuffle64 = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
+  %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x float>
+  %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %bitcast64 = bitcast <4 x float> %shuffle32 to <2 x double>
+  ret <2 x double> %bitcast64
+}
+
 define <2 x i64> @insert_reg_and_zero_v2i64(i64 %a) {
 ; SSE-LABEL: insert_reg_and_zero_v2i64:
 ; SSE:       # BB#0:
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1574,6 +1574,23 @@
   ret <4 x i32> %shuffle
 }
 
+define <4 x i32> @shuffle_v4i32_bitcast_0415(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: shuffle_v4i32_bitcast_0415:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_bitcast_0415:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT:    retq
+  %shuffle32 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 0, i32 4>
+  %bitcast64 = bitcast <4 x i32> %shuffle32 to <2 x double>
+  %shuffle64 = shufflevector <2 x double> %bitcast64, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x i32>
+  ret <4 x i32> %bitcast32
+}
+
 define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
 ; SSE-LABEL: insert_reg_and_zero_v4i32:
 ; SSE:       # BB#0:
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -922,3 +922,22 @@
   %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer
   ret <4 x double> %1
 }
+
+define <4 x double> @bitcast_v4f64_0426(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: bitcast_v4f64_0426:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: bitcast_v4f64_0426:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpcklqdq  {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-NEXT:    retq
+  %shuffle64 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
+  %bitcast32 = bitcast <4 x double> %shuffle64 to <8 x float>
+  %shuffle32 = shufflevector <8 x float> %bitcast32, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  %bitcast16 = bitcast <8 x float> %shuffle32 to <16 x i16>
+  %shuffle16 = shufflevector <16 x i16> %bitcast16, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
+  %bitcast64 = bitcast <16 x i16> %shuffle16 to <4 x double>
+  ret <4 x double> %bitcast64
+}
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -8,14 +8,14 @@
 ; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X32-NEXT:    movlpd %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test0:
 ; X64:       ## BB#0: ## %entry
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X64-NEXT:    movq %xmm0, (%rdi)
 ; X64-NEXT:    retq
 entry:
@@ -84,16 +84,15 @@
 ; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    movl L_tmp_V2i$non_lazy_ptr, %eax
 ; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT:    movlpd %xmm0, (%eax)
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT:    movlps %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test2:
 ; X64:       ## BB#0: ## %entry
 ; X64-NEXT:    movq _tmp_V2i@{{.*}}(%rip), %rax
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; X64-NEXT:    movq %xmm0, (%rax)
 ; X64-NEXT:    retq
 entry: