Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11861,17 +11861,55 @@
 
 static SDValue combineShuffleOfScalarInputs(SDNode *N, SelectionDAG &DAG) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT ResVT = N->getValueType(0);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
 
+  // With a single input (N1 undef), peek through N0's single-use bitcasts.
+  // FIXME: Is it useful to look at bitcasts on both sides?
+  if (N1.getOpcode() == ISD::UNDEF)
+    while (N0.getOpcode() == ISD::BITCAST) {
+      if (!N0.hasOneUse())
+        break;
+      N0 = N0.getOperand(0);
+    }
+
+  // The bitcast source needs to be a vector whose element size is a multiple
+  // of the shuffle element size.  If it isn't, revert to the shuffle operand.
+  if (!N0.getValueType().isVector() ||
+      (N0.getValueType().getScalarSizeInBits() % ResVT.getScalarSizeInBits()))
+    N0 = N->getOperand(0);
+
   EVT VT = N0.getValueType();
   EVT SVT = VT.getScalarType();
   const unsigned NumElts = VT.getVectorNumElements();
+  const unsigned ResNumElts = ResVT.getVectorNumElements();
+
+  const int Scale = SVT.getSizeInBits() / ResVT.getScalarSizeInBits();
+  assert((ResNumElts % Scale) == 0);
 
   SmallVector<SDValue, 8> Ops;
-  for (int M : SVN->getMask()) {
+  for (unsigned i = 0; i != ResNumElts; i += Scale) {
     SDValue Op = DAG.getUNDEF(SVT);
+    int M = SVN->getMaskElt(i);
+
+    if (Scale > 1) {
+      // Normalize undef indices so they pass the multiple-of-Scale check.
+      if (M < 0)
+        M = -Scale;
+      if (M % Scale)
+        return SDValue();
+      // Make sure these are either all undef or consecutive elements.
+      for (int j = 0; j != Scale; ++j) {
+        int InnerM = SVN->getMaskElt(i + j);
+        if (((InnerM < 0) != (M < 0)) ||
+            (InnerM >= 0 && InnerM != M + (int)j))
+          return SDValue();
+      }
+      M /= Scale;
+    }
+
     if (M >= 0) {
       int Idx = M % NumElts;
       SDValue &S = (M < (int)NumElts ? N0 : N1);
@@ -11897,7 +11935,8 @@
       Op = TLI.isZExtFree(Op.getValueType(), SVT)
                ? DAG.getZExtOrTrunc(Op, SDLoc(N), SVT)
                : DAG.getSExtOrTrunc(Op, SDLoc(N), SVT);
-  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Ops);
+  return DAG.getNode(ISD::BITCAST, SDLoc(N), ResVT,
+                     DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Ops));
 }
 
 SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
@@ -12055,7 +12094,7 @@
       return V;
   }
 
-  // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
+  // Attempt to combine a shuffle of 1 or 2 inputs of 'scalar sources' -
   // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
   if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) {
     if (SDValue V = combineShuffleOfScalarInputs(N, DAG))
Index: test/CodeGen/AArch64/concat_vectors-combines.ll
===================================================================
--- test/CodeGen/AArch64/concat_vectors-combines.ll
+++ test/CodeGen/AArch64/concat_vectors-combines.ll
@@ -56,14 +56,10 @@
 
 ; Test the (vector_shuffle (concat_vectors (bitcast (scalar)), undef..), undef, <mask>) pattern.
 
-; FIXME: This should use DUP.
 define <8 x i8> @test_shuffle_from_concat_scalar_v2i8_to_v8i8_dup(i32 %x) #0 {
 entry:
 ; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i8_to_v8i8_dup:
-; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0
-; CHECK-NEXT: ins.d v[[V0]][1], v[[V0]][0]
-; CHECK-NEXT: movi.4h v[[V1:[0-9]+]], #0x1, lsl #8
-; CHECK-NEXT: tbl.8b v0, { v[[V0]] }, v[[V1]]
+; CHECK-NEXT: dup.4h v0, w0
 ; CHECK-NEXT: ret
   %t = trunc i32 %x to i16
   %0 = bitcast i16 %t to <2 x i8>
@@ -71,6 +67,29 @@
   ret <8 x i8> %1
 }
 
+define <8 x i16> @test_shuffle_from_concat_scalar_v2i16_to_v8i16_dup(i32 %x) #0 {
+entry: ; Positive case: the shuffle below should collapse to one 32-bit-lane dup of w0.
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i16_to_v8i16_dup:
+; CHECK-NEXT: dup.4s v0, w0
+; CHECK-NEXT: ret
+  %0 = bitcast i32 %x to <2 x i16> ; view the 32-bit scalar as two i16 lanes
+  %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 0, i32 1, i32 0, i32 1>
+  ret <8 x i16> %1 ; mask pairs are <0, 1> (both lanes of %0, in order) or <2, 2> (undef operand)
+}
+
+define <8 x i16> @test_shuffle_from_concat_scalar_v2i16_to_v8i16_duplike_invalid(i32 %x) #0 {
+entry: ; Negative case: the checks expect a constant-pool mask load plus tbl, not a dup.
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i16_to_v8i16_duplike_invalid:
+; CHECK-NEXT: adrp x[[MASKPTR:[0-9]+]], lCPI{{.*}}
+; CHECK-NEXT: ldr q[[V1:[0-9]+]], [x[[MASKPTR]], lCPI{{.*}}
+; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0
+; CHECK-NEXT: tbl.16b v0, { v[[V0]] }, v[[V1]]
+; CHECK-NEXT: ret
+  %0 = bitcast i32 %x to <2 x i16> ; view the 32-bit scalar as two i16 lanes
+  %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0>
+  ret <8 x i16> %1 ; mask pairs are <0, 0> and <1, 0>: never the in-order pair <0, 1>
+}
+
 define <8 x i8> @test_shuffle_from_concat_scalar_v2i8_to_v8i8(i32 %x) #0 {
 entry:
 ; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i8_to_v8i8:
Index: test/CodeGen/ARM/vector-DAGCombine.ll
===================================================================
--- test/CodeGen/ARM/vector-DAGCombine.ll
+++ test/CodeGen/ARM/vector-DAGCombine.ll
@@ -48,7 +48,8 @@
   %2 = bitcast double %1 to i64
   %3 = insertelement <1 x i64> undef, i64 %2, i32 0
 ; CHECK-NOT: vmov s
-; CHECK: vext.8
+; CHECK: vmov r0, r1, d
+; CHECK: vmov r2, r3, d
   %4 = shufflevector <1 x i64> %3, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
   %tmp2006.3 = bitcast <2 x i64> %4 to <16 x i8>
   %5 = shufflevector <16 x i8> %tmp2006.3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>