Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12051,9 +12051,9 @@
     }
   }
 
-  // If this shuffle only has a single input that is a bitcasted shuffle,
-  // attempt to merge the 2 shuffles and suitably bitcast the inputs/output
-  // back to their original types.
+  // If this shuffle only has a single input that is a bitcast of either a
+  // shuffle or a scalar_to_vector, attempt to merge the 2 nodes and suitably
+  // bitcast the inputs/output back to their original types.
   if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
       N1.getOpcode() == ISD::UNDEF && Level < AfterLegalizeVectorOps &&
       TLI.isTypeLegal(VT)) {
@@ -12066,6 +12066,44 @@
       BC0 = BC0.getOperand(0);
     }
 
+    EVT SVT = VT.getScalarType();
+    EVT InnerVT = BC0->getValueType(0);
+    EVT InnerSVT = InnerVT.getScalarType();
+
+    // If the shuffle scalar type is smaller than the scalar_to_vector input
+    // type, try to express the shuffle in terms of the bigger scalar.
+    // This lets us recognize special shuffle patterns more easily.
+    if (BC0.getOpcode() == ISD::SCALAR_TO_VECTOR && BC0.hasOneUse() &&
+        InnerSVT.bitsGT(SVT) && TLI.isTypeLegal(InnerVT) &&
+        0 == (InnerSVT.getSizeInBits() % SVT.getSizeInBits())) {
+      // Number of narrow (SVT) lanes that make up one wide (InnerSVT) lane.
+      int Scale = InnerSVT.getSizeInBits() / SVT.getSizeInBits();
+
+      // Each group of Scale mask elements must either be entirely undef, or
+      // select (in place) the pieces of the scalar in wide element 0. A group
+      // with any defined lane must become a copy of element 0, not undef:
+      // turning a defined lane into undef would drop a value the user needs.
+      bool CanFold = true;
+      SmallVector<int, 8> NewMask;
+      for (int i = 0; CanFold && i != NumElts; i += Scale) {
+        bool AnyDefined = false;
+        for (int j = 0; j != Scale; ++j) {
+          int Idx = SVN->getMaskElt(i + j);
+          if (Idx < 0)
+            continue;
+          AnyDefined = true;
+          CanFold &= (Idx == j);
+        }
+        NewMask.push_back(AnyDefined ? 0 : -1);
+      }
+
+      if (CanFold && TLI.isShuffleMaskLegal(NewMask, InnerVT)) {
+        SDValue WideShuf = DAG.getVectorShuffle(
+            InnerVT, SDLoc(N), BC0, DAG.getUNDEF(InnerVT), NewMask);
+        return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, WideShuf);
+      }
+    }
+
     auto ScaleShuffleMask = [](ArrayRef<int> Mask, int Scale) {
       if (Scale == 1)
         return SmallVector<int, 8>(Mask.begin(), Mask.end());
@@ -12078,10 +12116,6 @@
     };
 
     if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
-      EVT SVT = VT.getScalarType();
-      EVT InnerVT = BC0->getValueType(0);
-      EVT InnerSVT = InnerVT.getScalarType();
-
       // Determine which shuffle works with the smaller scalar type.
       EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT;
       EVT ScaleSVT = ScaleVT.getScalarType();
Index: test/CodeGen/AArch64/concat_vectors-combines.ll
===================================================================
--- test/CodeGen/AArch64/concat_vectors-combines.ll
+++ test/CodeGen/AArch64/concat_vectors-combines.ll
@@ -56,14 +56,10 @@
 
 ; Test the (vector_shuffle (concat_vectors (bitcast (scalar)), undef..), undef, <mask>) pattern.
 
-; FIXME: This should use DUP.
 define <8 x i8> @test_shuffle_from_concat_scalar_v2i8_to_v8i8_dup(i32 %x) #0 {
 entry:
 ; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i8_to_v8i8_dup:
-; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0
-; CHECK-NEXT: ins.d v[[V0]][1], v[[V0]][0]
-; CHECK-NEXT: movi.4h v[[V1:[0-9]+]], #0x1, lsl #8
-; CHECK-NEXT: tbl.8b v0, { v[[V0]] }, v[[V1]]
+; CHECK-NEXT: dup.4h v0, w0
 ; CHECK-NEXT: ret
   %t = trunc i32 %x to i16
   %0 = bitcast i16 %t to <2 x i8>
@@ -71,6 +67,29 @@
   ret <8 x i8> %1
 }
 
+define <8 x i16> @test_shuffle_from_concat_scalar_v2i16_to_v8i16_dup(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i16_to_v8i16_dup:
+; CHECK-NEXT: dup.4s v0, w0
+; CHECK-NEXT: ret
+  %0 = bitcast i32 %x to <2 x i16> ; view the i32 scalar as two i16 lanes
+  %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 0, i32 1, i32 0, i32 1> ; each pair is <0,1> or picks only from the undef operand (index 2)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @test_shuffle_from_concat_scalar_v2i16_to_v8i16_duplike_invalid(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i16_to_v8i16_duplike_invalid:
+; CHECK-NEXT: adrp x[[MASKPTR:[0-9]+]], lCPI{{.*}}
+; CHECK-NEXT: ldr q[[V1:[0-9]+]], [x[[MASKPTR]], lCPI{{.*}}
+; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0
+; CHECK-NEXT: tbl.16b v0, { v[[V0]] }, v[[V1]]
+; CHECK-NEXT: ret
+  %0 = bitcast i32 %x to <2 x i16> ; view the i32 scalar as two i16 lanes
+  %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0> ; pairs are <0,0> and <1,0>, never the in-order <0,1>, so a tbl is expected instead of a dup
+  ret <8 x i16> %1
+}
+
 define <8 x i8> @test_shuffle_from_concat_scalar_v2i8_to_v8i8(i32 %x) #0 {
 entry:
 ; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i8_to_v8i8: