Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11985,6 +11985,30 @@
       (N1.getOpcode() == ISD::CONCAT_VECTORS &&
        N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) {
 
+    // Try to avoid smaller vectors: when a shuffle's only non-undef input is a
+    // CONCAT_VECTORS of one bitcast scalar plus undefs, use SCALAR_TO_VECTOR:
+    //   (vector_shuffle (v8i8 concat_vectors (v2i8 bitcast (i16)), undef..), M)
+    // ->
+    //   (vector_shuffle (v8i8 bitcast (v4i16 scalar_to_vector (i16))), M)
+    if (N1.getOpcode() == ISD::UNDEF &&
+        N0->getOperand(0)->getOpcode() == ISD::BITCAST &&
+        std::all_of(
+            N0->ops().begin() + 1, N0->ops().end(),
+            [](const SDValue &Op) { return Op->getOpcode() == ISD::UNDEF; })) {
+      SDValue Scalar = N0->getOperand(0)->getOperand(0);
+      EVT ScVT = Scalar.getValueType();
+      if (!ScVT.isVector()) { // Only fold when the bitcast source is a scalar.
+        SDLoc dl(N); // NOTE(review): VecVT legality is not checked; confirm OK
+        EVT VecVT = EVT::getVectorVT(*DAG.getContext(), ScVT,
+                                     VT.getSizeInBits() / ScVT.getSizeInBits());
+        return DAG.getVectorShuffle(
+            VT, dl,
+            DAG.getNode(ISD::BITCAST, dl, VT,
+                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Scalar)),
+            N1, SVN->getMask());
+      }
+    }
+
     // Try to simplify either the shuffle or the concats.
     if (SDValue V = partitionShuffleOfConcats(N, DAG))
       return V;
Index: test/CodeGen/AArch64/concat_vectors-combines.ll
===================================================================
--- test/CodeGen/AArch64/concat_vectors-combines.ll
+++ test/CodeGen/AArch64/concat_vectors-combines.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple arm64-apple-darwin -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-collect-loh=false -asm-verbose=false | FileCheck %s
+; Disable LOH (linker optimization hint) collection to keep the checked output simple.
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
@@ -53,4 +54,63 @@
   ret <8 x i8> %1
 }
 
+; Test the (vector_shuffle (concat_vectors (bitcast (scalar)), undef..), undef, <mask>) pattern.
+
+; FIXME: This should use DUP.
+define <8 x i8> @test_shuffle_from_concat_scalar_v2i8_to_v8i8_dup(i32 %x) #0 {
+entry: ; The mask repeats lanes <0,1> four times, i.e. a splat of the i16 scalar.
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i8_to_v8i8_dup:
+; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0
+; CHECK-NEXT: ins.d v[[V0]][1], v[[V0]][0]
+; CHECK-NEXT: movi.4h v[[V1:[0-9]+]], #0x1, lsl #8
+; CHECK-NEXT: tbl.8b v0, { v[[V0]] }, v[[V1]]
+; CHECK-NEXT: ret
+  %t = trunc i32 %x to i16 ; i16 scalar that feeds the bitcast
+  %0 = bitcast i16 %t to <2 x i8> ; scalar viewed as two i8 lanes
+  %1 = shufflevector <2 x i8> %0, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+  ret <8 x i8> %1
+}
+
+define <8 x i8> @test_shuffle_from_concat_scalar_v2i8_to_v8i8(i32 %x) #0 {
+entry: ; The mask ends in <1,1> rather than <0,1>, so the result is not a splat.
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i8_to_v8i8:
+; CHECK-NEXT: adrp x[[MASKPTR:[0-9]+]], lCPI{{.*}}
+; CHECK-NEXT: ldr d[[V1:[0-9]+]], [x[[MASKPTR]], lCPI{{.*}}
+; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0
+; CHECK-NEXT: ins.d v[[V0]][1], v[[V0]][0]
+; CHECK-NEXT: tbl.8b v0, { v[[V0]] }, v[[V1]]
+; CHECK-NEXT: ret
+  %t = trunc i32 %x to i16 ; i16 scalar that feeds the bitcast
+  %0 = bitcast i16 %t to <2 x i8> ; scalar viewed as two i8 lanes
+  %1 = shufflevector <2 x i8> %0, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 1>
+  ret <8 x i8> %1
+}
+
+define <8 x i8> @test_shuffle_from_concat_scalar_v4i8_to_v8i8(i32 %x) #0 {
+entry: ; The mask keeps lanes 0-3 in the low half and repeats lane 1 in the high half.
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v4i8_to_v8i8:
+; CHECK-NEXT: adrp x[[MASKPTR:[0-9]+]], lCPI{{.*}}
+; CHECK-NEXT: ldr d[[V1:[0-9]+]], [x[[MASKPTR]], lCPI{{.*}}
+; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0
+; CHECK-NEXT: ins.d v[[V0]][1], v[[V0]][0]
+; CHECK-NEXT: tbl.8b v0, { v[[V0]] }, v[[V1]]
+; CHECK-NEXT: ret
+  %0 = bitcast i32 %x to <4 x i8> ; i32 scalar viewed as four i8 lanes (no trunc here)
+  %1 = shufflevector <4 x i8> %0, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i8> %1
+}
+
+define <8 x i16> @test_shuffle_from_concat_scalar_v2i16_to_v8i16(i32 %x) #0 {
+entry: ; 128-bit result: a <2 x i16> source is shuffled into <8 x i16>.
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i16_to_v8i16:
+; CHECK-NEXT: adrp x[[MASKPTR:[0-9]+]], lCPI{{.*}}
+; CHECK-NEXT: ldr q[[V1:[0-9]+]], [x[[MASKPTR]], lCPI{{.*}}
+; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0
+; CHECK-NEXT: tbl.16b v0, { v[[V0]] }, v[[V1]]
+; CHECK-NEXT: ret
+  %0 = bitcast i32 %x to <2 x i16> ; i32 scalar viewed as two i16 lanes
+  %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 1, i32 0, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i16> %1
+}
+
 attributes #0 = { nounwind }