Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11499,6 +11499,34 @@
   return SDValue();
 }
 
+static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // If the operands are legal vectors, leave them alone.
+  if (TLI.isTypeLegal(N->getOperand(0).getValueType()))
+    return SDValue();
+
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SmallVector<SDValue, 8> Ops;
+
+  EVT SVT = EVT::getIntegerVT(*DAG.getContext(),
+                              N->getOperand(0).getValueSizeInBits());
+  for (const SDValue &Op : N->ops()) {
+    if (ISD::BITCAST == Op.getOpcode() &&
+        !Op.getOperand(0).getValueType().isVector())
+      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, SVT, Op.getOperand(0)));
+    else if (ISD::UNDEF == Op.getOpcode())
+      Ops.push_back(DAG.getUNDEF(SVT));
+    else
+      return SDValue();
+  }
+
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT,
+                               VT.getSizeInBits() / SVT.getSizeInBits());
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops));
+}
+
 SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
   // TODO: Check to see if this is a CONCAT_VECTORS of a bunch of
   // EXTRACT_SUBVECTOR operations. If so, and if the EXTRACT_SUBVECTOR vector
@@ -11601,6 +11629,10 @@
     return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Opnds);
   }
 
+  // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
+  if (SDValue V = combineConcatVectorOfScalars(N, DAG))
+    return V;
+
   // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
   // nodes often generate nop CONCAT_VECTOR nodes.
   // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that
Index: test/CodeGen/AArch64/concat_vector-scalar-combine.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/concat_vector-scalar-combine.ll
@@ -0,0 +1,93 @@
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -aarch64-collect-loh=false -asm-verbose=false | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Test the (vector_shuffle (concat_vectors (bitcast (scalar)), undef..), undef, <mask>) pattern.
+
+define <8 x i8> @test_shuffle_from_concat_scalar_v2i8_to_v8i8_dup(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i8_to_v8i8_dup:
+; CHECK-NEXT: dup.4h v0, w0
+; CHECK-NEXT: ret
+  %t = trunc i32 %x to i16
+  %0 = bitcast i16 %t to <2 x i8>
+  %1 = shufflevector <2 x i8> %0, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+  ret <8 x i8> %1
+}
+
+define <8 x i8> @test_shuffle_from_concat_scalar_v4i8_to_v8i8_dup(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v4i8_to_v8i8_dup:
+; CHECK-NEXT: dup.2s v0, w0
+; CHECK-NEXT: ret
+  %0 = bitcast i32 %x to <4 x i8>
+  %1 = shufflevector <4 x i8> %0, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  ret <8 x i8> %1
+}
+
+define <8 x i16> @test_shuffle_from_concat_scalar_v2i16_to_v8i16_dup(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i16_to_v8i16_dup:
+; CHECK-NEXT: dup.4s v0, w0
+; CHECK-NEXT: ret
+  %0 = bitcast i32 %x to <2 x i16>
+  %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @test_shuffle_from_concat_scalar_v2i16_to_v8i16_duplike_invalid(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i16_to_v8i16_duplike_invalid:
+; CHECK-NEXT: adrp x[[MASKPTR:[0-9]+]], {{.*}}CPI{{.*}}
+; CHECK-NEXT: ldr q[[V1:[0-9]+]], [x[[MASKPTR]], {{.*}}CPI{{.*}}
+; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0
+; CHECK-NEXT: tbl.16b v0, { v[[V0]] }, v[[V1]]
+; CHECK-NEXT: ret
+  %0 = bitcast i32 %x to <2 x i16>
+  %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> 
+  ret <8 x i16> %1
+}
+
+define <8 x i8> @test_shuffle_from_concat_scalar_v2i8_to_v8i8(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i8_to_v8i8:
+; CHECK-NEXT: adrp x[[MASKPTR:[0-9]+]], {{.*}}CPI{{.*}}
+; CHECK-NEXT: ldr d[[V1:[0-9]+]], [x[[MASKPTR]], {{.*}}CPI{{.*}}
+; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0
+; CHECK-NEXT: ins.d v[[V0]][1], v[[V0]][0]
+; CHECK-NEXT: tbl.8b v0, { v[[V0]] }, v[[V1]]
+; CHECK-NEXT: ret
+  %t = trunc i32 %x to i16
+  %0 = bitcast i16 %t to <2 x i8>
+  %1 = shufflevector <2 x i8> %0, <2 x i8> undef, <8 x i32> 
+  ret <8 x i8> %1
+}
+
+define <8 x i8> @test_shuffle_from_concat_scalar_v4i8_to_v8i8(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v4i8_to_v8i8:
+; CHECK-NEXT: adrp x[[MASKPTR:[0-9]+]], {{.*}}CPI{{.*}}
+; CHECK-NEXT: ldr d[[V1:[0-9]+]], [x[[MASKPTR]], {{.*}}CPI{{.*}}
+; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0
+; CHECK-NEXT: ins.d v[[V0]][1], v[[V0]][0]
+; CHECK-NEXT: tbl.8b v0, { v[[V0]] }, v[[V1]]
+; CHECK-NEXT: ret
+  %0 = bitcast i32 %x to <4 x i8>
+  %1 = shufflevector <4 x i8> %0, <4 x i8> undef, <8 x i32> 
+  ret <8 x i8> %1
+}
+
+define <8 x i16> @test_shuffle_from_concat_scalar_v2i16_to_v8i16(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i16_to_v8i16:
+; CHECK-NEXT: adrp x[[MASKPTR:[0-9]+]], {{.*}}CPI{{.*}}
+; CHECK-NEXT: ldr q[[V1:[0-9]+]], [x[[MASKPTR]], {{.*}}CPI{{.*}}
+; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0
+; CHECK-NEXT: tbl.16b v0, { v[[V0]] }, v[[V1]]
+; CHECK-NEXT: ret
+  %0 = bitcast i32 %x to <2 x i16>
+  %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> 
+  ret <8 x i16> %1
+}
+
+attributes #0 = { nounwind }
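Not part of the patch, a minimal sketch of the rewrite combineConcatVectorOfScalars performs, using a hypothetical v8i8 concat of two v4i8 operands where the first is a bitcast of an i32 scalar (the "(concat_vectors (bitcast (scalar)), undef..)" shape named in the test comment above). SVT becomes the integer type matching the operand width (i32), VecVT becomes VT.getSizeInBits() / SVT.getSizeInBits() = 64 / 32 = 2 elements wide, and the node is rebuilt as:

    before: v8i8 = concat_vectors (v4i8 (bitcast i32 %x)), (v4i8 undef)
    after:  v8i8 = bitcast (v2i32 BUILD_VECTOR i32 %x, i32 undef)

The tests then check that this BUILD_VECTOR-based form is selected as dup where the shuffle mask allows it, and as tbl otherwise.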