diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23021,6 +23021,23 @@
         if (auto *Idx = dyn_cast<ConstantSDNode>(N0.getOperand(2)))
           if (Idx->getAPIntValue() == SplatIndex)
             return DAG.getSplatBuildVector(VT, SDLoc(N), N0.getOperand(1));
+
+      // On little-endian targets, a splat of lane 0 can look through a bitcast
+      // to the first operand of a scalar_to_vector or build_vector.
+      if (N0.getOpcode() == ISD::BITCAST && N0.getOperand(0).hasOneUse() &&
+          SplatIndex == 0 && DAG.getDataLayout().isLittleEndian() &&
+          (N0.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR ||
+           N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR)) {
+        EVT N00VT = N0.getOperand(0).getValueType();
+        if (VT.getScalarSizeInBits() <= N00VT.getScalarSizeInBits() &&
+            VT.isInteger() && N00VT.isInteger()) {
+          EVT InVT =
+              TLI.getTypeToTransformTo(*DAG.getContext(), VT.getScalarType());
+          SDValue Op = DAG.getZExtOrTrunc(N0.getOperand(0).getOperand(0),
+                                          SDLoc(N), InVT);
+          return DAG.getSplatBuildVector(VT, SDLoc(N), Op);
+        }
+      }
     }
 
     // If this is a bit convert that changes the element type of the vector but
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -508,8 +508,7 @@
 define <8 x i16> @bitcast_i64_v8i16(i64 %a) {
 ; CHECK-LABEL: bitcast_i64_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d0, x0
-; CHECK-NEXT:    dup.8h v0, v0[0]
+; CHECK-NEXT:    dup.8h v0, w0
 ; CHECK-NEXT:    ret
   %b = bitcast i64 %a to <4 x i16>
   %r = shufflevector <4 x i16> %b, <4 x i16> poison, <8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
--- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
@@ -2538,12 +2538,11 @@
 ; CHECK-LABEL: cmplx_mul_combined_re_im:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    lsr x8, x0, #16
+; CHECK-NEXT:    adrp x9, .LCPI196_0
 ; CHECK-NEXT:    fmov d4, x0
 ; CHECK-NEXT:    rev32 v5.8h, v0.8h
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    adrp x8, .LCPI196_0
-; CHECK-NEXT:    dup v1.8h, v1.h[0]
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI196_0]
+; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    ldr q3, [x9, :lo12:.LCPI196_0]
 ; CHECK-NEXT:    sqneg v2.8h, v1.8h
 ; CHECK-NEXT:    tbl v1.16b, { v1.16b, v2.16b }, v3.16b
 ; CHECK-NEXT:    sqdmull v2.4s, v0.4h, v4.h[0]
diff --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
@@ -399,8 +399,6 @@
 define arm_aapcs_vfpcc <8 x i16> @bitcast_i128_v8i16(i128 %a) {
 ; CHECK-LE-LABEL: bitcast_i128_v8i16:
 ; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.32 q0[0], r0
-; CHECK-LE-NEXT:    vmov.u16 r0, q0[0]
 ; CHECK-LE-NEXT:    vdup.16 q0, r0
 ; CHECK-LE-NEXT:    bx lr
 ;
@@ -549,8 +547,6 @@
 define arm_aapcs_vfpcc <8 x i16> @other_max_case(i32 %blockSize) {
 ; CHECK-LE-LABEL: other_max_case:
 ; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.32 q0[0], r0
-; CHECK-LE-NEXT:    vmov.u16 r0, q0[0]
 ; CHECK-LE-NEXT:    vdup.16 q0, r0
 ; CHECK-LE-NEXT:    bx lr
 ;
diff --git a/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll b/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll
--- a/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll
@@ -1,8 +1,7 @@
 ; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
 
 ; Test that a splat shuffle of an fp-to-int bitcasted vector correctly
-; optimizes and lowers to a single splat instruction. Without a custom
-; DAG combine, this ends up doing both a splat and a shuffle.
+; optimizes and lowers to a single splat instruction.
 
 target triple = "wasm32-unknown-unknown"
 
@@ -19,8 +18,8 @@
 
 ; CHECK-LABEL: not_a_vec:
 ; CHECK-NEXT: .functype not_a_vec (i64, i64) -> (v128){{$}}
-; CHECK-NEXT: i64x2.splat $push[[L1:[0-9]+]]=, $0{{$}}
-; CHECK-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $pop[[L1]], $2, 0, 1, 2, 3
+; CHECK-NEXT: i32.wrap_i64    $push[[L:[0-9]+]]=, $0
+; CHECK-NEXT: i32x4.splat     $push[[R:[0-9]+]]=, $pop[[L]]
 ; CHECK-NEXT: return $pop[[R]]
 define <4 x i32> @not_a_vec(i128 %x) {
   %a = bitcast i128 %x to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -74,8 +74,8 @@
 ; X64-LABEL: test2:
 ; X64:       ## %bb.0: ## %entry
 ; X64-NEXT:    movq _tmp_V2i@GOTPCREL(%rip), %rax
-; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X64-NEXT:    movq %xmm0, (%rax)
 ; X64-NEXT:    retq
 entry: