diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22834,6 +22834,46 @@
                               InnerShuf->getOperand(1), CombinedMask);
 }
 
+/// Given a vector load that is only used by a splat shuffle, scalarize the
+/// load so that just the element required for the splat is loaded.
+static SDValue scalarizeLoadIntoSplat(ShuffleVectorSDNode *Shuf,
+                                      SelectionDAG &DAG) {
+  if (!Shuf->isSplat())
+    return SDValue();
+
+  EVT VecVT = Shuf->getOperand(0).getValueType();
+  unsigned NumElts = VecVT.getVectorNumElements();
+  // Splat indices of NumElts or more select from the second shuffle operand.
+  unsigned SplatEltIdx = Shuf->getSplatIndex();
+  SDValue SplattedOp = SplatEltIdx < NumElts ? Shuf->getOperand(0)
+                                             : Shuf->getOperand(1);
+  SplatEltIdx %= NumElts;
+
+  auto *Load = dyn_cast<LoadSDNode>(SplattedOp.getNode());
+  if (!Load)
+    return SDValue();
+
+  if (!(Load->isSimple() && Load->hasOneUse() && VecVT.isVector()))
+    return SDValue();
+
+  auto &TLI = DAG.getTargetLoweringInfo();
+  SDValue SplatIdx = DAG.getConstant(SplatEltIdx, SDLoc(Shuf), MVT::i32);
+  SDValue NewPtr =
+      TLI.getVectorElementPointer(DAG, Load->getBasePtr(), VecVT, SplatIdx);
+
+  EVT VecEltVT = VecVT.getVectorElementType();
+  unsigned PtrOff = VecEltVT.getSizeInBits() * SplatEltIdx / 8;
+  MachinePointerInfo MPI = Load->getPointerInfo().getWithOffset(PtrOff);
+  Align Alignment = commonAlignment(Load->getAlign(), PtrOff);
+
+  auto NewLoad = DAG.getLoad(VecEltVT, SDLoc(Load), Load->getChain(), NewPtr,
+                             MPI, Alignment, Load->getMemOperand()->getFlags(),
+                             Load->getAAInfo());
+  DAG.makeEquivalentMemoryOrdering(Load, NewLoad);
+
+  return DAG.getSplatBuildVector(Shuf->getValueType(0), SDLoc(Shuf), NewLoad);
+}
+
 /// If the shuffle mask is taking exactly one element from the first vector
 /// operand and passing through all other elements from the second vector
 /// operand, return the index of the mask element that is choosing an element
@@ -22989,6 +23029,9 @@
   if (SDValue V = formSplatFromShuffles(SVN, DAG))
     return V;
 
+  if (SDValue V = scalarizeLoadIntoSplat(SVN, DAG))
+    return V;
+
   // If it is a splat, check if the argument vector is another splat or a
   // build_vector.
   if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
@@ -716,8 +716,7 @@
 ; CHECK-LABEL: load_splat_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
 ; CHECK-NEXT:    ret
   %v = load <8 x float>, ptr %p
@@ -729,8 +728,7 @@
 ; CHECK-LABEL: load_splat_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.d, d0
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
 ; CHECK-NEXT:    ret
   %v = load <4 x double>, ptr %p
@@ -742,8 +740,7 @@
 ; CHECK-LABEL: load_splat_v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.b, b0
+; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x8]
 ; CHECK-NEXT:    ret
   %v = load <32 x i8>, ptr %p
@@ -755,8 +752,7 @@
 ; CHECK-LABEL: load_splat_v16i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x8]
 ; CHECK-NEXT:    ret
   %v = load <16 x i16>, ptr %p
@@ -768,8 +764,7 @@
 ; CHECK-LABEL: load_splat_v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
 ; CHECK-NEXT:    ret
   %v = load <8 x i32>, ptr %p
@@ -781,8 +776,7 @@
 ; CHECK-LABEL: load_splat_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.d, d0
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
 ; CHECK-NEXT:    ret
   %v = load <4 x i64>, ptr %p
diff --git a/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll b/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+; Ensures that vectorized loads that are really just splatted loads are indeed
+; selected as splatted loads.
+
+target triple = "wasm32-unknown-unknown"
+
+define <4 x i32> @load_splat_shuffle_lhs(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_lhs:
+; CHECK:         .functype load_splat_shuffle_lhs (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load64_splat 0
+; CHECK-NEXT:    # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32>
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @load_splat_shuffle_lhs_with_offset(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_lhs_with_offset:
+; CHECK:         .functype load_splat_shuffle_lhs_with_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_splat 0
+; CHECK-NEXT:    # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32>
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @load_splat_shuffle_rhs(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_rhs:
+; CHECK:         .functype load_splat_shuffle_rhs (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load64_splat 0
+; CHECK-NEXT:    # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> poison, <2 x i64> %a, <2 x i32>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32>
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @load_splat_shuffle_rhs_with_offset(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_rhs_with_offset:
+; CHECK:         .functype load_splat_shuffle_rhs_with_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_splat 0
+; CHECK-NEXT:    # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> poison, <2 x i64> %a, <2 x i32>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32>
+  ret <4 x i32> %d
+}
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -84,7 +84,7 @@
 ; X32-SSE2-LABEL: t4:
 ; X32-SSE2:       # %bb.0:
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT:    movdqa (%eax), %xmm0
+; X32-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE2-NEXT:    movd %xmm0, %eax
 ; X32-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X32-SSE2-NEXT:    movd %xmm0, %edx
diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll
--- a/llvm/test/CodeGen/X86/sse3.ll
+++ b/llvm/test/CodeGen/X86/sse3.ll
@@ -395,14 +395,14 @@
 define <4 x i32> @t17() nounwind {
 ; X86-LABEL: t17:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t17:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; X64-NEXT:    retq
 entry:
   %tmp1 = load <4 x float>, ptr undef, align 16
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -1587,9 +1587,8 @@
 ; X86-SSE-LABEL: insertps_from_broadcast_loadv4f32:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movups (%eax), %xmm1 ## encoding: [0x0f,0x10,0x08]
-; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-SSE-NEXT:    insertps $48, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x30]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
@@ -1608,9 +1607,8 @@
 ;
 ; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32:
 ; X64-SSE:       ## %bb.0:
-; X64-SSE-NEXT:    movups (%rdi), %xmm1 ## encoding: [0x0f,0x10,0x0f]
-; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-SSE-NEXT:    insertps $48, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x30]
+; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1222,7 +1222,7 @@
 define <2 x double> @insert_dup_mem128_v2f64(ptr %ptr) nounwind {
 ; SSE2-LABEL: insert_dup_mem128_v2f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movaps (%rdi), %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/widened-broadcast.ll b/llvm/test/CodeGen/X86/widened-broadcast.ll
--- a/llvm/test/CodeGen/X86/widened-broadcast.ll
+++ b/llvm/test/CodeGen/X86/widened-broadcast.ll
@@ -468,7 +468,7 @@
 define <4 x float> @load_splat_4f32_8f32_0000(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_4f32_8f32_0000:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    retq
 ;
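
Note on the transformation (illustration only, not part of the patch): the new combine rewrites a splat shuffle whose source is a one-use, simple vector load into a scalar load of just the splatted element followed by a splat, adjusting the pointer info and alignment for the element offset. A rough IR-level sketch of the before/after, assuming a <4 x i32> load splatting element 1; the function names, types, and element index are made up for the example, and the combine itself runs on the SelectionDAG rather than on IR:

; Before: the whole vector is loaded only to splat a single element.
define <4 x i32> @splat_elt1(ptr %p) {
  %v = load <4 x i32>, ptr %p
  %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %s
}

; After (conceptually): only element 1 is loaded, then splatted.
define <4 x i32> @splat_elt1_scalarized(ptr %p) {
  %q = getelementptr inbounds i32, ptr %p, i32 1
  %e = load i32, ptr %q
  %ins = insertelement <4 x i32> poison, i32 %e, i64 0
  %s = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %s
}

The scalar form is what lets the backends select their single-element broadcast loads, as the updated tests show (ld1r* for SVE, v128.load64_splat for WebAssembly SIMD, and narrower scalar loads feeding shuffles on x86).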