diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23459,6 +23459,47 @@
                        InnerShuf->getOperand(1), CombinedMask);
 }
 
+/// Given a vector load used by a splat shuffle, scalarize the load so that
+/// only the element required for the splat is loaded.
+static SDValue scalarizeLoadIntoSplat(ShuffleVectorSDNode *Shuf,
+                                      SelectionDAG &DAG) {
+  if (!Shuf->isSplat())
+    return SDValue();
+
+  EVT VecVT = Shuf->getValueType(0);
+  unsigned NumElts = VecVT.getVectorNumElements();
+  unsigned SplatIdx = Shuf->getSplatIndex();
+  SDValue SplattedOp;
+  if (SplatIdx < NumElts) {
+    SplattedOp = Shuf->getOperand(0);
+  } else {
+    SplattedOp = Shuf->getOperand(1);
+    SplatIdx -= NumElts;
+  }
+
+  auto *Load = dyn_cast<LoadSDNode>(SplattedOp.getNode());
+  if (!Load || !Load->isSimple() || !Load->hasOneUse())
+    return SDValue();
+
+  auto &TLI = DAG.getTargetLoweringInfo();
+  SDValue SplatIdxV = DAG.getConstant(SplatIdx, SDLoc(Shuf), MVT::i32);
+  SDValue NewPtr =
+      TLI.getVectorElementPointer(DAG, Load->getBasePtr(), VecVT, SplatIdxV);
+
+  EVT VecEltVT = VecVT.getVectorElementType();
+  unsigned PtrOff = VecEltVT.getSizeInBits() * SplatIdx / 8;
+  MachinePointerInfo MPI = Load->getPointerInfo().getWithOffset(PtrOff);
+  Align Alignment = commonAlignment(Load->getAlign(), PtrOff);
+
+  SDValue NewLoad =
+      DAG.getLoad(VecEltVT, SDLoc(Load), Load->getChain(), NewPtr, MPI,
+                  Alignment, Load->getMemOperand()->getFlags(),
+                  Load->getAAInfo());
+  DAG.makeEquivalentMemoryOrdering(Load, NewLoad);
+
+  return DAG.getSplatBuildVector(VecVT, SDLoc(Shuf), NewLoad);
+}
+
 /// If the shuffle mask is taking exactly one element from the first vector
 /// operand and passing through all other elements from the second vector
 /// operand, return the index of the mask element that is choosing an element
@@ -23614,6 +23655,9 @@
   if (SDValue V = formSplatFromShuffles(SVN, DAG))
     return V;
 
+  if (SDValue V = scalarizeLoadIntoSplat(SVN, DAG))
+    return V;
+
   // If it is a splat, check if the argument vector is another splat or a
   // build_vector.
   if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
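To make the intent of scalarizeLoadIntoSplat concrete, here is a hand-written
IR-level sketch of the rewrite (illustrative only: the combine operates on
SelectionDAG nodes, and the function names and lane choice below are invented
for the example, not taken from the patch):

  ; Before: the whole <4 x i32> is loaded, then lane 1 is splatted.
  define <4 x i32> @splat_lane1(ptr %p) {
    %v = load <4 x i32>, ptr %p
    %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    ret <4 x i32> %s
  }

  ; After (conceptually): only the 4-byte element at offset 4 is loaded, and
  ; the splat is rebuilt from the scalar.
  define <4 x i32> @splat_lane1_scalarized(ptr %p) {
    %ep = getelementptr inbounds i32, ptr %p, i64 1
    %e = load i32, ptr %ep, align 4
    %h = insertelement <4 x i32> poison, i32 %e, i64 0
    %s = shufflevector <4 x i32> %h, <4 x i32> poison, <4 x i32> zeroinitializer
    ret <4 x i32> %s
  }

The getWithOffset/commonAlignment/makeEquivalentMemoryOrdering calls in the
combine are what keep the narrowed load's aliasing info, alignment, and chain
consistent with the wide load it replaces.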
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
@@ -716,8 +716,7 @@
 ; CHECK-LABEL: load_splat_v8f32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT: mov z0.s, s0
+; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0]
 ; CHECK-NEXT: st1w { z0.s }, p0, [x8]
 ; CHECK-NEXT: ret
 %v = load <8 x float>, ptr %p
@@ -729,8 +728,7 @@
 ; CHECK-LABEL: load_splat_v4f64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: mov z0.d, d0
+; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT: st1d { z0.d }, p0, [x8]
 ; CHECK-NEXT: ret
 %v = load <4 x double>, ptr %p
@@ -742,8 +740,7 @@
 ; CHECK-LABEL: load_splat_v32i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
-; CHECK-NEXT: mov z0.b, b0
+; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0]
 ; CHECK-NEXT: st1b { z0.b }, p0, [x8]
 ; CHECK-NEXT: ret
 %v = load <32 x i8>, ptr %p
@@ -755,8 +752,7 @@
 ; CHECK-LABEL: load_splat_v16i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT: mov z0.h, h0
+; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0]
 ; CHECK-NEXT: st1h { z0.h }, p0, [x8]
 ; CHECK-NEXT: ret
 %v = load <16 x i16>, ptr %p
@@ -768,8 +764,7 @@
 ; CHECK-LABEL: load_splat_v8i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT: mov z0.s, s0
+; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0]
 ; CHECK-NEXT: st1w { z0.s }, p0, [x8]
 ; CHECK-NEXT: ret
 %v = load <8 x i32>, ptr %p
@@ -781,8 +776,7 @@
 ; CHECK-LABEL: load_splat_v4i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: mov z0.d, d0
+; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT: st1d { z0.d }, p0, [x8]
 ; CHECK-NEXT: ret
 %v = load <4 x i64>, ptr %p
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -135,24 +135,28 @@
 ; GFX9-LABEL: shuffle_v4f16_u3uu:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:6
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_u3uu:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_u3uu:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:6
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 %val0 = load <4 x half>, ptr addrspace(1) %arg0
 %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1168,7 +1172,7 @@
 ; GFX9-LABEL: shuffle_v4f16_0000:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
@@ -1179,7 +1183,7 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
@@ -1189,7 +1193,7 @@
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1902,31 +1906,31 @@
 ; GFX9-LABEL: hi16bits:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
-; GFX9-NEXT: global_load_dword v5, v[2:3], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: global_load_dword v2, v[2:3], off
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:2
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: hi16bits:
 ; GFX10: ; %bb.0: ; %entry
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: global_load_dword v2, v[2:3], off
+; GFX10-NEXT: global_load_short_d16 v2, v[0:1], off offset:2
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: hi16bits:
 ; GFX11: ; %bb.0: ; %entry
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-NEXT: global_load_d16_b16 v2, v[0:1], off offset:2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 entry:
 %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
@@ -1978,30 +1982,31 @@
 ; GFX9-LABEL: hi16low16bits:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_ushort v4, v[0:1], off offset:2
 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: hi16low16bits:
 ; GFX10: ; %bb.0: ; %entry
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_ushort v4, v[0:1], off offset:2
 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: hi16low16bits:
 ; GFX11: ; %bb.0: ; %entry
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2
 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 entry:
 %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
@@ -2091,30 +2096,31 @@
 ; GFX9-LABEL: i16_hi16low16bits:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_ushort v4, v[0:1], off offset:2
 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: i16_hi16low16bits:
 ; GFX10: ; %bb.0: ; %entry
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_ushort v4, v[0:1], off offset:2
 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i16_hi16low16bits:
 ; GFX11: ; %bb.0: ; %entry
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2
 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 entry:
 %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
@@ -2128,31 +2134,31 @@
 ; GFX9-LABEL: i16_hi16bits:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
-; GFX9-NEXT: global_load_dword v5, v[2:3], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: global_load_dword v2, v[2:3], off
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:2
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: i16_hi16bits:
 ; GFX10: ; %bb.0: ; %entry
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: global_load_dword v2, v[2:3], off
+; GFX10-NEXT: global_load_short_d16 v2, v[0:1], off offset:2
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i16_hi16bits:
 ; GFX11: ; %bb.0: ; %entry
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-NEXT: global_load_d16_b16 v2, v[0:1], off offset:2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 entry:
 %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
diff --git a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
--- a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
+++ b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
@@ -8,9 +8,10 @@
 define i32 @foo(ptr %descs, i32 %num, i32 %cw) local_unnamed_addr #0 {
 ; CHECK-LABEL: foo:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldr d16, [r0, #32]
+; CHECK-NEXT: add r0, r0, #32
+; CHECK-NEXT: vld1.32 {d16[]}, [r0:32]
 ; CHECK-NEXT: vadd.i32 d16, d16, d16
-; CHECK-NEXT: vmov.32 r0, d16[0]
+; CHECK-NEXT: vmov.32 r0, d16[1]
 ; CHECK-NEXT: bx lr
 entry:
 %wide.vec = load <16 x i32>, ptr %descs, align 4
diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
--- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
+++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
@@ -1127,16 +1127,11 @@
 ;
 ; CHECK-NOVSX-LABEL: testSplati64_1:
 ; CHECK-NOVSX: # %bb.0: # %entry
-; CHECK-NOVSX-NEXT: ld r4, 8(r3)
-; CHECK-NOVSX-NEXT: std r4, -8(r1)
-; CHECK-NOVSX-NEXT: addis r4, r2, .LCPI21_0@toc@ha
-; CHECK-NOVSX-NEXT: ld r3, 0(r3)
-; CHECK-NOVSX-NEXT: addi r4, r4, .LCPI21_0@toc@l
-; CHECK-NOVSX-NEXT: lvx v2, 0, r4
+; CHECK-NOVSX-NEXT: ld r3, 8(r3)
+; CHECK-NOVSX-NEXT: addi r4, r1, -16
+; CHECK-NOVSX-NEXT: std r3, -8(r1)
 ; CHECK-NOVSX-NEXT: std r3, -16(r1)
-; CHECK-NOVSX-NEXT: addi r3, r1, -16
-; CHECK-NOVSX-NEXT: lvx v3, 0, r3
-; CHECK-NOVSX-NEXT: vperm v2, v3, v3, v2
+; CHECK-NOVSX-NEXT: lvx v2, 0, r4
 ; CHECK-NOVSX-NEXT: blr
 ;
 ; CHECK-P7-LABEL: testSplati64_1:
@@ -1145,11 +1140,26 @@
 ; CHECK-P7-NEXT: lxvdsx v2, 0, r3
 ; CHECK-P7-NEXT: blr
 ;
-; P8-AIX-LABEL: testSplati64_1:
-; P8-AIX: # %bb.0: # %entry
-; P8-AIX-NEXT: addi r3, r3, 8
-; P8-AIX-NEXT: lxvdsx v2, 0, r3
-; P8-AIX-NEXT: blr
+; P8-AIX-64-LABEL: testSplati64_1:
+; P8-AIX-64: # %bb.0: # %entry
+; P8-AIX-64-NEXT: addi r3, r3, 8
+; P8-AIX-64-NEXT: lxvdsx v2, 0, r3
+; P8-AIX-64-NEXT: blr
+;
+; P8-AIX-32-LABEL: testSplati64_1:
+; P8-AIX-32: # %bb.0: # %entry
+; P8-AIX-32-NEXT: lwz r4, L..C4(r2) # %const.0
+; P8-AIX-32-NEXT: lwz r5, 12(r3)
+; P8-AIX-32-NEXT: lwz r3, 8(r3)
+; P8-AIX-32-NEXT: stw r5, -16(r1)
+; P8-AIX-32-NEXT: stw r3, -32(r1)
+; P8-AIX-32-NEXT: addi r3, r1, -16
+; P8-AIX-32-NEXT: lxvw4x v2, 0, r4
+; P8-AIX-32-NEXT: addi r4, r1, -32
+; P8-AIX-32-NEXT: lxvw4x v3, 0, r3
+; P8-AIX-32-NEXT: lxvw4x v4, 0, r4
+; P8-AIX-32-NEXT: vperm v2, v4, v3, v2
+; P8-AIX-32-NEXT: blr
 entry:
 %0 = load <2 x i64>, ptr %ptr, align 8
 %1 = shufflevector <2 x i64> %0, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
diff --git a/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll b/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+; Ensures that vectorized loads that are really just splatted loads are
+; selected as splat loads.
+
+target triple = "wasm32-unknown-unknown"
+
+define <4 x i32> @load_splat_shuffle_lhs(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_lhs:
+; CHECK: .functype load_splat_shuffle_lhs (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load64_splat 0
+; CHECK-NEXT: # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @load_splat_shuffle_lhs_with_offset(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_lhs_with_offset:
+; CHECK: .functype load_splat_shuffle_lhs_with_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const 8
+; CHECK-NEXT: i32.add
+; CHECK-NEXT: v128.load64_splat 0
+; CHECK-NEXT: # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @load_splat_shuffle_rhs(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_rhs:
+; CHECK: .functype load_splat_shuffle_rhs (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load64_splat 0
+; CHECK-NEXT: # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> poison, <2 x i64> %a, <2 x i32> <i32 2, i32 2>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @load_splat_shuffle_rhs_with_offset(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_rhs_with_offset:
+; CHECK: .functype load_splat_shuffle_rhs_with_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const 8
+; CHECK-NEXT: i32.add
+; CHECK-NEXT: v128.load64_splat 0
+; CHECK-NEXT: # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> poison, <2 x i64> %a, <2 x i32> <i32 3, i32 3>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  ret <4 x i32> %d
+}
diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
--- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -364,18 +364,18 @@
   ret <4 x i32> %vecinit6.i
 }
 
-; FIXME: Pointer adjusted broadcasts
+; Pointer adjusted broadcasts
 
 define <4 x i32> @load_splat_4i32_4i32_1111(ptr %ptr) nounwind uwtable readnone ssp {
 ; X86-LABEL: load_splat_4i32_4i32_1111:
 ; X86: ## %bb.0: ## %entry
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
+; X86-NEXT: vbroadcastss 4(%eax), %xmm0
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: load_splat_4i32_4i32_1111:
 ; X64: ## %bb.0: ## %entry
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
+; X64-NEXT: vbroadcastss 4(%rdi), %xmm0
 ; X64-NEXT: retq
 entry:
 %ld = load <4 x i32>, ptr %ptr
@@ -472,12 +472,12 @@
 ; X86-LABEL: load_splat_2i64_2i64_1111:
 ; X86: ## %bb.0: ## %entry
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
+; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: load_splat_2i64_2i64_1111:
 ; X64: ## %bb.0: ## %entry
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; X64-NEXT: retq
 entry:
 %ld = load <2 x i64>, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -84,7 +84,7 @@
 ; X32-SSE2-LABEL: t4:
 ; X32-SSE2: # %bb.0:
 ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movdqa (%eax), %xmm0
+; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE2-NEXT: movd %xmm0, %eax
 ; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X32-SSE2-NEXT: movd %xmm0, %edx
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1332,23 +1332,24 @@
 define <8 x half> @shuffle(ptr %p) {
 ; CHECK-LIBCALL-LABEL: shuffle:
 ; CHECK-LIBCALL: # %bb.0:
-; CHECK-LIBCALL-NEXT: movdqu (%rdi), %xmm0
-; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; CHECK-LIBCALL-NEXT: pinsrw $0, 8(%rdi), %xmm0
+; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; CHECK-LIBCALL-NEXT: retq
 ;
 ; BWON-F16C-LABEL: shuffle:
 ; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,4,4,4,4]
-; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; BWON-F16C-NEXT: vpinsrw $0, 8(%rdi), %xmm0, %xmm0
+; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; BWON-F16C-NEXT: retq
 ;
 ; CHECK-I686-LABEL: shuffle:
 ; CHECK-I686: # %bb.0:
 ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT: movdqu (%eax), %xmm0
-; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; CHECK-I686-NEXT: pinsrw $0, 8(%eax), %xmm0
+; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; CHECK-I686-NEXT: retl
 %1 = load <8 x half>, ptr %p, align 8
 %2 = shufflevector <8 x half> %1, <8 x half> poison, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll
--- a/llvm/test/CodeGen/X86/sse3.ll
+++ b/llvm/test/CodeGen/X86/sse3.ll
@@ -395,14 +395,14 @@
 define <4 x i32> @t17() nounwind {
 ; X86-LABEL: t17:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t17:
 ; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; X64-NEXT: retq
 entry:
 %tmp1 = load <4 x float>, ptr undef, align 16
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -1587,9 +1587,8 @@
 ; X86-SSE-LABEL: insertps_from_broadcast_loadv4f32:
 ; X86-SSE: ## %bb.0:
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT: movups (%eax), %xmm1 ## encoding: [0x0f,0x10,0x08]
-; X86-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-SSE-NEXT: insertps $48, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x30]
+; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-SSE-NEXT: retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
@@ -1608,9 +1607,8 @@
 ;
 ; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32:
 ; X64-SSE: ## %bb.0:
-; X64-SSE-NEXT: movups (%rdi), %xmm1 ## encoding: [0x0f,0x10,0x0f]
-; X64-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-SSE-NEXT: insertps $48, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x30]
+; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-SSE-NEXT: retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1222,7 +1222,7 @@
 define <2 x double> @insert_dup_mem128_v2f64(ptr %ptr) nounwind {
 ; SSE2-LABEL: insert_dup_mem128_v2f64:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE2-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/widened-broadcast.ll b/llvm/test/CodeGen/X86/widened-broadcast.ll
--- a/llvm/test/CodeGen/X86/widened-broadcast.ll
+++ b/llvm/test/CodeGen/X86/widened-broadcast.ll
@@ -468,7 +468,7 @@
 define <4 x float> @load_splat_4f32_8f32_0000(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_4f32_8f32_0000:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: movaps (%rdi), %xmm0
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT: retq
 ;
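A consequence of the Load->isSimple() and Load->hasOneUse() guards in
scalarizeLoadIntoSplat worth calling out: a splat of a load that has other
users (or of a volatile/atomic load) is deliberately left alone, since the
wide load has to stay and scalarizing would add a second memory access rather
than remove one. A hand-written example (not a test from this patch) where
the combine must not fire:

  ; %v has a second user (the store), so the full-width load must remain;
  ; the splat keeps using it instead of a narrowed element load.
  define <4 x i32> @no_scalarize_multi_use(ptr %p, ptr %q) {
    %v = load <4 x i32>, ptr %p
    store <4 x i32> %v, ptr %q
    %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> zeroinitializer
    ret <4 x i32> %s
  }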