diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -5274,6 +5274,11 @@
   // and if the original load is simple.
   unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment();
 
+  if (LdAlign && !WidenWidth.isScalable() && LdAlign < WidenWidth / 8 &&
+      LD->getPointerInfo().isDereferenceable(WidenWidth / 8, *DAG.getContext(),
+                                             DAG.getDataLayout()))
+    LdAlign = WidenWidth / 8;
+
   // Find the vector type that can load from.
   EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
                           WidthDiff.getKnownMinSize());
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -383,7 +383,7 @@
 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
 
-; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
 
 ; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
@@ -454,12 +454,12 @@
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
-; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
-; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
-; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
-; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
-; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
-; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
+; SI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x9
+; MESA-VI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
+; HSA-GFX9-DAG: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
+; HSA-GFX9-DAG: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
 define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind {
 entry:
   store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8
@@ -479,12 +479,12 @@
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
-; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
-; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
-; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
-; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
-; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
-; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
+; SI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x9
+; MESA-VI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
+; HSA-GFX9-DAG: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
+; HSA-GFX9-DAG: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
 define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind {
 entry:
   store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8
diff --git a/llvm/test/CodeGen/X86/load-partial-dot-product.ll b/llvm/test/CodeGen/X86/load-partial-dot-product.ll
--- a/llvm/test/CodeGen/X86/load-partial-dot-product.ll
+++ b/llvm/test/CodeGen/X86/load-partial-dot-product.ll
@@ -130,14 +130,8 @@
 define float @dot3_float3(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
 ; SSE2-LABEL: dot3_float3:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE2-NEXT:    movups (%rdi), %xmm0
+; SSE2-NEXT:    movups (%rsi), %xmm1
 ; SSE2-NEXT:    mulps %xmm0, %xmm1
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
@@ -148,14 +142,8 @@
 ;
 ; SSSE3-LABEL: dot3_float3:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSSE3-NEXT:    movups (%rdi), %xmm0
+; SSSE3-NEXT:    movups (%rsi), %xmm1
 ; SSSE3-NEXT:    mulps %xmm0, %xmm1
 ; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    addss %xmm1, %xmm0
@@ -165,10 +153,8 @@
 ;
 ; SSE41-LABEL: dot3_float3:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; SSE41-NEXT:    movups (%rdi), %xmm0
+; SSE41-NEXT:    movups (%rsi), %xmm1
 ; SSE41-NEXT:    mulps %xmm0, %xmm1
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    addss %xmm1, %xmm0
@@ -178,11 +164,8 @@
 ;
 ; AVX-LABEL: dot3_float3:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovups (%rdi), %xmm0
+; AVX-NEXT:    vmulps (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0