diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -5145,17 +5145,19 @@
 // TLI: Target lowering used to determine legal types.
 // Width: Width left need to load/store.
 // WidenVT: The widen vector type to load to/store from
-// Align: If 0, don't allow use of a wider type
-// WidenEx: If Align is not 0, the amount additional we can load/store from.
+// NumDereferenceableBytes: If 0, don't allow use of a wider type
+// WidenEx: If NumDereferenceableBytes is not 0,
+//          the additional amount we can load/store.

-static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
+static EVT FindMemType(SelectionDAG &DAG, const TargetLowering &TLI,
                        unsigned Width, EVT WidenVT,
-                       unsigned Align = 0, unsigned WidenEx = 0) {
+                       unsigned NumDereferenceableBytes = 0,
+                       unsigned WidenEx = 0) {
   EVT WidenEltVT = WidenVT.getVectorElementType();
   const bool Scalable = WidenVT.isScalableVector();
   unsigned WidenWidth = WidenVT.getSizeInBits().getKnownMinSize();
   unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
-  unsigned AlignInBits = Align*8;
+  unsigned NumDereferenceableBits = NumDereferenceableBytes * 8;

   // If we have one element to load/store, return it.
   EVT RetVT = WidenEltVT;
@@ -5178,8 +5180,9 @@
          Action == TargetLowering::TypePromoteInteger) &&
         (WidenWidth % MemVTWidth) == 0 &&
         isPowerOf2_32(WidenWidth / MemVTWidth) &&
-        (MemVTWidth <= Width ||
-         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+        (MemVTWidth <= Width || (NumDereferenceableBytes != 0 &&
+                                 MemVTWidth <= NumDereferenceableBits &&
+                                 MemVTWidth <= Width + WidenEx))) {
       if (MemVTWidth == WidenWidth)
         return MemVT;
       RetVT = MemVT;
@@ -5203,8 +5206,9 @@
         WidenEltVT == MemVT.getVectorElementType() &&
         (WidenWidth % MemVTWidth) == 0 &&
         isPowerOf2_32(WidenWidth / MemVTWidth) &&
-        (MemVTWidth <= Width ||
-         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+        (MemVTWidth <= Width || (NumDereferenceableBytes != 0 &&
+                                 MemVTWidth <= NumDereferenceableBits &&
+                                 MemVTWidth <= Width + WidenEx))) {
       if (RetVT.getFixedSizeInBits() < MemVTWidth || MemVT == WidenVT)
         return MemVT;
     }
@@ -5270,13 +5274,24 @@
   TypeSize LdWidth = LdVT.getSizeInBits();
   TypeSize WidenWidth = WidenVT.getSizeInBits();
   TypeSize WidthDiff = WidenWidth - LdWidth;
-  // Allow wider loads if they are sufficiently aligned to avoid memory faults
-  // and if the original load is simple.
-  unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment();
+  unsigned NumDereferenceableBytes = 0;
+  // Allow wider loads if the original load is simple and we can dereference
+  // padding bytes.
+  if (LD->isSimple()) {
+    NumDereferenceableBytes = LD->getAlignment();
+    if (!LdWidth.isScalable())
+      NumDereferenceableBytes =
+          std::max(NumDereferenceableBytes, LdWidth / 8);
+    if (!WidenWidth.isScalable() && NumDereferenceableBytes < WidenWidth / 8 &&
+        LD->getPointerInfo().isDereferenceable(
+            WidenWidth / 8, *DAG.getContext(), DAG.getDataLayout()))
+      NumDereferenceableBytes =
+          std::max(NumDereferenceableBytes, WidenWidth / 8);
+  }

   // Find the vector type that can load from.
-  EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
-                          WidthDiff.getKnownMinSize());
+  EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT,
+                          NumDereferenceableBytes, WidthDiff.getKnownMinSize());
   TypeSize NewVTWidth = NewVT.getSizeInBits();
   SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
                              LD->getOriginalAlign(), MMOFlags, AAInfo);
@@ -5312,13 +5327,27 @@
     MachinePointerInfo MPI = LD->getPointerInfo();
     do {
       LdWidth -= NewVTWidth;
+      if (!NewVTWidth.isScalable()) {
+        if (NumDereferenceableBytes > NewVTWidth / 8)
+          NumDereferenceableBytes -= NewVTWidth / 8;
+        else
+          NumDereferenceableBytes = 0;
+        NumDereferenceableBytes = std::max(
+            NumDereferenceableBytes,
+            commonAlignment(cast<LoadSDNode>(LdOp)->getOriginalAlign(),
+                            cast<LoadSDNode>(LdOp)->getSrcValueOffset() +
+                                NewVTWidth)
+                .value());
+      } else
+        NumDereferenceableBytes = 0; // FIXME
+
       IncrementPointer(cast<LoadSDNode>(LdOp), NewVT, MPI, BasePtr,
                        &ScaledOffset);

       if (TypeSize::isKnownLT(LdWidth, NewVTWidth)) {
         // The current type we are using is too large. Find a better size.
-        NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
-                            WidthDiff.getKnownMinSize());
+        NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT,
+                            NumDereferenceableBytes, WidthDiff.getKnownMinSize());
         NewVTWidth = NewVT.getSizeInBits();
       }
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -383,7 +383,7 @@
 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46

-; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
@@ -454,12 +454,12 @@
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
-; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
-; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
-; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
-; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
-; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
-; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
+; SI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x9
+; MESA-VI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
+; HSA-GFX9-DAG: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
+; HSA-GFX9-DAG: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
 define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind {
 entry:
   store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8
@@ -479,12 +479,12 @@
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
-; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
-; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
-; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
-; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
-; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
-; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
+; SI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x9
+; MESA-VI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
+; HSA-GFX9-DAG: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
+; HSA-GFX9-DAG: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
 define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind {
 entry:
   store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8
diff --git a/llvm/test/CodeGen/ARM/vector-load.ll b/llvm/test/CodeGen/ARM/vector-load.ll
--- a/llvm/test/CodeGen/ARM/vector-load.ll
+++ b/llvm/test/CodeGen/ARM/vector-load.ll
@@ -253,10 +253,8 @@
 }

 ; CHECK-LABEL: test_silly_load:
-; CHECK: vldr d{{[0-9]+}}, [r0, #16]
-; CHECK: movs r1, #24
-; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128], r1
-; CHECK: ldr {{r[0-9]+}}, [r0]
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128]!
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128]
 define void @test_silly_load(<28 x i8>* %addr) {
 	load volatile <28 x i8>, <28 x i8>* %addr
diff --git a/llvm/test/CodeGen/X86/load-partial-dot-product.ll b/llvm/test/CodeGen/X86/load-partial-dot-product.ll
--- a/llvm/test/CodeGen/X86/load-partial-dot-product.ll
+++ b/llvm/test/CodeGen/X86/load-partial-dot-product.ll
@@ -130,14 +130,8 @@
 define float @dot3_float3(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
 ; SSE2-LABEL: dot3_float3:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE2-NEXT: movups (%rdi), %xmm0
+; SSE2-NEXT: movups (%rsi), %xmm1
 ; SSE2-NEXT: mulps %xmm0, %xmm1
 ; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
@@ -148,14 +142,8 @@
 ;
 ; SSSE3-LABEL: dot3_float3:
 ; SSSE3: # %bb.0:
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSSE3-NEXT: movups (%rdi), %xmm0
+; SSSE3-NEXT: movups (%rsi), %xmm1
 ; SSSE3-NEXT: mulps %xmm0, %xmm1
 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSSE3-NEXT: addss %xmm1, %xmm0
@@ -165,10 +153,8 @@
 ;
 ; SSE41-LABEL: dot3_float3:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; SSE41-NEXT: movups (%rdi), %xmm0
+; SSE41-NEXT: movups (%rsi), %xmm1
 ; SSE41-NEXT: mulps %xmm0, %xmm1
 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
@@ -178,11 +164,8 @@
 ;
 ; AVX-LABEL: dot3_float3:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: vmulps (%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
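
Illustrative example (not part of the patch; the function name is hypothetical): a minimal IR sketch of the pattern the new dereferenceability check enables, mirroring the dot3_float3 test above. A 12-byte <3 x float> load from a pointer known dereferenceable for 16 bytes may now be legalized as one 16-byte wide load (e.g. a single movups on x86) instead of being pieced together from narrower loads, because the 4 trailing bytes are known safe to read.

; Reduced sketch, in the style of load-partial-dot-product.ll.
define <3 x float> @load_v3f32(<3 x float>* dereferenceable(16) %p) {
  ; <3 x float> is type-legalized by widening to <4 x float>; with this change
  ; the legalizer may emit a single wide load, since %p is known dereferenceable
  ; for the full widened width (16 bytes).
  %v = load <3 x float>, <3 x float>* %p, align 4
  ret <3 x float> %v
}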