diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -5144,17 +5144,23 @@
 //  TLI:       Target lowering used to determine legal types.
 //  Width:     Width left need to load/store.
 //  WidenVT:   The widen vector type to load to/store from
-//  Align:     If 0, don't allow use of a wider type
-//  WidenEx:   If Align is not 0, the amount additional we can load/store from.
+//  NumDereferenceableBytes: If 0, don't allow use of a wider type.
+//  WidenEx:   If NumDereferenceableBytes is not 0, the additional number of
+//             bits we may load/store beyond Width.
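+//
+//  For example, a 96-bit load widened to a 128-bit WidenVT with
+//  NumDereferenceableBytes = 16 and WidenEx = 32 may return the full
+//  128-bit type, since all 16 bytes are known dereferenceable.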
 
-static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
+static EVT FindMemType(SelectionDAG &DAG, const TargetLowering &TLI,
                        unsigned Width, EVT WidenVT,
-                       unsigned Align = 0, unsigned WidenEx = 0) {
+                       unsigned NumDereferenceableBytes = 0,
+                       unsigned WidenEx = 0) {
   EVT WidenEltVT = WidenVT.getVectorElementType();
   const bool Scalable = WidenVT.isScalableVector();
   unsigned WidenWidth = WidenVT.getSizeInBits().getKnownMinSize();
   unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
-  unsigned AlignInBits = Align*8;
+  unsigned NumDereferenceableBits = NumDereferenceableBytes * 8;
 
   // If we have one element to load/store, return it.
   EVT RetVT = WidenEltVT;
@@ -5174,8 +5176,9 @@
            Action == TargetLowering::TypePromoteInteger) &&
           (WidenWidth % MemVTWidth) == 0 &&
           isPowerOf2_32(WidenWidth / MemVTWidth) &&
-          (MemVTWidth <= Width ||
-           (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+          (MemVTWidth <= Width || (NumDereferenceableBytes != 0 &&
+                                   MemVTWidth <= NumDereferenceableBits &&
+                                   MemVTWidth <= Width + WidenEx))) {
         if (MemVTWidth == WidenWidth)
           return MemVT;
         RetVT = MemVT;
@@ -5197,8 +5200,9 @@
         WidenEltVT == MemVT.getVectorElementType() &&
         (WidenWidth % MemVTWidth) == 0 &&
         isPowerOf2_32(WidenWidth / MemVTWidth) &&
-        (MemVTWidth <= Width ||
-         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+        (MemVTWidth <= Width || (NumDereferenceableBytes != 0 &&
+                                 MemVTWidth <= NumDereferenceableBits &&
+                                 MemVTWidth <= Width + WidenEx))) {
       if (RetVT.getFixedSizeInBits() < MemVTWidth || MemVT == WidenVT)
         return MemVT;
     }
@@ -5264,13 +5268,29 @@
   TypeSize LdWidth = LdVT.getSizeInBits();
   TypeSize WidenWidth = WidenVT.getSizeInBits();
   TypeSize WidthDiff = WidenWidth - LdWidth;
-  // Allow wider loads if they are sufficiently aligned to avoid memory faults
-  // and if the original load is simple.
-  unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment();
+  unsigned NumDereferenceableBytes = 0;
+  // Allow wider loads if the original load is simple and we can dereference
+  // padding bytes.
+  if (LD->isSimple()) {
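+    // Widening up to the load's alignment cannot cross the containing
+    // aligned block, so those bytes are assumed safe to access.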
+    NumDereferenceableBytes = LD->getAlignment();
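+    // The bytes the original load itself accesses are known dereferenceable.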
+    if (!LdWidth.isScalable())
+      NumDereferenceableBytes =
+          std::max<unsigned>(NumDereferenceableBytes, LdWidth / 8);
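+    // If the pointer is known dereferenceable out to the widened width,
+    // the whole widened vector can be loaded at once.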
+    if (!WidenWidth.isScalable() && NumDereferenceableBytes < WidenWidth / 8 &&
+        LD->getPointerInfo().isDereferenceable(
+            WidenWidth / 8, *DAG.getContext(), DAG.getDataLayout()))
+      NumDereferenceableBytes =
+          std::max<unsigned>(NumDereferenceableBytes, WidenWidth / 8);
+  }
 
   // Find the vector type that can load from.
-  EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
-                          WidthDiff.getKnownMinSize());
+  EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT,
+                          NumDereferenceableBytes, WidthDiff.getKnownMinSize());
   TypeSize NewVTWidth = NewVT.getSizeInBits();
   SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
                              LD->getOriginalAlign(), MMOFlags, AAInfo);
@@ -5306,13 +5321,33 @@
   MachinePointerInfo MPI = LD->getPointerInfo();
   do {
     LdWidth -= NewVTWidth;
+    if (LD->isSimple()) {
+      if (!NewVTWidth.isScalable()) {
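+        // Consume the bytes covered by the chunk that was just loaded.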
+        if (NumDereferenceableBytes > NewVTWidth / 8)
+          NumDereferenceableBytes -= NewVTWidth / 8;
+        else
+          NumDereferenceableBytes = 0;
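+        // The next pointer's alignment (common alignment of the original
+        // alignment and the next byte offset) still permits safe widening.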
+        NumDereferenceableBytes = std::max<unsigned>(
+            NumDereferenceableBytes,
+            commonAlignment(cast<LoadSDNode>(LdOp)->getOriginalAlign(),
+                            cast<LoadSDNode>(LdOp)->getSrcValueOffset() +
+                                NewVTWidth / 8)
+                .value());
+    } else {
+      NumDereferenceableBytes = 0; // FIXME: Track dereferenceable bytes for
+                                   // scalable types too.
+    }
+    }
+
     IncrementPointer(cast<LoadSDNode>(LdOp), NewVT, MPI, BasePtr,
                      &ScaledOffset);
 
     if (TypeSize::isKnownLT(LdWidth, NewVTWidth)) {
       // The current type we are using is too large. Find a better size.
-      NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
-                          WidthDiff.getKnownMinSize());
+      NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT,
+                          NumDereferenceableBytes, WidthDiff.getKnownMinSize());
       NewVTWidth = NewVT.getSizeInBits();
     }
 
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1696,35 +1696,33 @@
 define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x i8> %in) nounwind {
 ; SI-LABEL: v5i8_arg:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dword s2, s[0:1], 0xc
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:4
+; SI-NEXT:    v_mov_b32_e32 v0, s5
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v5i8_arg:
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT:    s_load_dword s1, s[0:1], 0x30
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s2, 4
+; VI-NEXT:    s_add_u32 s4, s2, 4
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    s_addc_u32 s1, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_byte v[2:3], v4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_mov_b32_e32 v5, s0
+; VI-NEXT:    flat_store_byte v[2:3], v4
+; VI-NEXT:    flat_store_dword v[0:1], v5
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v5i8_arg:
@@ -2269,82 +2267,80 @@
 define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind {
 ; SI-LABEL: v5i64_arg:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x19
-; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x21
-; SI-NEXT:    s_mov_b32 s15, 0xf000
-; SI-NEXT:    s_mov_b32 s14, -1
+; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
 ; SI-NEXT:    v_mov_b32_e32 v2, s10
 ; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v1, s1
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32
+; SI-NEXT:    v_mov_b32_e32 v0, s12
+; SI-NEXT:    v_mov_b32_e32 v1, s13
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v5i64_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x64
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x84
+; VI-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx16 s[0:15], s[0:1], 0x64
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    s_add_u32 s8, s2, 16
-; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    s_addc_u32 s9, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v4, s8
-; VI-NEXT:    v_mov_b32_e32 v2, s10
-; VI-NEXT:    v_mov_b32_e32 v3, s11
-; VI-NEXT:    v_mov_b32_e32 v5, s9
-; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    s_add_u32 s4, s16, 16
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_addc_u32 s5, s17, 0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s16
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_add_u32 s0, s16, 32
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v5, s17
+; VI-NEXT:    s_addc_u32 s1, s17, 0
+; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT:    s_nop 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v5i64_arg:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x60
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_load_dwordx16 s[0:15], s[4:5], 0x40
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s12
-; GFX9-NEXT:    v_mov_b32_e32 v1, s13
-; GFX9-NEXT:    v_mov_b32_e32 v2, s14
-; GFX9-NEXT:    v_mov_b32_e32 v3, s15
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    v_mov_b32_e32 v2, s10
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17] offset:32
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v5i64_arg:
@@ -2412,82 +2408,80 @@
 define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind {
 ; SI-LABEL: v5f64_arg:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x19
-; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x21
-; SI-NEXT:    s_mov_b32 s15, 0xf000
-; SI-NEXT:    s_mov_b32 s14, -1
+; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
 ; SI-NEXT:    v_mov_b32_e32 v2, s10
 ; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v1, s1
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32
+; SI-NEXT:    v_mov_b32_e32 v0, s12
+; SI-NEXT:    v_mov_b32_e32 v1, s13
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v5f64_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x64
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x84
+; VI-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx16 s[0:15], s[0:1], 0x64
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    s_add_u32 s8, s2, 16
-; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    s_addc_u32 s9, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v4, s8
-; VI-NEXT:    v_mov_b32_e32 v2, s10
-; VI-NEXT:    v_mov_b32_e32 v3, s11
-; VI-NEXT:    v_mov_b32_e32 v5, s9
-; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    s_add_u32 s4, s16, 16
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_addc_u32 s5, s17, 0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s16
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_add_u32 s0, s16, 32
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v5, s17
+; VI-NEXT:    s_addc_u32 s1, s17, 0
+; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT:    s_nop 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v5f64_arg:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x60
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_load_dwordx16 s[0:15], s[4:5], 0x40
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s12
-; GFX9-NEXT:    v_mov_b32_e32 v1, s13
-; GFX9-NEXT:    v_mov_b32_e32 v2, s14
-; GFX9-NEXT:    v_mov_b32_e32 v3, s15
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    v_mov_b32_e32 v2, s10
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17] offset:32
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v5f64_arg:
diff --git a/llvm/test/CodeGen/X86/load-partial-dot-product.ll b/llvm/test/CodeGen/X86/load-partial-dot-product.ll
--- a/llvm/test/CodeGen/X86/load-partial-dot-product.ll
+++ b/llvm/test/CodeGen/X86/load-partial-dot-product.ll
@@ -130,14 +130,8 @@
 define float @dot3_float3(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
 ; SSE2-LABEL: dot3_float3:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE2-NEXT:    movups (%rdi), %xmm0
+; SSE2-NEXT:    movups (%rsi), %xmm1
 ; SSE2-NEXT:    mulps %xmm0, %xmm1
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
@@ -148,14 +142,8 @@
 ;
 ; SSSE3-LABEL: dot3_float3:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSSE3-NEXT:    movups (%rdi), %xmm0
+; SSSE3-NEXT:    movups (%rsi), %xmm1
 ; SSSE3-NEXT:    mulps %xmm0, %xmm1
 ; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    addss %xmm1, %xmm0
@@ -165,10 +153,8 @@
 ;
 ; SSE41-LABEL: dot3_float3:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; SSE41-NEXT:    movups (%rdi), %xmm0
+; SSE41-NEXT:    movups (%rsi), %xmm1
 ; SSE41-NEXT:    mulps %xmm0, %xmm1
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    addss %xmm1, %xmm0
@@ -178,11 +164,8 @@
 ;
 ; AVX-LABEL: dot3_float3:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovups (%rdi), %xmm0
+; AVX-NEXT:    vmulps (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0