diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1909,10 +1909,10 @@
   /// the object whose address is being passed. If so then MinSize is set to the
   /// minimum size the object must be to be aligned and PrefAlign is set to the
   /// preferred alignment.
-  virtual bool shouldAlignPointerArgs(CallInst * /*CI*/, unsigned & /*MinSize*/,
-                                      Align & /*PrefAlign*/) const {
-    return false;
-  }
+  virtual bool shouldUpdatePointerArgAlignment(const CallInst *CI,
+                                               const Value *Arg,
+                                               unsigned &MinSize,
+                                               Align &PrefAlign) const;
 
   //===--------------------------------------------------------------------===//
   /// \name Helpers for TargetTransformInfo implementations
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2221,16 +2221,16 @@
 
   // Align the pointer arguments to this call if the target thinks it's a good
   // idea (generally only useful for memcpy/memmove/memset).
-  for (auto &Arg : CI->args()) {
-    // We want to align both objects whose address is used directly and
-    // objects whose address is used in casts and GEPs, though it only makes
-    // sense for GEPs if the offset is a multiple of the desired alignment and
-    // if size - offset meets the size threshold.
-    if (!Arg->getType()->isPointerTy())
-      continue;
+  for (auto &Arg : CI->args()) {
+    // We want to align both objects whose address is used directly and
+    // objects whose address is used in casts and GEPs, though it only makes
+    // sense for GEPs if the offset is a multiple of the desired alignment and
+    // if size - offset meets the size threshold.
+    if (!Arg->getType()->isPointerTy())
+      continue;
     unsigned MinSize;
     Align PrefAlign;
-    if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
+    if (TLI->shouldUpdatePointerArgAlignment(CI, Arg, MinSize, PrefAlign)) {
       APInt Offset(DL->getIndexSizeInBits(
                        cast<PointerType>(Arg->getType())->getAddressSpace()),
                    0);
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -43,6 +43,7 @@
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/Casting.h"
@@ -946,6 +947,39 @@
   return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
 }
 
+bool TargetLoweringBase::shouldUpdatePointerArgAlignment(
+    const CallInst *CI, const Value *Arg, unsigned &MinSize,
+    Align &PrefAlign) const {
+  // For now, we only adjust alignment for memcpy/memmove/memset calls.
+  auto *MemCI = dyn_cast<MemIntrinsic>(CI);
+  if (!MemCI)
+    return false;
+  // When building with -Oz, we only increase the alignment if the object is
+  // at least 8 bytes in size to avoid increased stack/global padding.
+  MinSize = CI->getFunction()->hasMinSize() ? 8 : 1;
+  auto AddrSpace = MemCI->getDestAddressSpace();
+  const DataLayout &DL = CI->getModule()->getDataLayout();
+  auto PointerSize = DL.getPointerSize(AddrSpace);
+
+  // We assume that loads/stores of values aligned to pointer size are fast.
+  PrefAlign = Align(PointerSize);
+  if (Arg->getPointerAlignment(DL) > PrefAlign)
+    return false; // Already aligned, no need to update it.
+
+  // XXX: we could determine the MachineMemOperand flags instead of assuming
+  // load+store (but it probably makes no difference for supported targets).
+  bool FastUnalignedAccess = false;
+  if (allowsMisalignedMemoryAccesses(
+          llvm::LLT::pointer(AddrSpace, PointerSize * 8), AddrSpace,
+          Arg->getPointerAlignment(DL),
+          MachineMemOperand::MOStore | MachineMemOperand::MOLoad,
+          &FastUnalignedAccess)) {
+    // If unaligned loads/stores are fast, there is no need to adjust alignment.
+    return !FastUnalignedAccess;
+  }
+  return true; // Unaligned accesses are not possible or slow.
+}
+
 void TargetLoweringBase::setJumpIsExpensive(bool isExpensive) {
   // If the command-line option was specified, ignore this request.
   if (!JumpIsExpensiveOverride.getNumOccurrences())
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -572,8 +572,9 @@
   const TargetRegisterClass *
   getRegClassFor(MVT VT, bool isDivergent = false) const override;
 
-  bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
-                              Align &PrefAlign) const override;
+  bool shouldUpdatePointerArgAlignment(const CallInst *CI, const Value *Arg,
+                                       unsigned &MinSize,
+                                       Align &PrefAlign) const override;
 
   /// createFastISel - This method returns a target specific FastISel object,
   /// or null if the target does not support "fast" ISel.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1919,11 +1919,12 @@
 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
 // source/dest is aligned and the copy size is large enough. We therefore want
 // to align such objects passed to memory intrinsics.
-bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
-                                               Align &PrefAlign) const {
+bool ARMTargetLowering::shouldUpdatePointerArgAlignment(
+    const CallInst *CI, const Value *Arg, unsigned &MinSize,
+    Align &PrefAlign) const {
   if (!isa<MemIntrinsic>(CI))
     return false;
-  MinSize = 8;
+  MinSize = 8; // TODO: should this depend on -Oz?
   // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
   // cycle faster than 4-byte aligned LDM.
   PrefAlign =
diff --git a/llvm/test/CodeGen/BPF/rodata_5.ll b/llvm/test/CodeGen/BPF/rodata_5.ll
--- a/llvm/test/CodeGen/BPF/rodata_5.ll
+++ b/llvm/test/CodeGen/BPF/rodata_5.ll
@@ -35,8 +35,8 @@
 }
 ; CHECK-NOT: w{{[0-9]+}} = *(u16 *)
 ; CHECK-NOT: w{{[0-9]+}} = *(u8 *)
-; CHECK: *(u16 *)(r10 - 4) = w{{[0-9]+}}
-; CHECK: *(u8 *)(r10 - 2) = w{{[0-9]+}}
+; CHECK: *(u16 *)(r10 - 8) = w{{[0-9]+}}
+; CHECK: *(u8 *)(r10 - 6) = w{{[0-9]+}}
 
 ; Function Attrs: argmemonly nounwind willreturn
 declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
diff --git a/llvm/test/CodeGen/RISCV/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/memcpy-inline.ll
--- a/llvm/test/CodeGen/RISCV/memcpy-inline.ll
+++ b/llvm/test/CodeGen/RISCV/memcpy-inline.ll
@@ -295,50 +295,35 @@
 }
 
 define void @t6() nounwind {
-; RV32ALIGNED-LABEL: t6:
-; RV32ALIGNED:       # %bb.0: # %entry
-; RV32ALIGNED-NEXT:    addi sp, sp, -16
-; RV32ALIGNED-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32ALIGNED-NEXT:    lui a0, %hi(spool.splbuf)
-; RV32ALIGNED-NEXT:    addi a0, a0, %lo(spool.splbuf)
-; RV32ALIGNED-NEXT:    lui a1, %hi(.L.str6)
-; RV32ALIGNED-NEXT:    addi a1, a1, %lo(.L.str6)
-; RV32ALIGNED-NEXT:    li a2, 14
-; RV32ALIGNED-NEXT:    call memcpy@plt
-; RV32ALIGNED-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32ALIGNED-NEXT:    addi sp, sp, 16
-; RV32ALIGNED-NEXT:    ret
+; RV32-LABEL: t6:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    lui a0, %hi(spool.splbuf)
+; RV32-NEXT:    li a1, 88
+; RV32-NEXT:    sh a1, %lo(spool.splbuf+12)(a0)
+; RV32-NEXT:    lui a1, 361862
+; RV32-NEXT:    addi a1, a1, -1960
+; RV32-NEXT:    sw a1, %lo(spool.splbuf+8)(a0)
+; RV32-NEXT:    lui a1, 362199
+; RV32-NEXT:    addi a1, a1, 559
+; RV32-NEXT:    sw a1, %lo(spool.splbuf+4)(a0)
+; RV32-NEXT:    lui a1, 460503
+; RV32-NEXT:    addi a1, a1, 1071
+; RV32-NEXT:    sw a1, %lo(spool.splbuf)(a0)
+; RV32-NEXT:    ret
 ;
 ; RV64ALIGNED-LABEL: t6:
 ; RV64ALIGNED:       # %bb.0: # %entry
-; RV64ALIGNED-NEXT:    addi sp, sp, -16
-; RV64ALIGNED-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ALIGNED-NEXT:    lui a0, %hi(spool.splbuf)
-; RV64ALIGNED-NEXT:    addi a0, a0, %lo(spool.splbuf)
-; RV64ALIGNED-NEXT:    lui a1, %hi(.L.str6)
-; RV64ALIGNED-NEXT:    addi a1, a1, %lo(.L.str6)
-; RV64ALIGNED-NEXT:    li a2, 14
-; RV64ALIGNED-NEXT:    call memcpy@plt
-; RV64ALIGNED-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64ALIGNED-NEXT:    addi sp, sp, 16
+; RV64ALIGNED-NEXT:    li a1, 88
+; RV64ALIGNED-NEXT:    sh a1, %lo(spool.splbuf+12)(a0)
+; RV64ALIGNED-NEXT:    lui a1, %hi(.LCPI6_0)
+; RV64ALIGNED-NEXT:    ld a1, %lo(.LCPI6_0)(a1)
+; RV64ALIGNED-NEXT:    lui a2, 361862
+; RV64ALIGNED-NEXT:    addiw a2, a2, -1960
+; RV64ALIGNED-NEXT:    sw a2, %lo(spool.splbuf+8)(a0)
+; RV64ALIGNED-NEXT:    sd a1, %lo(spool.splbuf)(a0)
 ; RV64ALIGNED-NEXT:    ret
 ;
-; RV32UNALIGNED-LABEL: t6:
-; RV32UNALIGNED:       # %bb.0: # %entry
-; RV32UNALIGNED-NEXT:    lui a0, %hi(spool.splbuf)
-; RV32UNALIGNED-NEXT:    li a1, 88
-; RV32UNALIGNED-NEXT:    sh a1, %lo(spool.splbuf+12)(a0)
-; RV32UNALIGNED-NEXT:    lui a1, 361862
-; RV32UNALIGNED-NEXT:    addi a1, a1, -1960
-; RV32UNALIGNED-NEXT:    sw a1, %lo(spool.splbuf+8)(a0)
-; RV32UNALIGNED-NEXT:    lui a1, 362199
-; RV32UNALIGNED-NEXT:    addi a1, a1, 559
-; RV32UNALIGNED-NEXT:    sw a1, %lo(spool.splbuf+4)(a0)
-; RV32UNALIGNED-NEXT:    lui a1, 460503
-; RV32UNALIGNED-NEXT:    addi a1, a1, 1071
-; RV32UNALIGNED-NEXT:    sw a1, %lo(spool.splbuf)(a0)
-; RV32UNALIGNED-NEXT:    ret
-;
 ; RV64UNALIGNED-LABEL: t6:
 ; RV64UNALIGNED:       # %bb.0: # %entry
 ; RV64UNALIGNED-NEXT:    lui a0, %hi(.L.str6)
diff --git a/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll b/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll
--- a/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll
+++ b/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll
@@ -160,7 +160,7 @@
 ; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
 ; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 112
 ; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 8
 ; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
 ; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100
 ; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L4]], $pop[[L5]]
@@ -178,7 +178,7 @@
 ; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
 ; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 112
 ; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 8
 ; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
 ; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100
 ; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L4]], $0, $pop[[L5]]
@@ -196,7 +196,7 @@
 ; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
 ; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 112
 ; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 8
 ; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
 ; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100
 ; BULK-MEM-NEXT: memory.fill 0, $pop[[L4]], $0, $pop[[L5]]
diff --git a/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll b/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll
--- a/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll
@@ -18,7 +18,7 @@
   ; ALL:   [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.retval
   ; ALL:   [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1.f
   ; ALL:   G_STORE [[COPY]](s32), [[FRAME_INDEX1]](p0) :: (store (s32) into %ir.coerce.dive2)
-  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.0, align 4), (load (s8) from %ir.1, align 4)
+  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.0, align 8), (load (s8) from %ir.1, align 8)
   ; ALL:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load (s32) from %ir.coerce.dive13)
   ; ALL:   $xmm0 = COPY [[LOAD]](s32)
   ; ALL:   RET 0, implicit $xmm0
@@ -108,7 +108,7 @@
   ; ALL:   [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.retval
   ; ALL:   [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1.i
   ; ALL:   G_STORE [[COPY]](s32), [[FRAME_INDEX1]](p0) :: (store (s32) into %ir.coerce.dive2)
-  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.0, align 4), (load (s8) from %ir.1, align 4)
+  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.0, align 8), (load (s8) from %ir.1, align 8)
   ; ALL:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load (s32) from %ir.coerce.dive13)
   ; ALL:   $eax = COPY [[LOAD]](s32)
   ; ALL:   RET 0, implicit $eax
@@ -134,7 +134,7 @@
   ; ALL:   [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.retval
   ; ALL:   [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1.i
   ; ALL:   G_STORE [[COPY]](s64), [[FRAME_INDEX1]](p0) :: (store (s64) into %ir.0, align 4)
-  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.1, align 4), (load (s8) from %ir.2, align 4)
+  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.1, align 8), (load (s8) from %ir.2, align 8)
   ; ALL:   [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load (s64) from %ir.3, align 4)
   ; ALL:   $rax = COPY [[LOAD]](s64)
   ; ALL:   RET 0, implicit $rax
@@ -166,9 +166,9 @@
   ; ALL:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
   ; ALL:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX2]], [[C1]](s64)
   ; ALL:   G_STORE [[COPY1]](s32), [[PTR_ADD]](p0) :: (store (s32) into %ir.1)
-  ; ALL:   G_MEMCPY [[FRAME_INDEX1]](p0), [[FRAME_INDEX2]](p0), [[C]](s64), 0 :: (store (s8) into %ir.2, align 4), (load (s8) from %ir.3, align 4)
-  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.4, align 4), (load (s8) from %ir.5, align 4)
-  ; ALL:   G_MEMCPY [[FRAME_INDEX3]](p0), [[FRAME_INDEX]](p0), [[C]](s64), 0 :: (store (s8) into %ir.6, align 8), (load (s8) from %ir.7, align 4)
+  ; ALL:   G_MEMCPY [[FRAME_INDEX1]](p0), [[FRAME_INDEX2]](p0), [[C]](s64), 0 :: (store (s8) into %ir.2, align 8), (load (s8) from %ir.3, align 8)
+  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.4, align 8), (load (s8) from %ir.5, align 8)
+  ; ALL:   G_MEMCPY [[FRAME_INDEX3]](p0), [[FRAME_INDEX]](p0), [[C]](s64), 0 :: (store (s8) into %ir.6, align 8), (load (s8) from %ir.7, align 8)
   ; ALL:   [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX3]](p0) :: (dereferenceable load (s64) from %ir.tmp)
   ; ALL:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX3]], [[C1]](s64)
   ; ALL:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (dereferenceable load (s32) from %ir.tmp + 8, align 8)
@@ -210,7 +210,7 @@
   ; ALL:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
   ; ALL:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX1]], [[C1]](s64)
   ; ALL:   G_STORE [[COPY1]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.2, align 4)
-  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.3, align 4), (load (s8) from %ir.4, align 4)
+  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.3, align 8), (load (s8) from %ir.4, align 8)
   ; ALL:   [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load (s64) from %ir.5, align 4)
   ; ALL:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s64)
   ; ALL:   [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p0) :: (dereferenceable load (s64) from %ir.5 + 8, align 4)
diff --git a/llvm/test/CodeGen/X86/load-local-v4i5.ll b/llvm/test/CodeGen/X86/load-local-v4i5.ll
--- a/llvm/test/CodeGen/X86/load-local-v4i5.ll
+++ b/llvm/test/CodeGen/X86/load-local-v4i5.ll
@@ -7,10 +7,10 @@
 ; CHECK-LABEL: _start:
 ; CHECK:       # %bb.0: # %Entry
 ; CHECK-NEXT:    movl __unnamed_1(%rip), %eax
-; CHECK-NEXT:    movl %eax, -12(%rsp)
-; CHECK-NEXT:    movzbl -9(%rsp), %ecx
-; CHECK-NEXT:    movzbl -10(%rsp), %edx
-; CHECK-NEXT:    movzbl -11(%rsp), %esi
+; CHECK-NEXT:    movl %eax, -16(%rsp)
+; CHECK-NEXT:    movzbl -13(%rsp), %ecx
+; CHECK-NEXT:    movzbl -14(%rsp), %edx
+; CHECK-NEXT:    movzbl -15(%rsp), %esi
 ; CHECK-NEXT:    movzbl %cl, %edi
 ; CHECK-NEXT:    shrb %cl
 ; CHECK-NEXT:    movb %cl, -2(%rsp)
diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/adjust-memintrin-alignment.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/adjust-memintrin-alignment.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/adjust-memintrin-alignment.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --force-update
+; RUN: opt -mtriple=riscv64 -data-layout="e-m:e-p:32:32" -S -codegenprepare < %s \
+; RUN:   | FileCheck %s '-D#NEW_ALIGNMENT=4'
+; RUN: opt -mtriple=riscv32 -data-layout="e-m:e-p:64:64" -S -codegenprepare < %s \
+; RUN:   | FileCheck %s '-D#NEW_ALIGNMENT=8'
+
+@str = private unnamed_addr constant [45 x i8] c"THIS IS A LONG STRING THAT SHOULD BE ALIGNED\00", align 1
+
+
+declare void @use(ptr %arg)
+
+
+; CHECK: @[[STR:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr constant [45 x i8] c"THIS IS A LONG STRING THAT SHOULD BE ALIGNED\00", align [[#NEW_ALIGNMENT]]
+
+define void @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DST:%.*]] = alloca [45 x i8], align [[#NEW_ALIGNMENT]]
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i32(ptr align [[#NEW_ALIGNMENT]] [[DST]], ptr align [[#NEW_ALIGNMENT]] dereferenceable(31) @str, i32 31, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst = alloca [45 x i8], align 1
+  tail call void @llvm.memcpy.p0.p0.i32(ptr align 1 %dst, ptr align 1 dereferenceable(31) @str, i32 31, i1 false)
+  ; Ensure the alloca is used to avoid it being optimized out.
+  ; call void @use(ptr %dst)
+  ret void
+}
+
+declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1)
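
Note (illustration, not part of the patch): targets that do not override the new hook now get the generic TargetLoweringBase behaviour above, which derives PrefAlign from the DataLayout pointer size of the destination address space; that is what the BPF, RISC-V, WebAssembly and X86 test updates reflect. A backend that wants its own policy only has to override shouldUpdatePointerArgAlignment(). The sketch below mirrors the ARM override; "MyTargetLowering" and the 16-byte numbers are hypothetical placeholders, not anything introduced by this patch.

// Illustrative sketch only -- not part of this patch. "MyTargetLowering" is a
// hypothetical TargetLowering subclass; the structure mirrors the ARM override.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"

bool MyTargetLowering::shouldUpdatePointerArgAlignment(
    const CallInst *CI, const Value *Arg, unsigned &MinSize,
    Align &PrefAlign) const {
  // Like the default implementation, only consider pointers that are passed
  // to memory intrinsics (memcpy/memmove/memset).
  if (!isa<MemIntrinsic>(CI))
    return false;
  // Hypothetical policy: do not pad objects smaller than 16 bytes, and ask
  // for 16-byte alignment so wider load/store sequences can be used.
  MinSize = 16;
  PrefAlign = Align(16);
  // Only request an update when the argument is not already aligned enough.
  const DataLayout &DL = CI->getModule()->getDataLayout();
  return Arg->getPointerAlignment(DL) < PrefAlign;
}

CodeGenPrepare then raises the alignment of the underlying alloca or global only when the object is at least MinSize bytes past the pointer's offset and that offset is a multiple of PrefAlign, as the comment in the updated CodeGenPrepare loop above describes.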