diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2220,17 +2220,17 @@
   }
 
   // Align the pointer arguments to this call if the target thinks it's a good
-  // idea
-  unsigned MinSize;
-  Align PrefAlign;
-  if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
-    for (auto &Arg : CI->args()) {
-      // We want to align both objects whose address is used directly and
-      // objects whose address is used in casts and GEPs, though it only makes
-      // sense for GEPs if the offset is a multiple of the desired alignment and
-      // if size - offset meets the size threshold.
-      if (!Arg->getType()->isPointerTy())
-        continue;
+  // idea (generally only useful for memcpy/memmove/memset).
+  for (auto &Arg : CI->args()) {
+    // We want to align both objects whose address is used directly and
+    // objects whose address is used in casts and GEPs, though it only makes
+    // sense for GEPs if the offset is a multiple of the desired alignment and
+    // if size - offset meets the size threshold.
+    if (!Arg->getType()->isPointerTy())
+      continue;
+    unsigned MinSize;
+    Align PrefAlign;
+    if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
       APInt Offset(DL->getIndexSizeInBits(
                        cast<PointerType>(Arg->getType())->getAddressSpace()),
                    0);
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -19,16 +19,16 @@
 ; GCN-LABEL: test:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    ds_read_u8 v1, v0 offset:1
+; GCN-NEXT:    v_mov_b32_e32 v1, 2
+; GCN-NEXT:    ds_write_b8 v0, v1
 ; GCN-NEXT:    ds_read_u8 v2, v0 offset:2
+; GCN-NEXT:    ds_read_u16 v3, v0
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v3, 2
-; GCN-NEXT:    ds_write_b8 v0, v3
-; GCN-NEXT:    ds_write_b8 v0, v3 offset:4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    ds_write_b8 v0, v1 offset:5
 ; GCN-NEXT:    ds_write_b8 v0, v2 offset:6
-; GCN-NEXT:    v_mov_b32_e32 v1, 1
+; GCN-NEXT:    ds_write_b16 v0, v3 offset:4
+; GCN-NEXT:    v_cmp_eq_u16_sdwa s[2:3], v3, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
 ; GCN-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GCN-NEXT:    s_endpgm
 ; CHECK-LABEL: @test(
@@ -53,7 +53,6 @@
 ; CHECK-NEXT:    [[FROMBOOL8:%.*]] = zext i1 [[TMP10]] to i8
 ; CHECK-NEXT:    store i8 [[FROMBOOL8]], i8 addrspace(1)* [[PTR_COERCE:%.*]], align 1
 ; CHECK-NEXT:    ret void
-;
 entry:
   store i8 3, i8 addrspace(3)* getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), align 1
   tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i8 addrspace(3)* noundef align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i64 3, i1 false)
diff --git a/llvm/test/CodeGen/X86/mcu-abi.ll b/llvm/test/CodeGen/X86/mcu-abi.ll
--- a/llvm/test/CodeGen/X86/mcu-abi.ll
+++ b/llvm/test/CodeGen/X86/mcu-abi.ll
@@ -64,13 +64,14 @@
 define void @ret_large_struct(ptr noalias nocapture sret(%struct.st12_t) %agg.result, ptr byval(%struct.st12_t) nocapture readonly align 4 %r) #0 {
 ; CHECK-LABEL: ret_large_struct:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    movl %eax, %esi
-; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl $48, %ecx
-; CHECK-NEXT:    calll memcpy
-; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl $12, %ecx
+; CHECK-NEXT:    movl %eax, %edi
+; CHECK-NEXT:    rep;movsl (%esi), %es:(%edi)
 ; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
 entry:
   call void @llvm.memcpy.p0.p0.i32(ptr %agg.result, ptr %r, i32 48, i1 false)
diff --git a/llvm/test/CodeGen/X86/memset-2.ll b/llvm/test/CodeGen/X86/memset-2.ll
--- a/llvm/test/CodeGen/X86/memset-2.ll
+++ b/llvm/test/CodeGen/X86/memset-2.ll
@@ -1,31 +1,33 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=i386-apple-darwin9 -mcpu=yonah < %s | FileCheck %s
 
-define fastcc void @t1() nounwind {
+define fastcc void @t1(ptr nocapture %s) nounwind {
 ; CHECK-LABEL: t1:
 ; CHECK:       ## %bb.0: ## %entry
 ; CHECK-NEXT:    subl $16, %esp
 ; CHECK-NEXT:    pushl $188
 ; CHECK-NEXT:    pushl $0
-; CHECK-NEXT:    pushl $0
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    calll _memset
 ; CHECK-NEXT:    addl $16, %esp
 ; CHECK-NEXT:    ud2
 entry:
-  call void @llvm.memset.p0.i32(ptr null, i8 0, i32 188, i1 false)
+  call void @llvm.memset.p0.i32(ptr %s, i8 0, i32 188, i1 false)
   unreachable
 }
 
-define fastcc void @t2(i8 signext %c) nounwind {
+define fastcc void @t2(ptr nocapture %s, i8 signext %c) nounwind {
 ; CHECK-LABEL: t2:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    subl $12, %esp
-; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl $76, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    subl $16, %esp
+; CHECK-NEXT:    pushl $76
+; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    calll _memset
+; CHECK-NEXT:    addl $16, %esp
 ; CHECK-NEXT:    ud2
 entry:
-  call void @llvm.memset.p0.i32(ptr undef, i8 %c, i32 76, i1 false)
+  call void @llvm.memset.p0.i32(ptr %s, i8 %c, i32 76, i1 false)
   unreachable
 }
 
diff --git a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
--- a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
+++ b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
@@ -3,55 +3,57 @@
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=SLOW_32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=SLOW_64
 
-define void @bork() nounwind {
+define void @bork(ptr nocapture align 4 %dst) nounwind {
 ; FAST-LABEL: bork:
 ; FAST:       # %bb.0:
+; FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FAST-NEXT:    xorps %xmm0, %xmm0
-; FAST-NEXT:    movups %xmm0, 64
-; FAST-NEXT:    movups %xmm0, 48
-; FAST-NEXT:    movups %xmm0, 32
-; FAST-NEXT:    movups %xmm0, 16
-; FAST-NEXT:    movups %xmm0, 0
+; FAST-NEXT:    movups %xmm0, 64(%eax)
+; FAST-NEXT:    movups %xmm0, 48(%eax)
+; FAST-NEXT:    movups %xmm0, 32(%eax)
+; FAST-NEXT:    movups %xmm0, 16(%eax)
+; FAST-NEXT:    movups %xmm0, (%eax)
 ; FAST-NEXT:    retl
 ;
 ; SLOW_32-LABEL: bork:
 ; SLOW_32:       # %bb.0:
-; SLOW_32-NEXT:    movl $0, 4
-; SLOW_32-NEXT:    movl $0, 0
-; SLOW_32-NEXT:    movl $0, 12
-; SLOW_32-NEXT:    movl $0, 8
-; SLOW_32-NEXT:    movl $0, 20
-; SLOW_32-NEXT:    movl $0, 16
-; SLOW_32-NEXT:    movl $0, 28
-; SLOW_32-NEXT:    movl $0, 24
-; SLOW_32-NEXT:    movl $0, 36
-; SLOW_32-NEXT:    movl $0, 32
-; SLOW_32-NEXT:    movl $0, 44
-; SLOW_32-NEXT:    movl $0, 40
-; SLOW_32-NEXT:    movl $0, 52
-; SLOW_32-NEXT:    movl $0, 48
-; SLOW_32-NEXT:    movl $0, 60
-; SLOW_32-NEXT:    movl $0, 56
-; SLOW_32-NEXT:    movl $0, 68
-; SLOW_32-NEXT:    movl $0, 64
-; SLOW_32-NEXT:    movl $0, 76
-; SLOW_32-NEXT:    movl $0, 72
+; SLOW_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SLOW_32-NEXT:    movl $0, 4(%eax)
+; SLOW_32-NEXT:    movl $0, (%eax)
+; SLOW_32-NEXT:    movl $0, 12(%eax)
+; SLOW_32-NEXT:    movl $0, 8(%eax)
+; SLOW_32-NEXT:    movl $0, 20(%eax)
+; SLOW_32-NEXT:    movl $0, 16(%eax)
+; SLOW_32-NEXT:    movl $0, 28(%eax)
+; SLOW_32-NEXT:    movl $0, 24(%eax)
+; SLOW_32-NEXT:    movl $0, 36(%eax)
+; SLOW_32-NEXT:    movl $0, 32(%eax)
+; SLOW_32-NEXT:    movl $0, 44(%eax)
+; SLOW_32-NEXT:    movl $0, 40(%eax)
+; SLOW_32-NEXT:    movl $0, 52(%eax)
+; SLOW_32-NEXT:    movl $0, 48(%eax)
+; SLOW_32-NEXT:    movl $0, 60(%eax)
+; SLOW_32-NEXT:    movl $0, 56(%eax)
+; SLOW_32-NEXT:    movl $0, 68(%eax)
+; SLOW_32-NEXT:    movl $0, 64(%eax)
+; SLOW_32-NEXT:    movl $0, 76(%eax)
+; SLOW_32-NEXT:    movl $0, 72(%eax)
 ; SLOW_32-NEXT:    retl
 ;
 ; SLOW_64-LABEL: bork:
 ; SLOW_64:       # %bb.0:
-; SLOW_64-NEXT:    movq $0, 72
-; SLOW_64-NEXT:    movq $0, 64
-; SLOW_64-NEXT:    movq $0, 56
-; SLOW_64-NEXT:    movq $0, 48
-; SLOW_64-NEXT:    movq $0, 40
-; SLOW_64-NEXT:    movq $0, 32
-; SLOW_64-NEXT:    movq $0, 24
-; SLOW_64-NEXT:    movq $0, 16
-; SLOW_64-NEXT:    movq $0, 8
-; SLOW_64-NEXT:    movq $0, 0
+; SLOW_64-NEXT:    movq $0, 72(%rdi)
+; SLOW_64-NEXT:    movq $0, 64(%rdi)
+; SLOW_64-NEXT:    movq $0, 56(%rdi)
+; SLOW_64-NEXT:    movq $0, 48(%rdi)
+; SLOW_64-NEXT:    movq $0, 40(%rdi)
+; SLOW_64-NEXT:    movq $0, 32(%rdi)
+; SLOW_64-NEXT:    movq $0, 24(%rdi)
+; SLOW_64-NEXT:    movq $0, 16(%rdi)
+; SLOW_64-NEXT:    movq $0, 8(%rdi)
+; SLOW_64-NEXT:    movq $0, (%rdi)
 ; SLOW_64-NEXT:    retq
-  call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 80, i1 false)
+  call void @llvm.memset.p0.i64(ptr align 4 %dst, i8 0, i64 80, i1 false)
   ret void
 }
 
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/memset_chk-simplify-nobuiltin.ll b/llvm/test/Transforms/CodeGenPrepare/X86/memset_chk-simplify-nobuiltin.ll
--- a/llvm/test/Transforms/CodeGenPrepare/X86/memset_chk-simplify-nobuiltin.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/memset_chk-simplify-nobuiltin.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes
 ; RUN: opt -S -mtriple=x86_64 -disable-simplify-libcalls -codegenprepare < %s | FileCheck %s
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -10,8 +10,9 @@
 ; - TLI::has (always returns false thanks to -disable-simplify-libcalls)
 
 define void @test_nobuiltin(i8* %dst, i64 %len) {
-; CHECK-LABEL: @test_nobuiltin(
-; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST:%.*]], i8 0, i64 [[LEN:%.*]], i1 false) #1
+; CHECK-LABEL: define {{[^@]+}}@test_nobuiltin
+; CHECK-SAME: (i8* [[DST:%.*]], i64 [[LEN:%.*]]) {
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 1 [[DST]], i8 0, i64 [[LEN]], i1 false)
 ; CHECK-NEXT:    ret void
 ;
   call i8* @__memset_chk(i8* %dst, i32 0, i64 %len, i64 -1) nobuiltin