diff --git a/llvm/test/CodeGen/X86/byval.ll b/llvm/test/CodeGen/X86/byval.ll
--- a/llvm/test/CodeGen/X86/byval.ll
+++ b/llvm/test/CodeGen/X86/byval.ll
@@ -1,15 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck -check-prefix=X86-64 %s
 ; Win64 has not supported byval yet.
 ; RUN: llc < %s -mtriple=i686-- | FileCheck -check-prefix=X86 %s
 
-; X86: movl	4(%esp), %eax
-; X86: movl	8(%esp), %edx
-
-; X86-64: movq	8(%rsp), %rax
-
 %struct.s = type { i64, i64, i64 }
 
 define i64 @f(%struct.s* byval %a) {
+; X86-64-LABEL: f:
+; X86-64:       # %bb.0: # %entry
+; X86-64-NEXT:    movq 8(%rsp), %rax
+; X86-64-NEXT:    retq
+;
+; X86-LABEL: f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl 4(%esp), %eax
+; X86-NEXT:    movl 8(%esp), %edx
+; X86-NEXT:    retl
 entry:
 	%tmp2 = getelementptr %struct.s, %struct.s* %a, i32 0, i32 0
 	%tmp3 = load i64, i64* %tmp2, align 8
diff --git a/llvm/test/CodeGen/X86/byval2.ll b/llvm/test/CodeGen/X86/byval2.ll
--- a/llvm/test/CodeGen/X86/byval2.ll
+++ b/llvm/test/CodeGen/X86/byval2.ll
@@ -1,34 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s -check-prefix=X64
-; X64-NOT:     movsq
-; X64:     rep
-; X64-NOT:     rep
-; X64:     movsq
-; X64-NOT:     movsq
-; X64:     rep
-; X64-NOT:     rep
-; X64:     movsq
-; X64-NOT:     rep
-; X64-NOT:     movsq
-
 ; Win64 has not supported byval yet.
-
 ; RUN: llc < %s -mtriple=i686-- -mattr=-avx | FileCheck %s -check-prefix=X32
-; X32-NOT:     movsl
-; X32:     rep
-; X32-NOT:     rep
-; X32:     movsl
-; X32-NOT:     movsl
-; X32:     rep
-; X32-NOT:     rep
-; X32:     movsl
-; X32-NOT:     rep
-; X32-NOT:     movsl
 
 %struct.s = type { i64, i64, i64, i64, i64, i64, i64, i64,
                    i64, i64, i64, i64, i64, i64, i64, i64,
                    i64 }
 
 define void @g(i64 %a, i64 %b, i64 %c) {
+; X64-LABEL: g:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    subq $288, %rsp # imm = 0x120
+; X64-NEXT:    .cfi_def_cfa_offset 304
+; X64-NEXT:    .cfi_offset %rbx, -16
+; X64-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    movl $17, %ecx
+; X64-NEXT:    movq %rsp, %rdi
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    rep;movsq (%rsi), %es:(%rdi)
+; X64-NEXT:    callq f
+; X64-NEXT:    movl $17, %ecx
+; X64-NEXT:    movq %rsp, %rdi
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    rep;movsq (%rsi), %es:(%rdi)
+; X64-NEXT:    callq f
+; X64-NEXT:    addq $288, %rsp # imm = 0x120
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    retq
+;
+; X32-LABEL: g:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    .cfi_def_cfa_register %ebp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $288, %esp # imm = 0x120
+; X32-NEXT:    .cfi_offset %esi, -20
+; X32-NEXT:    .cfi_offset %edi, -16
+; X32-NEXT:    .cfi_offset %ebx, -12
+; X32-NEXT:    movl 12(%ebp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl 20(%ebp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl 16(%ebp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl 28(%ebp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl 24(%ebp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl $34, %ecx
+; X32-NEXT:    movl %esp, %edi
+; X32-NEXT:    movl %ebx, %esi
+; X32-NEXT:    rep;movsl (%esi), %es:(%edi)
+; X32-NEXT:    calll f
+; X32-NEXT:    movl $34, %ecx
+; X32-NEXT:    movl %esp, %edi
+; X32-NEXT:    movl %ebx, %esi
+; X32-NEXT:    rep;movsl (%esi), %es:(%edi)
+; X32-NEXT:    calll f
+; X32-NEXT:    leal -12(%ebp), %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    .cfi_def_cfa %esp, 4
+; X32-NEXT:    retl
 entry:
 	%d = alloca %struct.s, align 16
 	%tmp = getelementptr %struct.s, %struct.s* %d, i32 0, i32 0
diff --git a/llvm/test/CodeGen/X86/byval3.ll b/llvm/test/CodeGen/X86/byval3.ll
--- a/llvm/test/CodeGen/X86/byval3.ll
+++ b/llvm/test/CodeGen/X86/byval3.ll
@@ -1,28 +1,7 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s -check-prefix=X64
-; X64-NOT:     movsq
-; X64:     rep
-; X64-NOT:     rep
-; X64:     movsq
-; X64-NOT:     movsq
-; X64:     rep
-; X64-NOT:     rep
-; X64:     movsq
-; X64-NOT:     rep
-; X64-NOT:     movsq
-
 ; Win64 has not supported byval yet.
-
 ; RUN: llc < %s -mtriple=i686-- -mattr=-avx | FileCheck %s -check-prefix=X32
-; X32-NOT:     movsl
-; X32:     rep
-; X32-NOT:     rep
-; X32:     movsl
-; X32-NOT:     movsl
-; X32:     rep
-; X32-NOT:     rep
-; X32:     movsl
-; X32-NOT:     rep
-; X32-NOT:     movsl
 
 %struct.s = type { i32, i32, i32, i32, i32, i32, i32, i32,
                    i32, i32, i32, i32, i32, i32, i32, i32,
@@ -31,6 +10,73 @@
                    i32 }
 
 define void @g(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6) nounwind {
+; X64-LABEL: g:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    subq $288, %rsp # imm = 0x120
+; X64-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %esi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %edx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %ecx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %r8d, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %r9d, {{[0-9]+}}(%rsp)
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    movl $16, %ecx
+; X64-NEXT:    movq %rsp, %rdi
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    rep;movsq (%rsi), %es:(%rdi)
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    movl %eax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    callq f
+; X64-NEXT:    movl $16, %ecx
+; X64-NEXT:    movq %rsp, %rdi
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    rep;movsq (%rsi), %es:(%rdi)
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    movl %eax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    callq f
+; X64-NEXT:    addq $288, %rsp # imm = 0x120
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    retq
+;
+; X32-LABEL: g:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $288, %esp # imm = 0x120
+; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl 12(%ebp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl 16(%ebp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl 20(%ebp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl 24(%ebp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl 28(%ebp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl $33, %ecx
+; X32-NEXT:    movl %esp, %edi
+; X32-NEXT:    movl %ebx, %esi
+; X32-NEXT:    rep;movsl (%esi), %es:(%edi)
+; X32-NEXT:    calll f
+; X32-NEXT:    movl $33, %ecx
+; X32-NEXT:    movl %esp, %edi
+; X32-NEXT:    movl %ebx, %esi
+; X32-NEXT:    rep;movsl (%esi), %es:(%edi)
+; X32-NEXT:    calll f
+; X32-NEXT:    leal -12(%ebp), %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
 entry:
         %d = alloca %struct.s, align 16
         %tmp = getelementptr %struct.s, %struct.s* %d, i32 0, i32 0
diff --git a/llvm/test/CodeGen/X86/byval4.ll b/llvm/test/CodeGen/X86/byval4.ll
--- a/llvm/test/CodeGen/X86/byval4.ll
+++ b/llvm/test/CodeGen/X86/byval4.ll
@@ -1,28 +1,7 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s -check-prefix=X64
-; X64-NOT:     movsq
-; X64:     rep
-; X64-NOT:     rep
-; X64:     movsq
-; X64-NOT:     movsq
-; X64:     rep
-; X64-NOT:     rep
-; X64:     movsq
-; X64-NOT:     rep
-; X64-NOT:     movsq
-
 ; Win64 has not supported byval yet.
-
 ; RUN: llc < %s -mtriple=i686-- -mattr=-avx | FileCheck %s -check-prefix=X32
-; X32-NOT:     movsl
-; X32:     rep
-; X32-NOT:     rep
-; X32:     movsl
-; X32-NOT:     movsl
-; X32:     rep
-; X32-NOT:     rep
-; X32:     movsl
-; X32-NOT:     rep
-; X32-NOT:     movsl
 
 %struct.s = type { i16, i16, i16, i16, i16, i16, i16, i16,
                    i16, i16, i16, i16, i16, i16, i16, i16,
@@ -36,6 +15,77 @@
 
 
 define void @g(i16 signext  %a1, i16 signext  %a2, i16 signext  %a3,
+; X64-LABEL: g:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    subq $288, %rsp # imm = 0x120
+; X64-NEXT:    movw %di, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movw %si, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movw %r8w, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movw %r9w, {{[0-9]+}}(%rsp)
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    movl $16, %ecx
+; X64-NEXT:    movq %rsp, %rdi
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    rep;movsq (%rsi), %es:(%rdi)
+; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    callq f
+; X64-NEXT:    movl $16, %ecx
+; X64-NEXT:    movq %rsp, %rdi
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    rep;movsq (%rsi), %es:(%rdi)
+; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    callq f
+; X64-NEXT:    addq $288, %rsp # imm = 0x120
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    retq
+;
+; X32-LABEL: g:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $288, %esp # imm = 0x120
+; X32-NEXT:    movzwl 8(%ebp), %eax
+; X32-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movzwl 12(%ebp), %eax
+; X32-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movzwl 16(%ebp), %eax
+; X32-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movzwl 20(%ebp), %eax
+; X32-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movzwl 24(%ebp), %eax
+; X32-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movzwl 28(%ebp), %eax
+; X32-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl $32, %ecx
+; X32-NEXT:    movl %esp, %edi
+; X32-NEXT:    movl %ebx, %esi
+; X32-NEXT:    rep;movsl (%esi), %es:(%edi)
+; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X32-NEXT:    calll f
+; X32-NEXT:    movl $32, %ecx
+; X32-NEXT:    movl %esp, %edi
+; X32-NEXT:    movl %ebx, %esi
+; X32-NEXT:    rep;movsl (%esi), %es:(%edi)
+; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X32-NEXT:    calll f
+; X32-NEXT:    leal -12(%ebp), %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
 	 i16 signext  %a4, i16 signext  %a5, i16 signext  %a6) nounwind {
 entry:
         %a = alloca %struct.s, align 16
diff --git a/llvm/test/CodeGen/X86/byval5.ll b/llvm/test/CodeGen/X86/byval5.ll
--- a/llvm/test/CodeGen/X86/byval5.ll
+++ b/llvm/test/CodeGen/X86/byval5.ll
@@ -1,28 +1,7 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s -check-prefix=X64
-; X64-NOT:     movsq
-; X64:     rep
-; X64-NOT:     rep
-; X64:     movsq
-; X64-NOT:     movsq
-; X64:     rep
-; X64-NOT:     rep
-; X64:     movsq
-; X64-NOT:     rep
-; X64-NOT:     movsq
-
 ; Win64 has not supported byval yet.
-
 ; RUN: llc < %s -mtriple=i686-- -mattr=-avx | FileCheck %s -check-prefix=X32
-; X32-NOT:     movsl
-; X32:     rep
-; X32-NOT:     rep
-; X32:     movsl
-; X32-NOT:     movsl
-; X32:     rep
-; X32-NOT:     rep
-; X32:     movsl
-; X32-NOT:     rep
-; X32-NOT:     movsl
 
 %struct.s = type { i8, i8, i8, i8, i8, i8, i8, i8,
                    i8, i8, i8, i8, i8, i8, i8, i8,
@@ -44,6 +23,89 @@
 
 
 define void @g(i8 signext  %a1, i8 signext  %a2, i8 signext  %a3,
+; X64-LABEL: g:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    subq $272, %rsp # imm = 0x110
+; X64-NEXT:    .cfi_def_cfa_offset 288
+; X64-NEXT:    .cfi_offset %rbx, -16
+; X64-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movb %dl, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movb %cl, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movb %r8b, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movb %r9b, {{[0-9]+}}(%rsp)
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    movl $16, %ecx
+; X64-NEXT:    movq %rsp, %rdi
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    rep;movsq (%rsi), %es:(%rdi)
+; X64-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; X64-NEXT:    movb %al, {{[0-9]+}}(%rsp)
+; X64-NEXT:    callq f
+; X64-NEXT:    movl $16, %ecx
+; X64-NEXT:    movq %rsp, %rdi
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    rep;movsq (%rsi), %es:(%rdi)
+; X64-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; X64-NEXT:    movb %al, {{[0-9]+}}(%rsp)
+; X64-NEXT:    callq f
+; X64-NEXT:    addq $272, %rsp # imm = 0x110
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    retq
+;
+; X32-LABEL: g:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    .cfi_def_cfa_register %ebp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    andl $-8, %esp
+; X32-NEXT:    subl $272, %esp # imm = 0x110
+; X32-NEXT:    .cfi_offset %esi, -20
+; X32-NEXT:    .cfi_offset %edi, -16
+; X32-NEXT:    .cfi_offset %ebx, -12
+; X32-NEXT:    movb 28(%ebp), %al
+; X32-NEXT:    movb 24(%ebp), %cl
+; X32-NEXT:    movb 20(%ebp), %dl
+; X32-NEXT:    movb 16(%ebp), %ah
+; X32-NEXT:    movb 12(%ebp), %ch
+; X32-NEXT:    movb 8(%ebp), %dh
+; X32-NEXT:    movb %dh, {{[0-9]+}}(%esp)
+; X32-NEXT:    movb %ch, {{[0-9]+}}(%esp)
+; X32-NEXT:    movb %ah, {{[0-9]+}}(%esp)
+; X32-NEXT:    movb %dl, {{[0-9]+}}(%esp)
+; X32-NEXT:    movb %cl, {{[0-9]+}}(%esp)
+; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl $32, %ecx
+; X32-NEXT:    movl %esp, %edi
+; X32-NEXT:    movl %ebx, %esi
+; X32-NEXT:    rep;movsl (%esi), %es:(%edi)
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT:    calll f
+; X32-NEXT:    movl $32, %ecx
+; X32-NEXT:    movl %esp, %edi
+; X32-NEXT:    movl %ebx, %esi
+; X32-NEXT:    rep;movsl (%esi), %es:(%edi)
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT:    calll f
+; X32-NEXT:    leal -12(%ebp), %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    .cfi_def_cfa %esp, 4
+; X32-NEXT:    retl
 	 i8 signext  %a4, i8 signext  %a5, i8 signext  %a6) {
 entry:
         %a = alloca %struct.s
diff --git a/llvm/test/CodeGen/X86/byval6.ll b/llvm/test/CodeGen/X86/byval6.ll
--- a/llvm/test/CodeGen/X86/byval6.ll
+++ b/llvm/test/CodeGen/X86/byval6.ll
@@ -1,3 +1,4 @@
+; NOTE: No FileCheck assertions are autogenerated for this file; the RUN line below checks llc output with grep.
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-- | grep add | not grep 16
 
 	%struct.W = type { x86_fp80, x86_fp80 }
@@ -6,8 +7,8 @@
 
 define i32 @main() nounwind  {
 entry:
-	tail call void (i32, ...) @bar( i32 3, %struct.W* byval  @.cpx ) nounwind 
-	tail call void (i32, ...) @baz( i32 3, %struct.W* byval  @B ) nounwind 
+	tail call void (i32, ...) @bar( i32 3, %struct.W* byval  @.cpx ) nounwind
+	tail call void (i32, ...) @baz( i32 3, %struct.W* byval  @B ) nounwind
 	ret i32 undef
 }
 
diff --git a/llvm/test/CodeGen/X86/byval7.ll b/llvm/test/CodeGen/X86/byval7.ll
--- a/llvm/test/CodeGen/X86/byval7.ll
+++ b/llvm/test/CodeGen/X86/byval7.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-- -mcpu=yonah | FileCheck %s
 
 	%struct.S = type { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>,
@@ -5,12 +6,29 @@
                            <2 x i64> }
 
 define i32 @main() nounwind  {
-entry:
 ; CHECK-LABEL: main:
-; CHECK: leal 16(%esp), %edi
-; CHECK: leal 160(%esp), %esi
-; CHECK: rep;movsl
-; CHECK: movl $1, (%esp)
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    andl $-16, %esp
+; CHECK-NEXT:    subl $304, %esp # imm = 0x130
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [3,2,1,0]
+; CHECK-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl $36, %ecx
+; CHECK-NEXT:    rep;movsl (%esi), %es:(%edi)
+; CHECK-NEXT:    movl $1, (%esp)
+; CHECK-NEXT:    calll t
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    leal -8(%ebp), %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
+entry:
 	%s = alloca %struct.S		; <%struct.S*> [#uses=2]
 	%tmp15 = getelementptr %struct.S, %struct.S* %s, i32 0, i32 0		; <<2 x i64>*> [#uses=1]
 	store <2 x i64> < i64 8589934595, i64 1 >, <2 x i64>* %tmp15, align 16
diff --git a/llvm/test/CodeGen/X86/mcu-abi.ll b/llvm/test/CodeGen/X86/mcu-abi.ll
--- a/llvm/test/CodeGen/X86/mcu-abi.ll
+++ b/llvm/test/CodeGen/X86/mcu-abi.ll
@@ -1,13 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-pc-elfiamcu | FileCheck %s
 
 %struct.st12_t = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
 
-; CHECK-LABEL: test_ints:
-; CHECK: addl    %edx, %eax
-; CHECK-NEXT: imull   %ecx, %eax
-; CHECK-NEXT: addl    4(%esp), %eax
-; CHECK-NEXT: retl
 define i32 @test_ints(i32 %a, i32 %b, i32 %c, i32 %d) #0 {
+; CHECK-LABEL: test_ints:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    imull %ecx, %eax
+; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    retl
 entry:
   %r1 = add i32 %b, %a
   %r2 = mul i32 %c, %r1
@@ -15,12 +17,13 @@
   ret i32 %r3
 }
 
-; CHECK-LABEL: test_floats:
-; CHECK: addl    %edx, %eax
-; CHECK-NEXT: imull   %ecx, %eax
-; CHECK-NEXT: addl    4(%esp), %eax
-; CHECK-NEXT: retl
 define i32 @test_floats(i32 %a, i32 %b, float %c, float %d) #0 {
+; CHECK-LABEL: test_floats:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    imull %ecx, %eax
+; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    retl
 entry:
   %ci = bitcast float %c to i32
   %di = bitcast float %d to i32
@@ -30,11 +33,12 @@
   ret i32 %r3
 }
 
-; CHECK-LABEL: test_doubles:
-; CHECK: addl    4(%esp), %eax
-; CHECK-NEXT: adcl    8(%esp), %edx
-; CHECK-NEXT: retl
 define double @test_doubles(double %d1, double %d2) #0 {
+; CHECK-LABEL: test_doubles:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    retl
 entry:
     %d1i = bitcast double %d1 to i64
     %d2i = bitcast double %d2 to i64
@@ -43,11 +47,12 @@
     ret double %rd
 }
 
-; CHECK-LABEL: test_mixed_doubles:
-; CHECK: addl    %ecx, %eax
-; CHECK-NEXT: adcl    $0, %edx
-; CHECK-NEXT: retl
 define double @test_mixed_doubles(double %d2, i32 %i) #0 {
+; CHECK-LABEL: test_mixed_doubles:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    adcl $0, %edx
+; CHECK-NEXT:    retl
 entry:
     %iext = zext i32 %i to i64
     %d2i = bitcast double %d2 to i64
@@ -56,17 +61,17 @@
     ret double %rd
 }
 
-; CHECK-LABEL: ret_large_struct:
-; CHECK: pushl   %esi
-; CHECK-NEXT: movl    %eax, %esi
-; CHECK-NEXT: leal    8(%esp), %edx
-; CHECK-NEXT: movl    $48, %ecx
-; CHECK-NEXT: calll   memcpy
-; CHECK-NEXT: movl    %esi, %eax
-; CHECK-NEXT: popl    %esi
-; CHECK-NOT:  retl $4
-; CHECK-NEXT: retl
 define void @ret_large_struct(%struct.st12_t* noalias nocapture sret %agg.result, %struct.st12_t* byval nocapture readonly align 4 %r) #0 {
+; CHECK-LABEL: ret_large_struct:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    movl %eax, %esi
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl $48, %ecx
+; CHECK-NEXT:    calll memcpy
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    retl{{$}}
 entry:
   %0 = bitcast %struct.st12_t* %agg.result to i8*
   %1 = bitcast %struct.st12_t* %r to i8*
@@ -74,31 +79,37 @@
   ret void
 }
 
-; CHECK-LABEL: var_args:
-; CHECK: movl    4(%esp), %eax
-; CHECK-NEXT: retl
 define i32 @var_args(i32 %i1, ...) #0 {
+; CHECK-LABEL: var_args:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    retl
 entry:
   ret i32 %i1
 }
 
 %struct.S = type { i8 }
 
-; CHECK-LABEL: test_lib_args:
-; CHECK: movl %edx, %eax
-; CHECK: calll __fixsfsi
 define i32 @test_lib_args(float %a, float %b) #0 {
+; CHECK-LABEL: test_lib_args:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    calll __fixsfsi
+; CHECK-NEXT:    retl
   %ret = fptosi float %b to i32
   ret i32 %ret
 }
 
-; CHECK-LABEL: test_fp128:
-; CHECK:      pushl   12(%eax)
-; CHECK-NEXT: pushl   8(%eax)
-; CHECK-NEXT: pushl   4(%eax)
-; CHECK-NEXT: pushl   (%eax)
-; CHECK-NEXT: calll   __fixtfsi
 define i32 @test_fp128(fp128* %ptr) #0 {
+; CHECK-LABEL: test_fp128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushl 12(%eax)
+; CHECK-NEXT:    pushl 8(%eax)
+; CHECK-NEXT:    pushl 4(%eax)
+; CHECK-NEXT:    pushl (%eax)
+; CHECK-NEXT:    calll __fixtfsi
+; CHECK-NEXT:    addl $16, %esp
+; CHECK-NEXT:    retl
   %v = load fp128, fp128* %ptr
   %ret = fptosi fp128 %v to i32
   ret i32 %ret
@@ -106,39 +117,65 @@
 
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i1) #1
 
-; CHECK-LABEL: test_alignment_d:
-; CHECK-NOT: andl  {{.+}}, %esp
 define void @test_alignment_d() #0 {
+; CHECK-LABEL: test_alignment_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subl $8, %esp
+; CHECK-NEXT:    movl $1073741824, {{[0-9]+}}(%esp) # imm = 0x40000000
+; CHECK-NEXT:    movl $0, (%esp)
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    calll food
+; CHECK-NEXT:    addl $8, %esp
+; CHECK-NEXT:    retl
 entry:
   %d = alloca double
   store double 2.000000e+00, double* %d
-  call void @food(double* inreg %d) 
+  call void @food(double* inreg %d)
   ret void
 }
 
-; CHECK-LABEL: test_alignment_i:
-; CHECK-NOT: andl  {{.+}}, %esp
 define void @test_alignment_i() #0 {
+; CHECK-LABEL: test_alignment_i:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subl $8, %esp
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $2, (%esp)
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    calll fooi
+; CHECK-NEXT:    addl $8, %esp
+; CHECK-NEXT:    retl
 entry:
   %i = alloca i64
   store i64 2, i64* %i
-  call void @fooi(i64* inreg %i) 
+  call void @fooi(i64* inreg %i)
   ret void
 }
 
-
-; CHECK-LABEL: test_alignment_s:
-; CHECK-NOT: andl  {{.+}}, %esp
 define void @test_alignment_s() #0 {
+; CHECK-LABEL: test_alignment_s:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    calll foos
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    retl
   %s = alloca %struct.S, align 4
-  call void @foos(%struct.S* inreg %s) 
+  call void @foos(%struct.S* inreg %s)
   ret void
 }
 
-
-; CHECK-LABEL: test_alignment_fp:
-; CHECK-NOT: andl  {{.+}}, %esp
 define void @test_alignment_fp() #0 {
+; CHECK-LABEL: test_alignment_fp:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subl $16, %esp
+; CHECK-NEXT:    movl $1073741824, {{[0-9]+}}(%esp) # imm = 0x40000000
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, (%esp)
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    calll foofp
+; CHECK-NEXT:    addl $16, %esp
+; CHECK-NEXT:    retl
 entry:
   %f = alloca fp128
   store fp128 0xL00000000000000004000000000000000, fp128* %f
diff --git a/llvm/test/CodeGen/X86/memcpy-struct-by-value.ll b/llvm/test/CodeGen/X86/memcpy-struct-by-value.ll
--- a/llvm/test/CodeGen/X86/memcpy-struct-by-value.ll
+++ b/llvm/test/CodeGen/X86/memcpy-struct-by-value.ll
@@ -1,7 +1,8 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i686-linux-gnu -mattr=-ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST32
+; RUN: llc -mtriple=i686-linux-gnu -mattr=+ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST32
 ; RUN: llc -mtriple=x86_64-linux-gnu -mattr=-ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST
 ; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
-; RUN: llc -mtriple=i686-linux-gnu -mattr=-ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST32
-; RUN: llc -mtriple=i686-linux-gnu -mattr=+ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
 ; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=generic < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST
 ; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=haswell < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
 ; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skylake < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
@@ -14,23 +15,117 @@
 declare void @foo(%struct.large* align 8 byval) nounwind
 
 define void @test1(%struct.large* nocapture %x) nounwind {
+; NOFAST32-LABEL: test1:
+; NOFAST32:       # %bb.0:
+; NOFAST32-NEXT:    pushl %edi
+; NOFAST32-NEXT:    pushl %esi
+; NOFAST32-NEXT:    subl $4100, %esp # imm = 0x1004
+; NOFAST32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; NOFAST32-NEXT:    movl $1024, %ecx # imm = 0x400
+; NOFAST32-NEXT:    movl %esp, %edi
+; NOFAST32-NEXT:    rep;movsl (%esi), %es:(%edi)
+; NOFAST32-NEXT:    calll foo
+; NOFAST32-NEXT:    addl $4100, %esp # imm = 0x1004
+; NOFAST32-NEXT:    popl %esi
+; NOFAST32-NEXT:    popl %edi
+; NOFAST32-NEXT:    retl
+;
+; FAST32-LABEL: test1:
+; FAST32:       # %bb.0:
+; FAST32-NEXT:    pushl %edi
+; FAST32-NEXT:    pushl %esi
+; FAST32-NEXT:    subl $4100, %esp # imm = 0x1004
+; FAST32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FAST32-NEXT:    movl $4096, %ecx # imm = 0x1000
+; FAST32-NEXT:    movl %esp, %edi
+; FAST32-NEXT:    rep;movsb (%esi), %es:(%edi)
+; FAST32-NEXT:    calll foo
+; FAST32-NEXT:    addl $4100, %esp # imm = 0x1004
+; FAST32-NEXT:    popl %esi
+; FAST32-NEXT:    popl %edi
+; FAST32-NEXT:    retl
+;
+; NOFAST-LABEL: test1:
+; NOFAST:       # %bb.0:
+; NOFAST-NEXT:    subq $4104, %rsp # imm = 0x1008
+; NOFAST-NEXT:    movq %rdi, %rsi
+; NOFAST-NEXT:    movl $512, %ecx # imm = 0x200
+; NOFAST-NEXT:    movq %rsp, %rdi
+; NOFAST-NEXT:    rep;movsq (%rsi), %es:(%rdi)
+; NOFAST-NEXT:    callq foo
+; NOFAST-NEXT:    addq $4104, %rsp # imm = 0x1008
+; NOFAST-NEXT:    retq
+;
+; FAST-LABEL: test1:
+; FAST:       # %bb.0:
+; FAST-NEXT:    subq $4104, %rsp # imm = 0x1008
+; FAST-NEXT:    movq %rdi, %rsi
+; FAST-NEXT:    movl $4096, %ecx # imm = 0x1000
+; FAST-NEXT:    movq %rsp, %rdi
+; FAST-NEXT:    rep;movsb (%rsi), %es:(%rdi)
+; FAST-NEXT:    callq foo
+; FAST-NEXT:    addq $4104, %rsp # imm = 0x1008
+; FAST-NEXT:    retq
   call void @foo(%struct.large* align 8 byval %x)
   ret void
 
-; ALL-LABEL: test1:
-; NOFAST: rep;movsq
-; NOFAST32: rep;movsl
-; FAST: rep;movsb
 }
 
 define void @test2(%struct.large* nocapture %x) nounwind minsize {
+; NOFAST32-LABEL: test2:
+; NOFAST32:       # %bb.0:
+; NOFAST32-NEXT:    pushl %edi
+; NOFAST32-NEXT:    pushl %esi
+; NOFAST32-NEXT:    subl $4100, %esp # imm = 0x1004
+; NOFAST32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; NOFAST32-NEXT:    movl $1024, %ecx # imm = 0x400
+; NOFAST32-NEXT:    movl %esp, %edi
+; NOFAST32-NEXT:    rep;movsl (%esi), %es:(%edi)
+; NOFAST32-NEXT:    calll foo
+; NOFAST32-NEXT:    addl $4100, %esp # imm = 0x1004
+; NOFAST32-NEXT:    popl %esi
+; NOFAST32-NEXT:    popl %edi
+; NOFAST32-NEXT:    retl
+;
+; FAST32-LABEL: test2:
+; FAST32:       # %bb.0:
+; FAST32-NEXT:    pushl %edi
+; FAST32-NEXT:    pushl %esi
+; FAST32-NEXT:    subl $4100, %esp # imm = 0x1004
+; FAST32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FAST32-NEXT:    movl $4096, %ecx # imm = 0x1000
+; FAST32-NEXT:    movl %esp, %edi
+; FAST32-NEXT:    rep;movsb (%esi), %es:(%edi)
+; FAST32-NEXT:    calll foo
+; FAST32-NEXT:    addl $4100, %esp # imm = 0x1004
+; FAST32-NEXT:    popl %esi
+; FAST32-NEXT:    popl %edi
+; FAST32-NEXT:    retl
+;
+; NOFAST-LABEL: test2:
+; NOFAST:       # %bb.0:
+; NOFAST-NEXT:    subq $4104, %rsp # imm = 0x1008
+; NOFAST-NEXT:    movq %rdi, %rsi
+; NOFAST-NEXT:    movl $512, %ecx # imm = 0x200
+; NOFAST-NEXT:    movq %rsp, %rdi
+; NOFAST-NEXT:    rep;movsq (%rsi), %es:(%rdi)
+; NOFAST-NEXT:    callq foo
+; NOFAST-NEXT:    addq $4104, %rsp # imm = 0x1008
+; NOFAST-NEXT:    retq
+;
+; FAST-LABEL: test2:
+; FAST:       # %bb.0:
+; FAST-NEXT:    subq $4104, %rsp # imm = 0x1008
+; FAST-NEXT:    movq %rdi, %rsi
+; FAST-NEXT:    movl $4096, %ecx # imm = 0x1000
+; FAST-NEXT:    movq %rsp, %rdi
+; FAST-NEXT:    rep;movsb (%rsi), %es:(%rdi)
+; FAST-NEXT:    callq foo
+; FAST-NEXT:    addq $4104, %rsp # imm = 0x1008
+; FAST-NEXT:    retq
   call void @foo(%struct.large* align 8 byval %x)
   ret void
 
-; ALL-LABEL: test2:
-; NOFAST: rep;movsq
-; NOFAST32: rep;movsl
-; FAST: rep;movsb
 }
 
 %struct.large_oddsize = type { [4095 x i8] }
@@ -38,11 +133,58 @@
 declare void @foo_oddsize(%struct.large_oddsize* align 8 byval) nounwind
 
 define void @test3(%struct.large_oddsize* nocapture %x) nounwind minsize {
+; NOFAST32-LABEL: test3:
+; NOFAST32:       # %bb.0:
+; NOFAST32-NEXT:    pushl %edi
+; NOFAST32-NEXT:    pushl %esi
+; NOFAST32-NEXT:    subl $4100, %esp # imm = 0x1004
+; NOFAST32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; NOFAST32-NEXT:    movl $4095, %ecx # imm = 0xFFF
+; NOFAST32-NEXT:    movl %esp, %edi
+; NOFAST32-NEXT:    rep;movsb (%esi), %es:(%edi)
+; NOFAST32-NEXT:    calll foo_oddsize
+; NOFAST32-NEXT:    addl $4100, %esp # imm = 0x1004
+; NOFAST32-NEXT:    popl %esi
+; NOFAST32-NEXT:    popl %edi
+; NOFAST32-NEXT:    retl
+;
+; FAST32-LABEL: test3:
+; FAST32:       # %bb.0:
+; FAST32-NEXT:    pushl %edi
+; FAST32-NEXT:    pushl %esi
+; FAST32-NEXT:    subl $4100, %esp # imm = 0x1004
+; FAST32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FAST32-NEXT:    movl $4095, %ecx # imm = 0xFFF
+; FAST32-NEXT:    movl %esp, %edi
+; FAST32-NEXT:    rep;movsb (%esi), %es:(%edi)
+; FAST32-NEXT:    calll foo_oddsize
+; FAST32-NEXT:    addl $4100, %esp # imm = 0x1004
+; FAST32-NEXT:    popl %esi
+; FAST32-NEXT:    popl %edi
+; FAST32-NEXT:    retl
+;
+; NOFAST-LABEL: test3:
+; NOFAST:       # %bb.0:
+; NOFAST-NEXT:    subq $4104, %rsp # imm = 0x1008
+; NOFAST-NEXT:    movq %rdi, %rsi
+; NOFAST-NEXT:    movl $4095, %ecx # imm = 0xFFF
+; NOFAST-NEXT:    movq %rsp, %rdi
+; NOFAST-NEXT:    rep;movsb (%rsi), %es:(%rdi)
+; NOFAST-NEXT:    callq foo_oddsize
+; NOFAST-NEXT:    addq $4104, %rsp # imm = 0x1008
+; NOFAST-NEXT:    retq
+;
+; FAST-LABEL: test3:
+; FAST:       # %bb.0:
+; FAST-NEXT:    subq $4104, %rsp # imm = 0x1008
+; FAST-NEXT:    movq %rdi, %rsi
+; FAST-NEXT:    movl $4095, %ecx # imm = 0xFFF
+; FAST-NEXT:    movq %rsp, %rdi
+; FAST-NEXT:    rep;movsb (%rsi), %es:(%rdi)
+; FAST-NEXT:    callq foo_oddsize
+; FAST-NEXT:    addq $4104, %rsp # imm = 0x1008
+; FAST-NEXT:    retq
   call void @foo_oddsize(%struct.large_oddsize* align 8 byval %x)
   ret void
 
-; ALL-LABEL: test3:
-; NOFAST: rep;movsb
-; NOFAST32: rep;movsb
-; FAST: rep;movsb
 }
diff --git a/llvm/test/CodeGen/X86/memcpy.ll b/llvm/test/CodeGen/X86/memcpy.ll
--- a/llvm/test/CodeGen/X86/memcpy.ll
+++ b/llvm/test/CodeGen/X86/memcpy.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s -check-prefix=LINUX
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=DARWIN
 
@@ -7,24 +8,32 @@
 
 ; Variable memcpy's should lower to calls.
 define i8* @test1(i8* %a, i8* %b, i64 %n) nounwind {
+; LINUX-LABEL: test1:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    jmp memcpy # TAILCALL
+;
+; DARWIN-LABEL: test1:
+; DARWIN:       ## %bb.0: ## %entry
+; DARWIN-NEXT:    jmp _memcpy ## TAILCALL
 entry:
 	tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 %n, i1 0 )
 	ret i8* %a
-        
-; LINUX-LABEL: test1:
-; LINUX: memcpy
 }
 
 ; Variable memcpy's should lower to calls.
 define i8* @test2(i64* %a, i64* %b, i64 %n) nounwind {
+; LINUX-LABEL: test2:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    jmp memcpy # TAILCALL
+;
+; DARWIN-LABEL: test2:
+; DARWIN:       ## %bb.0: ## %entry
+; DARWIN-NEXT:    jmp _memcpy ## TAILCALL
 entry:
 	%tmp14 = bitcast i64* %a to i8*
 	%tmp25 = bitcast i64* %b to i8*
 	tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %tmp14, i8* align 8 %tmp25, i64 %n, i1 0 )
 	ret i8* %tmp14
-        
-; LINUX-LABEL: test2:
-; LINUX: memcpy
 }
 
 ; Large constant memcpy's should lower to a call when optimizing for size.
@@ -34,83 +43,135 @@
 ; hurting performance so it should just ignore optsize when expanding memcpy.
 ; rdar://8821501
 define void @test3(i8* nocapture %A, i8* nocapture %B) nounwind optsize noredzone {
+; LINUX-LABEL: test3:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    movl $64, %edx
+; LINUX-NEXT:    jmp memcpy # TAILCALL
+;
+; DARWIN-LABEL: test3:
+; DARWIN:       ## %bb.0: ## %entry
+; DARWIN-NEXT:    movq 56(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 56(%rdi)
+; DARWIN-NEXT:    movq 48(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 48(%rdi)
+; DARWIN-NEXT:    movq 40(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 40(%rdi)
+; DARWIN-NEXT:    movq 32(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 32(%rdi)
+; DARWIN-NEXT:    movq 24(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 24(%rdi)
+; DARWIN-NEXT:    movq 16(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 16(%rdi)
+; DARWIN-NEXT:    movq (%rsi), %rax
+; DARWIN-NEXT:    movq 8(%rsi), %rcx
+; DARWIN-NEXT:    movq %rcx, 8(%rdi)
+; DARWIN-NEXT:    movq %rax, (%rdi)
+; DARWIN-NEXT:    retq
 entry:
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false)
   ret void
-; LINUX-LABEL: test3:
-; LINUX: memcpy
-
-; DARWIN-LABEL: test3:
-; DARWIN-NOT: memcpy
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
-; DARWIN: movq
 }
 
 define void @test3_minsize(i8* nocapture %A, i8* nocapture %B) nounwind minsize noredzone {
-  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false)
-  ret void
 ; LINUX-LABEL: test3_minsize:
-; LINUX: memcpy
-
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    pushq $64
+; LINUX-NEXT:    popq %rdx
+; LINUX-NEXT:    jmp memcpy # TAILCALL
+;
 ; DARWIN-LABEL: test3_minsize:
-; DARWIN: memcpy
+; DARWIN:       ## %bb.0:
+; DARWIN-NEXT:    pushq $64
+; DARWIN-NEXT:    popq %rdx
+; DARWIN-NEXT:    jmp _memcpy ## TAILCALL
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false)
+  ret void
 }
 
 define void @test3_minsize_optsize(i8* nocapture %A, i8* nocapture %B) nounwind optsize minsize noredzone {
-  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false)
-  ret void
 ; LINUX-LABEL: test3_minsize_optsize:
-; LINUX: memcpy
-
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    pushq $64
+; LINUX-NEXT:    popq %rdx
+; LINUX-NEXT:    jmp memcpy # TAILCALL
+;
 ; DARWIN-LABEL: test3_minsize_optsize:
-; DARWIN: memcpy
+; DARWIN:       ## %bb.0:
+; DARWIN-NEXT:    pushq $64
+; DARWIN-NEXT:    popq %rdx
+; DARWIN-NEXT:    jmp _memcpy ## TAILCALL
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false)
+  ret void
 }
 
 ; Large constant memcpy's should be inlined when not optimizing for size.
 define void @test4(i8* nocapture %A, i8* nocapture %B) nounwind noredzone {
+; LINUX-LABEL: test4:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    movq 56(%rsi), %rax
+; LINUX-NEXT:    movq %rax, 56(%rdi)
+; LINUX-NEXT:    movq 48(%rsi), %rax
+; LINUX-NEXT:    movq %rax, 48(%rdi)
+; LINUX-NEXT:    movq 40(%rsi), %rax
+; LINUX-NEXT:    movq %rax, 40(%rdi)
+; LINUX-NEXT:    movq 32(%rsi), %rax
+; LINUX-NEXT:    movq %rax, 32(%rdi)
+; LINUX-NEXT:    movq 24(%rsi), %rax
+; LINUX-NEXT:    movq %rax, 24(%rdi)
+; LINUX-NEXT:    movq 16(%rsi), %rax
+; LINUX-NEXT:    movq %rax, 16(%rdi)
+; LINUX-NEXT:    movq (%rsi), %rax
+; LINUX-NEXT:    movq 8(%rsi), %rcx
+; LINUX-NEXT:    movq %rcx, 8(%rdi)
+; LINUX-NEXT:    movq %rax, (%rdi)
+; LINUX-NEXT:    retq
+;
+; DARWIN-LABEL: test4:
+; DARWIN:       ## %bb.0: ## %entry
+; DARWIN-NEXT:    movq 56(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 56(%rdi)
+; DARWIN-NEXT:    movq 48(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 48(%rdi)
+; DARWIN-NEXT:    movq 40(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 40(%rdi)
+; DARWIN-NEXT:    movq 32(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 32(%rdi)
+; DARWIN-NEXT:    movq 24(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 24(%rdi)
+; DARWIN-NEXT:    movq 16(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 16(%rdi)
+; DARWIN-NEXT:    movq (%rsi), %rax
+; DARWIN-NEXT:    movq 8(%rsi), %rcx
+; DARWIN-NEXT:    movq %rcx, 8(%rdi)
+; DARWIN-NEXT:    movq %rax, (%rdi)
+; DARWIN-NEXT:    retq
 entry:
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false)
   ret void
-; LINUX-LABEL: test4:
-; LINUX: movq
-; LINUX: movq
-; LINUX: movq
-; LINUX: movq
-; LINUX: movq
-; LINUX: movq
-; LINUX: movq
-; LINUX: movq
-; LINUX: movq
-; LINUX: movq
-; LINUX: movq
-; LINUX: movq
 }
 
 
 @.str = private unnamed_addr constant [30 x i8] c"\00aaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1
 
 define void @test5(i8* nocapture %C) nounwind uwtable ssp {
+; LINUX-LABEL: test5:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    movabsq $7016996765293437281, %rax # imm = 0x6161616161616161
+; LINUX-NEXT:    movq %rax, 8(%rdi)
+; LINUX-NEXT:    movabsq $7016996765293437184, %rax # imm = 0x6161616161616100
+; LINUX-NEXT:    movq %rax, (%rdi)
+; LINUX-NEXT:    retq
+;
+; DARWIN-LABEL: test5:
+; DARWIN:       ## %bb.0: ## %entry
+; DARWIN-NEXT:    movabsq $7016996765293437281, %rax ## imm = 0x6161616161616161
+; DARWIN-NEXT:    movq %rax, 8(%rdi)
+; DARWIN-NEXT:    movabsq $7016996765293437184, %rax ## imm = 0x6161616161616100
+; DARWIN-NEXT:    movq %rax, (%rdi)
+; DARWIN-NEXT:    retq
 entry:
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str, i64 0, i64 0), i64 16, i1 false)
   ret void
-
-; DARWIN-LABEL: test5:
-; DARWIN: movabsq	$7016996765293437281
-; DARWIN: movabsq	$7016996765293437184
 }
 
 
@@ -118,10 +179,18 @@
 @.str2 = private unnamed_addr constant [2 x i8] c"x\00", align 1
 
 define void @test6() nounwind uwtable {
+; LINUX-LABEL: test6:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    movw $0, 8
+; LINUX-NEXT:    movq $120, 0
+; LINUX-NEXT:    retq
+;
+; DARWIN-LABEL: test6:
+; DARWIN:       ## %bb.0: ## %entry
+; DARWIN-NEXT:    movw $0, 8
+; DARWIN-NEXT:    movq $120, 0
+; DARWIN-NEXT:    retq
 entry:
-; DARWIN: test6
-; DARWIN: movw $0, 8
-; DARWIN: movq $120, 0
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* null, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str2, i64 0, i64 0), i64 10, i1 false)
   ret void
 }
@@ -129,13 +198,25 @@
 define void @PR15348(i8* %a, i8* %b) {
 ; Ensure that alignment of '0' in an @llvm.memcpy intrinsic results in
 ; unaligned loads and stores.
-; LINUX: PR15348
-; LINUX: movb
-; LINUX: movb
-; LINUX: movq
-; LINUX: movq
-; LINUX: movq
-; LINUX: movq
+; LINUX-LABEL: PR15348:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    movb 16(%rsi), %al
+; LINUX-NEXT:    movb %al, 16(%rdi)
+; LINUX-NEXT:    movq (%rsi), %rax
+; LINUX-NEXT:    movq 8(%rsi), %rcx
+; LINUX-NEXT:    movq %rcx, 8(%rdi)
+; LINUX-NEXT:    movq %rax, (%rdi)
+; LINUX-NEXT:    retq
+;
+; DARWIN-LABEL: PR15348:
+; DARWIN:       ## %bb.0:
+; DARWIN-NEXT:    movb 16(%rsi), %al
+; DARWIN-NEXT:    movb %al, 16(%rdi)
+; DARWIN-NEXT:    movq (%rsi), %rax
+; DARWIN-NEXT:    movq 8(%rsi), %rcx
+; DARWIN-NEXT:    movq %rcx, 8(%rdi)
+; DARWIN-NEXT:    movq %rax, (%rdi)
+; DARWIN-NEXT:    retq
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 17, i1 false)
   ret void
 }
@@ -143,11 +224,21 @@
 ; Memcpys from / to address space 256 should be lowered to appropriate loads /
 ; stores if small enough.
 define void @addrspace256(i8 addrspace(256)* %a, i8 addrspace(256)* %b) nounwind {
+; LINUX-LABEL: addrspace256:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    movq %gs:(%rsi), %rax
+; LINUX-NEXT:    movq %gs:8(%rsi), %rcx
+; LINUX-NEXT:    movq %rcx, %gs:8(%rdi)
+; LINUX-NEXT:    movq %rax, %gs:(%rdi)
+; LINUX-NEXT:    retq
+;
+; DARWIN-LABEL: addrspace256:
+; DARWIN:       ## %bb.0:
+; DARWIN-NEXT:    movq %gs:(%rsi), %rax
+; DARWIN-NEXT:    movq %gs:8(%rsi), %rcx
+; DARWIN-NEXT:    movq %rcx, %gs:8(%rdi)
+; DARWIN-NEXT:    movq %rax, %gs:(%rdi)
+; DARWIN-NEXT:    retq
   tail call void @llvm.memcpy.p256i8.p256i8.i64(i8 addrspace(256)* align 8 %a, i8 addrspace(256)* align 8 %b, i64 16, i1 false)
   ret void
-; LINUX-LABEL: addrspace256:
-; LINUX: movq %gs:
-; LINUX: movq %gs:
-; LINUX: movq {{.*}}, %gs:
-; LINUX: movq {{.*}}, %gs:
 }
diff --git a/llvm/test/CodeGen/X86/stack-align-memcpy.ll b/llvm/test/CodeGen/X86/stack-align-memcpy.ll
--- a/llvm/test/CodeGen/X86/stack-align-memcpy.ll
+++ b/llvm/test/CodeGen/X86/stack-align-memcpy.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -stackrealign -mtriple i386-apple-darwin -mcpu=i486 | FileCheck %s
 
 %struct.foo = type { [88 x i8] }
@@ -8,36 +9,229 @@
 ; PR15249
 ; We can't use rep;movsl here because it clobbers the base pointer in %esi.
 define void @test1(%struct.foo* nocapture %x, i32 %y) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    andl $-16, %esp
+; CHECK-NEXT:    subl $80, %esp
+; CHECK-NEXT:    movl %esp, %esi
+; CHECK-NEXT:    movl 8(%ebp), %ecx
+; CHECK-NEXT:    movl 12(%ebp), %edx
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    addl $15, %edx
+; CHECK-NEXT:    andl $-16, %edx
+; CHECK-NEXT:    subl %edx, %eax
+; CHECK-NEXT:    movl %eax, %esp
+; CHECK-NEXT:    subl $4, %esp
+; CHECK-NEXT:    movl 84(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 68(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 80(%ecx), %edi
+; CHECK-NEXT:    movl 76(%ecx), %ebx
+; CHECK-NEXT:    movl 72(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 64(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 68(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 60(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 64(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 56(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 60(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 52(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 56(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 48(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 52(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 44(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 48(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 40(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 44(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 36(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 40(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 32(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 36(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 28(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 32(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 24(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 28(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 20(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 24(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 16(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 20(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 12(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 16(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 8(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 12(%ecx), %edx
+; CHECK-NEXT:    movl %edx, 4(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 8(%ecx), %edx
+; CHECK-NEXT:    movl %edx, (%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl (%ecx), %edx
+; CHECK-NEXT:    movl %edx, 72(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 4(%ecx), %ecx
+; CHECK-NEXT:    pushl 68(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl 64(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 60(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 56(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 52(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 48(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 44(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 40(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 36(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 32(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 28(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 24(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 20(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 16(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 12(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 8(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 4(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl (%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl %ecx
+; CHECK-NEXT:    pushl 72(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    calll _bar
+; CHECK-NEXT:    leal -12(%ebp), %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
   %dynalloc = alloca i8, i32 %y, align 1
   call void @bar(i8* %dynalloc, %struct.foo* align 4 byval %x)
   ret void
-
-; CHECK-LABEL: test1:
-; CHECK: andl $-16, %esp
-; CHECK: movl %esp, %esi
-; CHECK-NOT: rep;movsl
 }
 
 ; PR19012
 ; Also don't clobber %esi if the dynamic alloca comes after the memcpy.
 define void @test2(%struct.foo* nocapture %x, i32 %y, i8* %z) nounwind {
+; CHECK-LABEL: test2:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    andl $-16, %esp
+; CHECK-NEXT:    subl $80, %esp
+; CHECK-NEXT:    movl %esp, %esi
+; CHECK-NEXT:    movl 12(%ebp), %edi
+; CHECK-NEXT:    movl 8(%ebp), %eax
+; CHECK-NEXT:    subl $4, %esp
+; CHECK-NEXT:    movl 84(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 68(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 80(%eax), %edx
+; CHECK-NEXT:    movl 76(%eax), %ebx
+; CHECK-NEXT:    movl 72(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 64(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 68(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 60(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 64(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 56(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 60(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 52(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 56(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 48(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 52(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 44(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 48(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 40(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 44(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 36(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 40(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 32(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 36(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 28(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 32(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 24(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 28(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 20(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 24(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 16(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 20(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 12(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 16(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 8(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 12(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 4(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 8(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, (%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl (%eax), %ecx
+; CHECK-NEXT:    movl %ecx, 72(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 4(%eax), %eax
+; CHECK-NEXT:    pushl 68(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl 64(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 60(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 56(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 52(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 48(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 44(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 40(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 36(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 32(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 28(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 24(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 20(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 16(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 12(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 8(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 4(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl (%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    pushl 72(%esi) ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl 16(%ebp)
+; CHECK-NEXT:    calll _bar
+; CHECK-NEXT:    addl $96, %esp
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    addl $15, %edi
+; CHECK-NEXT:    andl $-16, %edi
+; CHECK-NEXT:    subl %edi, %eax
+; CHECK-NEXT:    movl %eax, %esp
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    calll _baz
+; CHECK-NEXT:    leal -12(%ebp), %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
   call void @bar(i8* %z, %struct.foo* align 4 byval %x)
   %dynalloc = alloca i8, i32 %y, align 1
   call void @baz(i8* %dynalloc)
   ret void
-
-; CHECK-LABEL: test2:
-; CHECK: movl %esp, %esi
-; CHECK-NOT: rep;movsl
 }
 
 ; Check that we do use rep movs if we make the alloca static.
 define void @test3(%struct.foo* nocapture %x, i32 %y, i8* %z) nounwind {
+; CHECK-LABEL: test3:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    andl $-16, %esp
+; CHECK-NEXT:    subl $112, %esp
+; CHECK-NEXT:    movl 16(%ebp), %eax
+; CHECK-NEXT:    movl 8(%ebp), %esi
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl $22, %ecx
+; CHECK-NEXT:    rep;movsl (%esi), %es:(%edi)
+; CHECK-NEXT:    movl %eax, (%esp)
+; CHECK-NEXT:    calll _bar
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, (%esp)
+; CHECK-NEXT:    calll _baz
+; CHECK-NEXT:    leal -8(%ebp), %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
   call void @bar(i8* %z, %struct.foo* align 4 byval %x)
   %statalloc = alloca i8, i32 8, align 1
   call void @baz(i8* %statalloc)
   ret void
-
-; CHECK-LABEL: test3:
-; CHECK: rep;movsl
 }