Index: lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCalls.cpp +++ lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -182,8 +182,6 @@ return MI; } - // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with - // load/store. ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2)); if (!MemOpLength) return nullptr; @@ -194,8 +192,16 @@ uint64_t Size = MemOpLength->getLimitedValue(); assert(Size && "0-sized memory transferring should be removed already."); - if (Size > 8 || (Size&(Size-1))) - return nullptr; // If not 1/2/4/8 bytes, exit. + + // Since we don't have perfect knowledge here, make some assumptions: assume + // the maximum number of bytes we can expand inline is twice the width (in + // bytes) of the largest legal integer type. + unsigned LargestInt = DL.getLargestLegalIntTypeSizeInBits(); + if (LargestInt == 0) + LargestInt = 32; + + if (Size > 2*LargestInt/8 || (Size&(Size-1))) + return nullptr; // Use an integer load+store unless we can find something better. unsigned SrcAddrSp = Index: test/DebugInfo/X86/array2.ll =================================================================== --- test/DebugInfo/X86/array2.ll +++ test/DebugInfo/X86/array2.ll @@ -16,9 +16,8 @@ ; Test that we correctly lower dbg.declares for arrays.
; ; CHECK: define i32 @main -; CHECK: call void @llvm.dbg.value(metadata i32 42, i64 0, metadata ![[ARRAY:[0-9]+]], metadata ![[EXPR:[0-9]+]]) -; CHECK: ![[ARRAY]] = !DILocalVariable(name: "array",{{.*}} line: 6 -; CHECK: ![[EXPR]] = !DIExpression(DW_OP_LLVM_fragment, 0, 32) +; CHECK: tail call void @llvm.dbg.value(metadata i32 [[ARGC:%.*]], i64 0, metadata !22, metadata !12), !dbg !23 +; CHECK: tail call void @llvm.dbg.value(metadata i8** [[ARGV:%.*]], i64 0, metadata !24, metadata !12), !dbg !23 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" Index: test/Transforms/InstCombine/builtin_memcpy_patterns.ll =================================================================== --- /dev/null +++ test/Transforms/InstCombine/builtin_memcpy_patterns.ll @@ -0,0 +1,181 @@ +; RUN: opt -instcombine < %s -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@b = local_unnamed_addr global [16 x i32] [i32 3, i32 4, i32 5, i32 56, i32 6, i32 7, i32 78, i32 6, i32 3, i32 4, i32 54, i32 5, i32 1, i32 2, i32 3, i32 3], align 16 + +;This test checks if the builtin memcpy is converted to memcpy or to load and store operations + +;The function foo has a builtin memcpy of size 16 +; Function Attrs: nounwind uwtable +define void @foo(i8* %a, i8* %b) local_unnamed_addr #0 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to i128* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[A:%.*]] to i128* +; CHECK-NEXT: [[TMP2:%.*]] = load i128, i128* [[TMP0]], align 1 +; CHECK-NEXT: store i128 [[TMP2]], i128* [[TMP1]], align 1 +; CHECK-NEXT: ret void +; +entry: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 16, i32 1, i1 false) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1 + +;The function foo1 has loop 
which has a builtin memcpy of size 8 + +; Function Attrs: nounwind uwtable +define void @foo1(i32* %a, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @foo1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_0]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[I_0]], 1 +; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[MUL]] to i64 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* @b, i64 0, i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ADD_PTR3]] to i64* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[ADD_PTR]] to i64* +; CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]], align 8 +; CHECK-NEXT: store i64 [[TMP2]], i64* [[TMP1]], align 4 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label [[FOR_COND]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp ult i32 %i.0, %n + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul nsw i32 2, %i.0 + %idx.ext = sext i32 %mul to i64 + %add.ptr = getelementptr inbounds i32, i32* %a, i64 %idx.ext + %0 = bitcast i32* %add.ptr to i8* + %add.ptr3 = getelementptr inbounds i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b, i32 0, i32 0), i64 %idx.ext + %1 = bitcast i32* %add.ptr3 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 8, i32 4, i1 false) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + + +;The function foo2 has loop which has a builtin memcpy of size 16 + +; Function 
Attrs: nounwind uwtable +define void @foo2(i32* %a, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @foo2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_0]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[I_0]], 2 +; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[MUL]] to i64 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* @b, i64 0, i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ADD_PTR3]] to i128* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[ADD_PTR]] to i128* +; CHECK-NEXT: [[TMP2:%.*]] = load i128, i128* [[TMP0]], align 16 +; CHECK-NEXT: store i128 [[TMP2]], i128* [[TMP1]], align 4 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label [[FOR_COND]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp ult i32 %i.0, %n + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul nsw i32 4, %i.0 + %idx.ext = sext i32 %mul to i64 + %add.ptr = getelementptr inbounds i32, i32* %a, i64 %idx.ext + %0 = bitcast i32* %add.ptr to i8* + %add.ptr3 = getelementptr inbounds i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b, i32 0, i32 0), i64 %idx.ext + %1 = bitcast i32* %add.ptr3 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + + +;The function foo3 has loop which has a builtin memcpy of size 32 + +; Function Attrs: nounwind uwtable +define void @foo3(i32* 
%a, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @foo3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_0]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[I_0]], 3 +; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[MUL]] to i64 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ADD_PTR]] to i8* +; CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* @b, i64 0, i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[ADD_PTR3]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 4, i1 false) +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label [[FOR_COND]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp ult i32 %i.0, %n + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul nsw i32 8, %i.0 + %idx.ext = sext i32 %mul to i64 + %add.ptr = getelementptr inbounds i32, i32* %a, i64 %idx.ext + %0 = bitcast i32* %add.ptr to i8* + %add.ptr3 = getelementptr inbounds i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b, i32 0, i32 0), i64 %idx.ext + %1 = bitcast i32* %add.ptr3 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 32, i32 4, i1 false) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" 
"no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (https://github.com/llvm-mirror/clang.git 6cfb5bf41823be28bca09fe72dd3d4b83f4e1be8) (https://github.com/llvm-mirror/llvm.git 8708d57bbe53f61feb4630e0ac50fb938dd9a33b)"} Index: test/Transforms/InstCombine/memcpy-to-load.ll =================================================================== --- test/Transforms/InstCombine/memcpy-to-load.ll +++ test/Transforms/InstCombine/memcpy-to-load.ll @@ -77,9 +77,36 @@ } define void @copy_16_bytes(i8* %d, i8* %s) { -; ALL-LABEL: @copy_16_bytes( -; ALL-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[D:%.*]], i8* [[S:%.*]], i32 16, i32 1, i1 false) -; ALL-NEXT: ret void + +; If there is no datalayout, then all memcpy of size less than 16 (and power-of-2) will be expanded inline with load/store + +; NODL-LABEL: @copy_16_bytes( +; NODL-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[D:%.*]], i8* [[S:%.*]], i32 16, i32 1, i1 false) +; NODL-NEXT: ret void +; +; For datalayout with largest legal integer type size of 32, all memcpy with size less than 16 (and power-of-2) will be expanded inline with load/store +; +; I32-LABEL: @copy_16_bytes( +; I32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[D:%.*]], i8* [[S:%.*]], i32 16, i32 1, i1 false) +; I32-NEXT: ret void +; +; For datalayout with largest legal integer type size of 64, all memcpy with size less than 32 (and power-of-2) will be expanded inline with load/store +; +; I64-LABEL: @copy_16_bytes( +; I64-NEXT: [[TMP1:%.*]] = bitcast i8* [[S:%.*]] to i128* +; I64-NEXT: [[TMP2:%.*]] = bitcast i8* [[D:%.*]] to i128* +; I64-NEXT: [[TMP3:%.*]] = load i128, i128* 
[[TMP1]], align 1 +; I64-NEXT: store i128 [[TMP3]], i128* [[TMP2]], align 1 +; I64-NEXT: ret void +; +; For datalayout with largest legal integer type size of 128, all memcpy with size less than 64 (and power-of-2) will be expanded inline with load/store +; +; I128-LABEL: @copy_16_bytes( +; I128-NEXT: [[TMP1:%.*]] = bitcast i8* [[S:%.*]] to i128* +; I128-NEXT: [[TMP2:%.*]] = bitcast i8* [[D:%.*]] to i128* +; I128-NEXT: [[TMP3:%.*]] = load i128, i128* [[TMP1]], align 1 +; I128-NEXT: store i128 [[TMP3]], i128* [[TMP2]], align 1 +; I128-NEXT: ret void ; call void @llvm.memcpy.p0i8.p0i8.i32(i8* %d, i8* %s, i32 16, i32 1, i1 false) ret void