Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -121,8 +121,6 @@
     return MI;
   }
 
-  // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
-  // load/store.
   ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
   if (!MemOpLength) return nullptr;
 
@@ -133,8 +131,16 @@
   uint64_t Size = MemOpLength->getLimitedValue();
   assert(Size && "0-sized memory transferring should be removed already.");
 
-  if (Size > 8 || (Size&(Size-1)))
-    return nullptr;  // If not 1/2/4/8 bytes, exit.
+  // Since we don't have perfect knowledge here, make some assumptions: assume
+  // the maximum allowed stores for memcpy operation is the same size as the
+  // largest legal integer size (LargestInt is in bits; Size is in bytes).
+  unsigned LargestInt = DL.getLargestLegalIntTypeSizeInBits();
+
+  if (LargestInt == 0)
+    LargestInt = 32;
+
+  if (Size > LargestInt / 8 || (Size&(Size-1)))
+    return nullptr;
 
   // Use an integer load+store unless we can find something better.
   unsigned SrcAddrSp =
Index: test/DebugInfo/X86/array2.ll
===================================================================
--- test/DebugInfo/X86/array2.ll
+++ test/DebugInfo/X86/array2.ll
@@ -16,7 +16,9 @@
 ; Test that we correctly lower dbg.declares for arrays.
 ;
 ; CHECK: define i32 @main
-; CHECK: call void @llvm.dbg.value(metadata i32 42, metadata ![[ARRAY:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 32))
+; CHECK: tail call void @llvm.dbg.value(metadata i32 [[ARGC:%.*]], i64 0, metadata !22, metadata !12), !dbg !23
+; CHECK: tail call void @llvm.dbg.value(metadata i8** [[ARGV:%.*]], i64 0, metadata !24, metadata !12), !dbg !23
+; CHECK: tail call void @llvm.dbg.value(metadata i32 42, metadata ![[ARRAY:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 32))
 ; CHECK: ![[ARRAY]] = !DILocalVariable(name: "array",{{.*}} line: 6
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"
Index: test/Transforms/InstCombine/2007-10-10-EliminateMemCpy.ll
===================================================================
--- test/Transforms/InstCombine/2007-10-10-EliminateMemCpy.ll
+++ test/Transforms/InstCombine/2007-10-10-EliminateMemCpy.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -instcombine -S | not grep call
 ; RUN: opt < %s -O3 -S | not grep xyz
+target triple = "x86_64-unknown-linux-gnu"
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 @.str = internal constant [4 x i8] c"xyz\00"            ; <[4 x i8]*> [#uses=1]
 
Index: test/Transforms/InstCombine/alloca.ll
===================================================================
--- test/Transforms/InstCombine/alloca.ll
+++ test/Transforms/InstCombine/alloca.ll
@@ -144,7 +144,6 @@
 entry:
   %inalloca.save = call i8* @llvm.stacksave()
   %argmem = alloca inalloca <{ %struct_type }>
-; CHECK: alloca inalloca i64, align 8
   %0 = getelementptr inbounds <{ %struct_type }>, <{ %struct_type }>* %argmem, i32 0, i32 0
   %1 = bitcast %struct_type* %0 to i8*
   %2 = bitcast %struct_type* %a to i8*
Index: test/Transforms/InstCombine/element-atomic-memintrins.ll
===================================================================
--- test/Transforms/InstCombine/element-atomic-memintrins.ll
+++ test/Transforms/InstCombine/element-atomic-memintrins.ll
@@ -97,8 +97,9 @@
 ; Check that a memmove from a global constant is converted into a memcpy
 define void @test_memmove_to_memcpy(i8* %dest) {
 ; CHECK-LABEL: @test_memmove_to_memcpy(
-; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST:%.*]], i8* align 16 getelementptr inbounds ([32 x i8], [32 x i8]* @gconst, i64 0, i64 0), i32 32, i32 1)
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    bitcast
+; CHECK-NEXT:    store atomic
+; CHECK-NEXT:    ret void
 ;
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([32 x i8], [32 x i8]* @gconst, i64 0, i64 0), i32 32, i32 1)
   ret void
@@ -146,7 +147,10 @@
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8* [[DEST]] to i64*
 ; CHECK-NEXT:    [[TMP10:%.*]] = load atomic i64, i64* [[TMP8]] unordered, align 1
 ; CHECK-NEXT:    store atomic i64 [[TMP10]], i64* [[TMP9]] unordered, align 1
-; CHECK-NEXT:    call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 16, i32 1)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT:    [[TMP13:%.*]] = load atomic i64, i64* [[TMP11]] unordered, align 1
+; CHECK-NEXT:    store atomic i64 [[TMP13]], i64* [[TMP12]] unordered, align 1
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1)
Index: test/Transforms/InstCombine/memcpy-to-load.ll
===================================================================
--- test/Transforms/InstCombine/memcpy-to-load.ll
+++ test/Transforms/InstCombine/memcpy-to-load.ll
@@ -65,22 +65,66 @@
 }
 
 define void @copy_8_bytes(i8* %d, i8* %s) {
-; ALL-LABEL: @copy_8_bytes(
-; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8* [[S:%.*]] to i64*
-; ALL-NEXT:    [[TMP2:%.*]] = bitcast i8* [[D:%.*]] to i64*
-; ALL-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]], align 1
-; ALL-NEXT:    store i64 [[TMP3]], i64* [[TMP2]], align 1
-; ALL-NEXT:    ret void
+; If there is no datalayout, then all memcpy of size less than 8 bytes (and power-of-2) will be expanded inline with load/store
+; NODL-LABEL: @copy_8_bytes(
+; NODL-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[D:%.*]], i8* align 1 [[S:%.*]], i32 8, i1 false)
+; NODL-NEXT:    ret void
+;
+; For datalayout with largest legal integer type size of 4 bytes, all memcpy with size less than 8 bytes (and power-of-2) will be expanded inline with load/store
+;
+; I32-LABEL: @copy_8_bytes(
+; I32-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[D:%.*]], i8* align 1 [[S:%.*]], i32 8, i1 false)
+; I32-NEXT:    ret void
 ;
+; For datalayout with largest legal integer type size of 8 bytes, all memcpy with size less than 16 bytes (and power-of-2) will be expanded inline with load/store
+;
+; I64-LABEL: @copy_8_bytes(
+; I64-NEXT:    [[TMP1:%.*]] = bitcast i8* [[S:%.*]] to i64*
+; I64-NEXT:    [[TMP2:%.*]] = bitcast i8* [[D:%.*]] to i64*
+; I64-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]], align 1
+; I64-NEXT:    store i64 [[TMP3]], i64* [[TMP2]], align 1
+; I64-NEXT:    ret void
+;
+; For datalayout with largest legal integer type size of 16 bytes, all memcpy with size less than 32 bytes (and power-of-2) will be expanded inline with load/store
+;
+; I128-LABEL: @copy_8_bytes(
+; I128-NEXT:    [[TMP1:%.*]] = bitcast i8* [[S:%.*]] to i64*
+; I128-NEXT:    [[TMP2:%.*]] = bitcast i8* [[D:%.*]] to i64*
+; I128-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]], align 1
+; I128-NEXT:    store i64 [[TMP3]], i64* [[TMP2]], align 1
+; I128-NEXT:    ret void
+
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %d, i8* %s, i32 8, i1 false)
   ret void
 }
 
 define void @copy_16_bytes(i8* %d, i8* %s) {
-; ALL-LABEL: @copy_16_bytes(
-; ALL-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[D:%.*]], i8* align 1 [[S:%.*]], i32 16, i1 false)
-; ALL-NEXT:    ret void
+; If there is no datalayout, then all memcpy of size less than 8 bytes (and power-of-2) will be expanded inline with load/store
+; NODL-LABEL: @copy_16_bytes(
+; NODL-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[D:%.*]], i8* align 1 [[S:%.*]], i32 16, i1 false)
+; NODL-NEXT:    ret void
+;
+; For datalayout with largest legal integer type size of 4 bytes, all memcpy with size less than 8 bytes (and power-of-2) will be expanded inline with load/store
+;
+; I32-LABEL: @copy_16_bytes(
+; I32-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[D:%.*]], i8* align 1 [[S:%.*]], i32 16, i1 false)
+; I32-NEXT:    ret void
+;
+; For datalayout with largest legal integer type size of 8 bytes, all memcpy with size less than 16 bytes (and power-of-2) will be expanded inline with load/store
 ;
+; I64-LABEL: @copy_16_bytes(
+; I64-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[D:%.*]], i8* align 1 [[S:%.*]], i32 16, i1 false)
+; I64-NEXT:    ret void
+;
+; For datalayout with largest legal integer type size of 16 bytes, all memcpy with size less than 32 bytes (and power-of-2) will be expanded inline with load/store
+;
+; I128-LABEL: @copy_16_bytes(
+; I128-NEXT:    [[TMP1:%.*]] = bitcast i8* [[S:%.*]] to i128*
+; I128-NEXT:    [[TMP2:%.*]] = bitcast i8* [[D:%.*]] to i128*
+; I128-NEXT:    [[TMP3:%.*]] = load i128, i128* [[TMP1]], align 1
+; I128-NEXT:    store i128 [[TMP3]], i128* [[TMP2]], align 1
+; I128-NEXT:    ret void
+
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %d, i8* %s, i32 16, i1 false)
   ret void
 }