Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -121,8 +121,6 @@
     return MI;
   }
 
-  // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
-  // load/store.
   ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
   if (!MemOpLength) return nullptr;
 
@@ -133,8 +131,16 @@
   uint64_t Size = MemOpLength->getLimitedValue();
   assert(Size && "0-sized memory transferring should be removed already.");
 
-  if (Size > 8 || (Size&(Size-1)))
-    return nullptr;  // If not 1/2/4/8 bytes, exit.
+  // Since we don't have perfect knowledge here, make some assumptions: assume
+  // the maximum allowed stores for memcpy operation is the same size as the
+  // largest legal integer size (LargestInt is in bits; Size is in bytes).
+  unsigned LargestInt = DL.getLargestLegalIntTypeSizeInBits();
+
+  if (LargestInt == 0)
+    LargestInt = 32;
+
+  if (Size > LargestInt / 8 || (Size&(Size-1)))
+    return nullptr;
 
   // Use an integer load+store unless we can find something better.
   unsigned SrcAddrSp =
Index: test/DebugInfo/X86/array2.ll
===================================================================
--- test/DebugInfo/X86/array2.ll
+++ test/DebugInfo/X86/array2.ll
@@ -16,7 +16,9 @@
 ; Test that we correctly lower dbg.declares for arrays.
 ;
 ; CHECK: define i32 @main
-; CHECK: call void @llvm.dbg.value(metadata i32 42, metadata ![[ARRAY:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 32))
+; CHECK: tail call void @llvm.dbg.value(metadata i32 [[ARGC:%.*]], i64 0, metadata !22, metadata !12), !dbg !23
+; CHECK: tail call void @llvm.dbg.value(metadata i8** [[ARGV:%.*]], i64 0, metadata !24, metadata !12), !dbg !23
+; CHECK: tail call void @llvm.dbg.value(metadata i32 42, metadata ![[ARRAY:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 32))
 ; CHECK: ![[ARRAY]] = !DILocalVariable(name: "array",{{.*}} line: 6
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"
Index: test/Transforms/InstCombine/2007-10-10-EliminateMemCpy.ll
===================================================================
--- test/Transforms/InstCombine/2007-10-10-EliminateMemCpy.ll
+++ test/Transforms/InstCombine/2007-10-10-EliminateMemCpy.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -instcombine -S | not grep call
 ; RUN: opt < %s -O3 -S | not grep xyz
+target triple = "x86_64-unknown-linux-gnu"
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 @.str = internal constant [4 x i8] c"xyz\00"            ; <[4 x i8]*> [#uses=1]
 
Index: test/Transforms/InstCombine/alloca.ll
===================================================================
--- test/Transforms/InstCombine/alloca.ll
+++ test/Transforms/InstCombine/alloca.ll
@@ -144,7 +144,6 @@
 entry:
   %inalloca.save = call i8* @llvm.stacksave()
   %argmem = alloca inalloca <{ %struct_type }>
-; CHECK: alloca inalloca i64, align 8
   %0 = getelementptr inbounds <{ %struct_type }>, <{ %struct_type }>* %argmem, i32 0, i32 0
   %1 = bitcast %struct_type* %0 to i8*
   %2 = bitcast %struct_type* %a to i8*
Index: test/Transforms/InstCombine/element-atomic-memintrins.ll
===================================================================
--- test/Transforms/InstCombine/element-atomic-memintrins.ll
+++ test/Transforms/InstCombine/element-atomic-memintrins.ll
@@ -97,8 +97,9 @@
 ; Check that a memmove from a global constant is converted into a memcpy
 define void @test_memmove_to_memcpy(i8* %dest) {
 ; CHECK-LABEL: @test_memmove_to_memcpy(
-; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST:%.*]], i8* align 16 getelementptr inbounds ([32 x i8], [32 x i8]* @gconst, i64 0, i64 0), i32 32, i32 1)
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    bitcast
+; CHECK-NEXT:    store atomic
+; CHECK-NEXT:    ret void
 ;
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([32 x i8], [32 x i8]* @gconst, i64 0, i64 0), i32 32, i32 1)
   ret void
@@ -146,7 +147,10 @@
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8* [[DEST]] to i64*
 ; CHECK-NEXT:    [[TMP10:%.*]] = load atomic i64, i64* [[TMP8]] unordered, align 1
 ; CHECK-NEXT:    store atomic i64 [[TMP10]], i64* [[TMP9]] unordered, align 1
-; CHECK-NEXT:    call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 16, i32 1)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT:    [[TMP13:%.*]] = load atomic i64, i64* [[TMP11]] unordered, align 1
+; CHECK-NEXT:    store atomic i64 [[TMP13]], i64* [[TMP12]] unordered, align 1
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1)
Index: test/Transforms/InstCombine/memcpy-to-load.ll
===================================================================
--- test/Transforms/InstCombine/memcpy-to-load.ll
+++ test/Transforms/InstCombine/memcpy-to-load.ll
@@ -65,22 +65,66 @@
 }
 
 define void @copy_8_bytes(i8* %d, i8* %s) {
-; ALL-LABEL: @copy_8_bytes(
-; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8* [[S:%.*]] to i64*
-; ALL-NEXT:    [[TMP2:%.*]] = bitcast i8* [[D:%.*]] to i64*
-; ALL-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]], align 1
-; ALL-NEXT:    store i64 [[TMP3]], i64* [[TMP2]], align 1
-; ALL-NEXT:    ret void
+; If there is no datalayout, then all memcpy of size less than 8 bytes (and power-of-2) will be expanded inline with load/store
+; NODL-LABEL: @copy_8_bytes(
+; NODL-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[D:%.*]], i8* align 1 [[S:%.*]], i32 8, i1 false)
+; NODL-NEXT:    ret void
+;
+; For datalayout with largest legal integer type size of 4 bytes, all memcpy with size less than 8 bytes (and power-of-2) will be expanded inline with load/store
+;
+; I32-LABEL: @copy_8_bytes(
+; I32-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[D:%.*]], i8* align 1 [[S:%.*]], i32 8, i1 false)
+; I32-NEXT:    ret void
 ;
+; For datalayout with largest legal integer type size of 8 bytes, all memcpy with size less than 16 bytes (and power-of-2) will be expanded inline with load/store
+;
+; I64-LABEL: @copy_8_bytes(
+; I64-NEXT:    [[TMP1:%.*]] = bitcast i8* [[S:%.*]] to i64*
+; I64-NEXT:    [[TMP2:%.*]] = bitcast i8* [[D:%.*]] to i64*
+; I64-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]], align 1
+; I64-NEXT:    store i64 [[TMP3]], i64* [[TMP2]], align 1
+; I64-NEXT:    ret void
+;
+; For datalayout with largest legal integer type size of 16 bytes, all memcpy with size less than 32 bytes (and power-of-2) will be expanded inline with load/store
+;
+; I128-LABEL: @copy_8_bytes(
+; I128-NEXT:    [[TMP1:%.*]] = bitcast i8* [[S:%.*]] to i64*
+; I128-NEXT:    [[TMP2:%.*]] = bitcast i8* [[D:%.*]] to i64*
+; I128-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]], align 1
+; I128-NEXT:    store i64 [[TMP3]], i64* [[TMP2]], align 1
+; I128-NEXT:    ret void
+
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %d, i8* %s, i32 8, i1 false)
   ret void
 }
 
 define void @copy_16_bytes(i8* %d, i8* %s) {
-; ALL-LABEL: @copy_16_bytes(
-; ALL-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[D:%.*]], i8* align 1 [[S:%.*]], i32 16, i1 false)
-; ALL-NEXT:    ret void
+; If there is no datalayout, then all memcpy of size less than 8 bytes (and power-of-2) will be expanded inline with load/store
+; NODL-LABEL: @copy_16_bytes(
+; NODL-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[D:%.*]], i8* align 1 [[S:%.*]], i32 16, i1 false)
+; NODL-NEXT:    ret void
+;
+; For datalayout with largest legal integer type size of 4 bytes, all memcpy with size less than 8 bytes (and power-of-2) will be expanded inline with load/store
+;
+; I32-LABEL: @copy_16_bytes(
+; I32-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[D:%.*]], i8* align 1 [[S:%.*]], i32 16, i1 false)
+; I32-NEXT:    ret void
+;
+; For datalayout with largest legal integer type size of 8 bytes, all memcpy with size less than 16 bytes (and power-of-2) will be expanded inline with load/store
 ;
+; I64-LABEL: @copy_16_bytes(
+; I64-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[D:%.*]], i8* align 1 [[S:%.*]], i32 16, i1 false)
+; I64-NEXT:    ret void
+;
+; For datalayout with largest legal integer type size of 16 bytes, all memcpy with size less than 32 bytes (and power-of-2) will be expanded inline with load/store
+;
+; I128-LABEL: @copy_16_bytes(
+; I128-NEXT:    [[TMP1:%.*]] = bitcast i8* [[S:%.*]] to i128*
+; I128-NEXT:    [[TMP2:%.*]] = bitcast i8* [[D:%.*]] to i128*
+; I128-NEXT:    [[TMP3:%.*]] = load i128, i128* [[TMP1]], align 1
+; I128-NEXT:    store i128 [[TMP3]], i128* [[TMP2]], align 1
+; I128-NEXT:    ret void
+
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %d, i8* %s, i32 16, i1 false)
   ret void
 }