Index: lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCalls.cpp +++ lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -182,8 +182,6 @@ return MI; } - // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with - // load/store. ConstantInt *MemOpLength = dyn_cast(MI->getArgOperand(2)); if (!MemOpLength) return nullptr; @@ -194,8 +192,16 @@ uint64_t Size = MemOpLength->getLimitedValue(); assert(Size && "0-sized memory transferring should be removed already."); - if (Size > 8 || (Size&(Size-1))) - return nullptr; // If not 1/2/4/8 bytes, exit. + + // Since we don't have perfect knowledge here, make some assumptions: assume + // a power-of-two sized memcpy of at most twice the largest legal integer + // width can be replaced by a single integer load/store pair. + unsigned LargestInt = DL.getLargestLegalIntTypeSizeInBits(); + if (LargestInt == 0) + LargestInt = 32; + + if (Size > 2*LargestInt/8 || (Size&(Size-1))) + return nullptr; // Use an integer load+store unless we can find something better. 
unsigned SrcAddrSp = Index: output_bultin_memcpy_patterns.ll =================================================================== --- output_bultin_memcpy_patterns.ll +++ output_bultin_memcpy_patterns.ll @@ -8,7 +8,10 @@ ; Function Attrs: nounwind uwtable define void @foo(i8* %a, i8* %b) local_unnamed_addr #0 { entry: - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 16, i32 1, i1 false) + %0 = bitcast i8* %b to i128* + %1 = bitcast i8* %a to i128* + %2 = load i128, i128* %0, align 1 + store i128 %2, i128* %1, align 1 ret void } @@ -55,10 +58,11 @@ %mul = shl nsw i32 %i.0, 2 %idx.ext = sext i32 %mul to i64 %add.ptr = getelementptr inbounds i32, i32* %a, i64 %idx.ext - %0 = bitcast i32* %add.ptr to i8* %add.ptr3 = getelementptr inbounds [16 x i32], [16 x i32]* @b, i64 0, i64 %idx.ext - %1 = bitcast i32* %add.ptr3 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false) + %0 = bitcast i32* %add.ptr3 to i128* + %1 = bitcast i32* %add.ptr to i128* + %2 = load i128, i128* %0, align 16 + store i128 %2, i128* %1, align 4 %inc = add nsw i32 %i.0, 1 br label %for.cond Index: test/DebugInfo/X86/array2.ll =================================================================== --- test/DebugInfo/X86/array2.ll +++ test/DebugInfo/X86/array2.ll @@ -53,7 +53,7 @@ call void @llvm.dbg.declare(metadata i8*** %argv.addr, metadata !25, metadata !DIExpression()), !dbg !24 call void @llvm.dbg.declare(metadata [4 x i32]* %array, metadata !26, metadata !DIExpression()), !dbg !30 %0 = bitcast [4 x i32]* %array to i8*, !dbg !30 - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([4 x i32]* @main.array to i8*), i64 16, i32 16, i1 false), !dbg !30 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([4 x i32]* @main.array to i8*), i64 128, i32 16, i1 false), !dbg !30 %arraydecay = getelementptr inbounds [4 x i32], [4 x i32]* %array, i32 0, i32 0, !dbg !31 call void @f(i32* %arraydecay), !dbg !31 %arrayidx = getelementptr inbounds [4 x i32], [4 x 
i32]* %array, i32 0, i64 0, !dbg !32