diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -66,8 +66,8 @@ bool tryCombineBr(MachineInstr &MI); /// Optimize memcpy intrinsics et al, e.g. constant len calls. - /// - bool tryCombineMemCpyFamily(MachineInstr &MI); + /// /p MaxLen if non-zero specifies the max length of a mem libcall to inline. + bool tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen = 0); /// Try to transform \p MI by using all of the above /// combine functions. Returns true if changed. diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -861,7 +861,7 @@ return true; } -bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI) { +bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) { // This combine is fairly complex so it's not written with a separate // matcher function. assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); @@ -900,6 +900,9 @@ return true; } + if (MaxLen && KnownLen > MaxLen) + return false; + if (ID == Intrinsic::memcpy) return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile); if (ID == Intrinsic::memmove) diff --git a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp --- a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp @@ -56,9 +56,12 @@ case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: { + // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other + // heuristics decide. + unsigned MaxLen = EnableOpt ? 0 : 32; // Try to inline memcpy type calls if optimizations are enabled. - return (EnableOpt && !EnableOptSize) ? Helper.tryCombineMemCpyFamily(MI) - : false; + return (!EnableOptSize) ? Helper.tryCombineMemCpyFamily(MI, MaxLen) + : false; } default: break; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-small-memcpy.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-small-memcpy.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-small-memcpy.mir @@ -0,0 +1,86 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=aarch64 -run-pass=aarch64-prelegalizer-combiner -O0 -verify-machineinstrs %s -o - | FileCheck %s +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + target triple = "arm64-apple-darwin" + + declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1 immarg) #1 + + define void @test_small_memcpy(i32* nocapture %dst, i32* nocapture readonly %src) { + entry: + %0 = bitcast i32* %dst to i8* + %1 = bitcast i32* %src to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 32, i1 false) + ret void + } + + define void @test_large_memcpy(i32* nocapture %dst, i32* nocapture readonly %src) { + entry: + %0 = bitcast i32* %dst to i8* + %1 = bitcast i32* %src to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 36, i1 false) + ret void + } + + attributes #1 = { argmemonly nounwind } + +... +--- +name: test_small_memcpy +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +machineFunctionInfo: {} +body: | + bb.1.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: test_small_memcpy + ; CHECK: liveins: $x0, $x1 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[COPY1]](p0) :: (load 16 from %ir.1, align 4) + ; CHECK: G_STORE [[LOAD]](s128), [[COPY]](p0) :: (store 16 into %ir.0, align 4) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:_(s128) = G_LOAD [[GEP]](p0) :: (load 16 from %ir.1 + 16, align 4) + ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) + ; CHECK: G_STORE [[LOAD1]](s128), [[GEP1]](p0) :: (store 16 into %ir.0 + 16, align 4) + ; CHECK: RET_ReallyLR + %0:_(p0) = COPY $x0 + %1:_(p0) = COPY $x1 + %2:_(s64) = G_CONSTANT i64 32 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %2(s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + RET_ReallyLR + +... +--- +name: test_large_memcpy +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +machineFunctionInfo: {} +body: | + bb.1.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: test_large_memcpy + ; CHECK: liveins: $x0, $x1 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[COPY]](p0), [[COPY1]](p0), [[C]](s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + ; CHECK: RET_ReallyLR + %0:_(p0) = COPY $x0 + %1:_(p0) = COPY $x1 + %2:_(s64) = G_CONSTANT i64 36 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %2(s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + RET_ReallyLR + +...