Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -584,6 +584,12 @@
   return Cost;
 }
 
+int TargetTransformInfo::getMemcpyCost(const Instruction *I) const {
+  int Cost = TTIImpl->getMemcpyCost(I);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 int TargetTransformInfo::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                                     bool IsPairwiseForm) const {
   int Cost = TTIImpl->getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm);
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -148,6 +148,8 @@
     return ST->getMaxInterleaveFactor();
   }
 
+  int getMemcpyCost(const Instruction *I);
+
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
 
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -19,6 +19,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Type.h"
@@ -401,6 +402,54 @@
   return 1;
 }
 
+/// Estimate the cost of the memcpy intrinsic call \p I, which must be a
+/// MemCpyInst.
+///
+/// A copy whose length is not a compile-time constant, or a copy of 32 bytes
+/// or more where either pointer is not 4-byte aligned, is costed as a library
+/// call. Otherwise the cost is two (one load plus one store) for every memory
+/// operation chosen by findOptimalMemOpLowering.
+int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
+  // cast<> asserts on a type mismatch, so no separate null check is needed.
+  const MemCpyInst *MI = cast<MemCpyInst>(I);
+  const ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());
+
+  // This is a somewhat arbitrary number, which basically says that a libcall
+  // is more expensive than 'TCC_Expensive = 4'.
+  const unsigned LibCallCost = 6;
+
+  // If size is not a constant, a library call will be generated.
+  if (!C)
+    return LibCallCost;
+
+  // Keep the full 64-bit length: the intrinsic may carry an i64 size, and
+  // narrowing it to 'unsigned' would make a huge copy look like a small one.
+  const uint64_t Size = C->getZExtValue();
+  const unsigned DstAlign = MI->getDestAlignment();
+  const unsigned SrcAlign = MI->getSourceAlignment();
+
+  if (Size >= 32 && (DstAlign % 4 != 0 || SrcAlign % 4 != 0))
+    return LibCallCost;
+
+  const Function *F = I->getParent()->getParent();
+  std::vector<EVT> MemOps;
+
+  // If no lowering is found the contents of MemOps are meaningless, so fall
+  // back to the libcall cost instead of counting them.
+  // NOTE(review): Limit is unbounded here, so a very large aligned constant
+  // copy gets one MemOps entry per chunk; consider bounding it by the
+  // target's store limit for memcpy expansion -- confirm with the lowering.
+  if (!getTLI()->findOptimalMemOpLowering(
+          MemOps, ~0U /*Limit*/, Size, DstAlign, SrcAlign,
+          false /*IsMemset*/, false /*ZeroMemset*/, false /*MemcpyStrSrc*/,
+          false /*AllowOverlap*/, MI->getDestAddressSpace(),
+          MI->getSourceAddressSpace(), *F))
+    return LibCallCost;
+
+  // Every entry in MemOps stands for one load plus one store.
+  return MemOps.size() * 2;
+}
+
 int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) {
   if (Kind == TTI::SK_Broadcast) {
Index: test/Analysis/CostModel/ARM/memcpy.ll
===================================================================
--- test/Analysis/CostModel/ARM/memcpy.ll
+++ test/Analysis/CostModel/ARM/memcpy.ll
@@ -3,11 +3,469 @@
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv7m-arm-unknown-eabi"
 
-define void @memcpy(i8* %d, i8* %s, i32 %N) {
+;;;;;;;;;;;;
+; Align 1, 1
+;;;;;;;;;;;;
+
+define void @memcpy_1(i8* %d, i8* %s) {
+;
+; ldrb r1, [r1]
+; strb r1, [r0]
+;
+; CHECK: function 'memcpy_1'
+; CHECK-NEXT: cost of 2 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 1 %s, i32 1, i1 false)
+  ret void
+}
+
+define void @memcpy_2(i8* %d, i8* %s) {
+;
+; ldrh r1, [r1]
+; strh r1, [r0]
+;
+; CHECK: function 'memcpy_2'
+; CHECK-NEXT: cost of 2 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 1 %s, i32 2, i1 false)
+  ret void
+}
+
+define void @memcpy_3(i8* %d, i8* %s) {
+;
+; ldrb r2, [r1, #2]
+; strb r2, [r0, #2]
+; ldrh r1, [r1]
+; strh r1, [r0]
+;
+; CHECK: function 'memcpy_3'
+; CHECK-NEXT: cost of 4 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 1 %s, i32 3, i1 false)
+  ret void
+}
+
+define void @memcpy_4(i8* %d, i8* %s) {
+;
+; ldr r1, [r1]
+; str r1, [r0]
+;
+; CHECK: function 'memcpy_4'
+; CHECK-NEXT: cost of 2 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 1 %s, i32 4, i1 false)
+  ret void
+}
+
+define void @memcpy_8(i8* %d, i8* %s) {
+;
+; ldr r2, [r1]
+; ldr r1, [r1, #4]
+; str r1, [r0, #4]
+; str r2, [r0]
+;
+; CHECK: function 'memcpy_8'
+; CHECK-NEXT: cost of 4 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
 entry:
-; CHECK: cost of 4 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 1 %s, i32 36, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 1 %s, i32 8, i1 false)
   ret void
 }
 
+define void @memcpy_16(i8* %d, i8* %s) {
+;
+; ldr.w r12, [r1]
+; ldr r3, [r1, #4]
+; ldr r2, [r1, #8]
+; ldr r1, [r1, #12]
+; str r1, [r0, #12]
+; str r2, [r0, #8]
+; str r3, [r0, #4]
+; str.w r12, [r0]
+;
+; CHECK: function 'memcpy_16'
+; CHECK-NEXT: cost of 8 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 1 %s, i32 16, i1 false)
+  ret void
+}
+
+define void @memcpy_32(i8* %d, i8* %s, i32 %N) {
+;
+; movs r2, #32
+; bl __aeabi_memcpy
+;
+; CHECK: function 'memcpy_32'
+; CHECK-NEXT: cost of 6 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 1 %s, i32 32, i1 false)
+  ret void
+}
+
+define void @memcpy_N(i8* %d, i8* %s, i32 %N) {
+;
+; bl __aeabi_memcpy
+;
+; CHECK: function 'memcpy_N'
+; CHECK-NEXT: cost of 6 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 1 %s, i32 %N, i1 false)
+  ret void
+}
+
+;;;;;;;;;;;;;
+; Align 2, 2
+;;;;;;;;;;;;;
+
+define void @memcpy_1_al2(i8* %d, i8* %s) {
+;
+; ldrb r1, [r1]
+; strb r1, [r0]
+;
+; CHECK: function 'memcpy_1_al2'
+; CHECK-NEXT: cost of 2 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 %d, i8* align 2 %s, i32 1, i1 false)
+  ret void
+}
+
+define void @memcpy_2_al2(i8* %d, i8* %s) {
+;
+; ldrh r1, [r1]
+; strh r1, [r0]
+;
+; CHECK: function 'memcpy_2_al2'
+; CHECK-NEXT: cost of 2 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 %d, i8* align 2 %s, i32 2, i1 false)
+  ret void
+}
+
+define void @memcpy_3_al2(i8* %d, i8* %s) {
+;
+; ldrb r2, [r1, #2]
+; strb r2, [r0, #2]
+; ldrh r1, [r1]
+; strh r1, [r0]
+;
+; CHECK: function 'memcpy_3_al2'
+; CHECK-NEXT: cost of 4 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 %d, i8* align 2 %s, i32 3, i1 false)
+  ret void
+}
+
+define void @memcpy_4_al2(i8* %d, i8* %s) {
+;
+; ldr r1, [r1]
+; str r1, [r0]
+;
+; CHECK: function 'memcpy_4_al2'
+; CHECK-NEXT: cost of 2 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 %d, i8* align 2 %s, i32 4, i1 false)
+  ret void
+}
+
+define void @memcpy_8_al2(i8* %d, i8* %s) {
+;
+; ldr r2, [r1]
+; ldr r1, [r1, #4]
+; str r1, [r0, #4]
+; str r2, [r0]
+;
+; CHECK: function 'memcpy_8_al2'
+; CHECK-NEXT: cost of 4 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 %d, i8* align 2 %s, i32 8, i1 false)
+  ret void
+}
+
+define void @memcpy_16_al2(i8* %d, i8* %s) {
+;
+; ldr.w r12, [r1]
+; ldr r3, [r1, #4]
+; ldr r2, [r1, #8]
+; ldr r1, [r1, #12]
+; str r1, [r0, #12]
+; str r2, [r0, #8]
+; str r3, [r0, #4]
+; str.w r12, [r0]
+;
+; CHECK: function 'memcpy_16_al2'
+; CHECK-NEXT: cost of 8 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 %d, i8* align 2 %s, i32 16, i1 false)
+  ret void
+}
+
+define void @memcpy_32_al2(i8* %d, i8* %s, i32 %N) {
+;
+; movs r2, #32
+; bl __aeabi_memcpy
+;
+; CHECK: function 'memcpy_32_al2'
+; CHECK-NEXT: cost of 6 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 %d, i8* align 2 %s, i32 32, i1 false)
+  ret void
+}
+
+define void @memcpy_N_al2(i8* %d, i8* %s, i32 %N) {
+;
+; bl __aeabi_memcpy
+;
+; CHECK: function 'memcpy_N_al2'
+; CHECK-NEXT: cost of 6 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 %d, i8* align 2 %s, i32 %N, i1 false)
+  ret void
+}
+
+;;;;;;;;;;;;;
+; Align 4, 4
+;;;;;;;;;;;;;
+
+define void @memcpy_1_al4(i8* %d, i8* %s) {
+;
+; ldrb r1, [r1]
+; strb r1, [r0]
+;
+; CHECK: function 'memcpy_1_al4'
+; CHECK-NEXT: cost of 2 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %d, i8* align 4 %s, i32 1, i1 false)
+  ret void
+}
+
+define void @memcpy_2_al4(i8* %d, i8* %s) {
+;
+; ldrh r1, [r1]
+; strh r1, [r0]
+;
+; CHECK: function 'memcpy_2_al4'
+; CHECK-NEXT: cost of 2 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %d, i8* align 4 %s, i32 2, i1 false)
+  ret void
+}
+
+define void @memcpy_3_al4(i8* %d, i8* %s) {
+;
+; ldrb r2, [r1, #2]
+; strb r2, [r0, #2]
+; ldrh r1, [r1]
+; strh r1, [r0]
+;
+; CHECK: function 'memcpy_3_al4'
+; CHECK-NEXT: cost of 4 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %d, i8* align 4 %s, i32 3, i1 false)
+  ret void
+}
+
+define void @memcpy_4_al4(i8* %d, i8* %s) {
+;
+; ldr r1, [r1]
+; str r1, [r0]
+;
+; CHECK: function 'memcpy_4_al4'
+; CHECK-NEXT: cost of 2 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %d, i8* align 4 %s, i32 4, i1 false)
+  ret void
+}
+
+define void @memcpy_8_al4(i8* %d, i8* %s) {
+;
+; ldrd r2, r1, [r1]
+; strd r2, r1, [r0]
+;
+; CHECK: function 'memcpy_8_al4'
+; CHECK-NEXT: cost of 4 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %d, i8* align 4 %s, i32 8, i1 false)
+  ret void
+}
+
+define void @memcpy_16_al4(i8* %d, i8* %s) {
+;
+; ldm.w r1, {r2, r3, r12}
+; ldr r1, [r1, #12]
+; stm.w r0, {r2, r3, r12}
+; str r1, [r0, #12]
+;
+; CHECK: function 'memcpy_16_al4'
+; CHECK-NEXT: cost of 8 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %d, i8* align 4 %s, i32 16, i1 false)
+  ret void
+}
+
+define void @memcpy_32_al4(i8* %d, i8* %s, i32 %N) {
+;
+; ldm.w r1!, {r2, r3, r12, lr}
+; stm.w r0!, {r2, r3, r12, lr}
+; ldm.w r1, {r2, r3, r12, lr}
+; stm.w r0, {r2, r3, r12, lr}
+;
+; CHECK: function 'memcpy_32_al4'
+; CHECK-NEXT: cost of 16 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %d, i8* align 4 %s, i32 32, i1 false)
+  ret void
+}
+
+define void @memcpy_N_al4(i8* %d, i8* %s, i32 %N) {
+;
+; bl __aeabi_memcpy4
+;
+; CHECK: function 'memcpy_N_al4'
+; CHECK-NEXT: cost of 6 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %d, i8* align 4 %s, i32 %N, i1 false)
+  ret void
+}
+
+;;;;;;;;;;;;;
+; Align 1, 4
+;;;;;;;;;;;;;
+
+define void @memcpy_1_al14(i8* %d, i8* %s) {
+;
+; ldrb r1, [r1]
+; strb r1, [r0]
+;
+; CHECK: function 'memcpy_1_al14'
+; CHECK-NEXT: cost of 2 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 4 %s, i32 1, i1 false)
+  ret void
+}
+
+define void @memcpy_2_al14(i8* %d, i8* %s) {
+;
+; ldrh r1, [r1]
+; strh r1, [r0]
+;
+; CHECK: function 'memcpy_2_al14'
+; CHECK-NEXT: cost of 2 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 4 %s, i32 2, i1 false)
+  ret void
+}
+
+define void @memcpy_3_al14(i8* %d, i8* %s) {
+;
+; ldrb r2, [r1, #2]
+; strb r2, [r0, #2]
+; ldrh r1, [r1]
+; strh r1, [r0]
+;
+; CHECK: function 'memcpy_3_al14'
+; CHECK-NEXT: cost of 4 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 4 %s, i32 3, i1 false)
+  ret void
+}
+
+define void @memcpy_4_al14(i8* %d, i8* %s) {
+;
+; ldr r1, [r1]
+; str r1, [r0]
+;
+; CHECK: function 'memcpy_4_al14'
+; CHECK-NEXT: cost of 2 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 4 %s, i32 4, i1 false)
+  ret void
+}
+
+define void @memcpy_8_al14(i8* %d, i8* %s) {
+;
+; ldr r2, [r1]
+; ldr r1, [r1, #4]
+; str r1, [r0, #4]
+; str r2, [r0]
+;
+; CHECK: function 'memcpy_8_al14'
+; CHECK-NEXT: cost of 4 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 4 %s, i32 8, i1 false)
+  ret void
+}
+
+define void @memcpy_16_al14(i8* %d, i8* %s) {
+;
+; ldr.w r12, [r1]
+; ldr r3, [r1, #4]
+; ldr r2, [r1, #8]
+; ldr r1, [r1, #12]
+; str r1, [r0, #12]
+; str r2, [r0, #8]
+; str r3, [r0, #4]
+; str.w r12, [r0]
+;
+; CHECK: function 'memcpy_16_al14'
+; CHECK-NEXT: cost of 8 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 4 %s, i32 16, i1 false)
+  ret void
+}
+
+define void @memcpy_32_al14(i8* %d, i8* %s) {
+;
+; movs r2, #32
+; bl __aeabi_memcpy
+;
+; CHECK: function 'memcpy_32_al14'
+; CHECK-NEXT: cost of 6 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 4 %s, i32 32, i1 false)
+  ret void
+}
+
+define void @memcpy_N_al14(i8* %d, i8* %s, i32 %N) {
+;
+; bl __aeabi_memcpy4
+;
+; CHECK: function 'memcpy_N_al14'
+; CHECK-NEXT: cost of 6 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32
+;
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %d, i8* align 4 %s, i32 %N, i1 false)
+  ret void
+}
+
+
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i1) #1