Index: lib/Target/ARM/ARM.h
===================================================================
--- lib/Target/ARM/ARM.h
+++ lib/Target/ARM/ARM.h
@@ -43,6 +43,7 @@
 FunctionPass *createARMOptimizeBarriersPass();
 FunctionPass *createThumb2SizeReductionPass(
     std::function<bool(const Function &)> Ftor = nullptr);
+FunctionPass *createARMPadMemcpyPass();
 
 void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                   ARMAsmPrinter &AP);
Index: lib/Target/ARM/ARMPadMemcpyPass.cpp
===================================================================
--- /dev/null
+++ lib/Target/ARM/ARMPadMemcpyPass.cpp
@@ -0,0 +1,307 @@
+// ARMPadMemcpyPass.cpp - Pads destination and source of memcpy so that they
+// take up a full word of bytes.
+//
+// A constant-length memcpy whose size is not a multiple of four needs narrower
+// loads and stores for its tail. When such a copy fills a local char array from
+// a private constant string of exactly the same size, this pass grows both
+// objects (and the copy) to the next multiple of four bytes.
+
+#define DEBUG_TYPE "arm-pad-memcpy"
+
+#include "ARM.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+static cl::opt<bool> DisableARMPadMemcpy(
+    "disable-arm-pad-memcpy", cl::Hidden, cl::init(false),
+    cl::desc("Disable padding of memcpy operands to a multiple of a word"));
+
+namespace {
+
+/// Function pass that pads small constant-string memcpys (and the objects they
+/// touch) so that the number of bytes copied is a multiple of four.
+class ARMPadMemcpyPass : public FunctionPass {
+public:
+  static char ID;
+  explicit ARMPadMemcpyPass() : FunctionPass(ID) {}
+
+  // Ideally I would call ARMSubtarget::getMaxInlineSizeThreshold() to get the
+  // right value but can't get a hold of a Subtarget object from a Function
+  // object.
+  // This value represents the maximum memcpy size allowed for inlining memcpy,
+  // if number of bytes is greater then library function will be called
+  // instead.
+  static constexpr unsigned MemcpyInliningLimit = 64;
+
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override { return "ARM Pad Memcpy"; }
+
+private:
+  bool padMemcpy(MemCpyInst &MCI);
+};
+
+char ARMPadMemcpyPass::ID = 0;
+
+static RegisterPass<ARMPadMemcpyPass> X("arm-pad-memcpy",
+                                        "Pad memcpy source and destination");
+
+/// Returns true if \p T is an array of 8-bit integers.
+static bool IsCharArray(Type *T) {
+  return T && T->isArrayTy() && T->getArrayElementType()->isIntegerTy(8);
+}
+
+/// Rewrites the size operand of every llvm.lifetime.start/end marker that
+/// directly uses \p Ptr from \p OldSize to \p NewSize.
+static void UpdateLifetimeMarkers(Value *Ptr, uint64_t OldSize,
+                                  uint64_t NewSize) {
+  for (User *U : Ptr->users()) {
+    auto *II = dyn_cast<IntrinsicInst>(U);
+    if (!II)
+      continue;
+    const Intrinsic::ID IID = II->getIntrinsicID();
+    if (IID != Intrinsic::lifetime_start && IID != Intrinsic::lifetime_end)
+      continue;
+    auto *Size = dyn_cast<ConstantInt>(II->getArgOperand(0));
+    if (Size && Size->getZExtValue() == OldSize)
+      II->setArgOperand(0, ConstantInt::get(Size->getType(), NewSize));
+  }
+}
+
+/// Tries to pad a single memcpy together with its destination and source.
+///
+/// The transformation is only applied when \p MCI is a non-volatile copy of a
+/// constant number of bytes (not already a multiple of four) from the start of
+/// a private, unnamed_addr, constant i8 array into the start of a static i8
+/// array alloca, when all three sizes are equal, and when the padded size does
+/// not exceed MemcpyInliningLimit.
+///
+/// \returns true if the IR was changed.
+bool ARMPadMemcpyPass::padMemcpy(MemCpyInst &MCI) {
+  DEBUG(dbgs() << "Found call to strcpy/memcpy\n");
+
+  auto *BytesToCopy = dyn_cast<ConstantInt>(MCI.getLength());
+  if (!BytesToCopy) {
+    DEBUG(dbgs() << "Number of bytes to copy is null\n");
+    return false;
+  }
+  const uint64_t NumBytesToCopy = BytesToCopy->getZExtValue();
+
+  if (MCI.isVolatile()) {
+    DEBUG(dbgs() << "Not padding strings for this memcpy because it's "
+                    "a volatile operations\n");
+    return false;
+  }
+
+  if (NumBytesToCopy % 4 == 0) {
+    DEBUG(dbgs() << "Bytes to copy in strcpy/memcpy is already word "
+                    "aligned so nothing to do here.\n");
+    return false;
+  }
+
+  // The source must point at the first byte of a global: the index operands
+  // are reused below to build a constant expression, and a non-zero offset
+  // combined with a full-size copy would already read out of bounds.
+  auto *SourcePtr = dyn_cast<GEPOperator>(MCI.getRawSource());
+  if (!SourcePtr || SourcePtr->getNumIndices() != 2 ||
+      !SourcePtr->hasAllZeroIndices()) {
+    DEBUG(dbgs() << "Source isn't a GEP to the start of an array\n");
+    return false;
+  }
+
+  auto *SourceVar = dyn_cast<GlobalVariable>(SourcePtr->getPointerOperand());
+  if (!SourceVar) {
+    DEBUG(dbgs() << "Source pointer isn't a global constant variable.\n");
+    return false;
+  }
+
+  if (!SourceVar->hasInitializer() || !SourceVar->isConstant() ||
+      !SourceVar->hasLocalLinkage() || !SourceVar->hasGlobalUnnamedAddr()) {
+    DEBUG(dbgs() << "Source is not constant global, thus it's "
+                    "mutable therefore it's not safe to pad\n");
+    return false;
+  }
+
+  auto *SourceDataArray =
+      dyn_cast<ConstantDataArray>(SourceVar->getInitializer());
+  if (!SourceDataArray || !IsCharArray(SourceDataArray->getType())) {
+    DEBUG(dbgs() << "source isn't a constant data array\n");
+    return false;
+  }
+
+  auto *DestinationGEP = dyn_cast<GetElementPtrInst>(MCI.getRawDest());
+  if (!DestinationGEP || DestinationGEP->getNumIndices() != 2 ||
+      !DestinationGEP->hasAllZeroIndices()) {
+    DEBUG(dbgs() << "Destination isn't a GEP to the start of an array\n");
+    return false;
+  }
+
+  auto *Alloca = dyn_cast<AllocaInst>(DestinationGEP->getPointerOperand());
+  if (!Alloca) {
+    DEBUG(dbgs() << "Destination isn't allocated on the stack.\n");
+    return false;
+  }
+
+  // An array allocation (alloca T, N) cannot be resized by changing T alone.
+  if (!Alloca->isStaticAlloca() || Alloca->isArrayAllocation()) {
+    DEBUG(dbgs() << "Destination allocation isn't a static "
+                    "constant which is locally allocated in this "
+                    "function, so skipping.\n");
+    return false;
+  }
+
+  // Make sure destination is definitely a char array.
+  Type *DestinationTy = Alloca->getAllocatedType();
+  if (!IsCharArray(DestinationTy)) {
+    DEBUG(dbgs() << "Destination doesn't look like a constant char (8 "
+                    "bits) array\n");
+    return false;
+  }
+
+  const uint64_t DzSize = DestinationTy->getArrayNumElements();
+  const uint64_t SzSize = SourceDataArray->getType()->getNumElements();
+
+  // For safety purposes let's add a constraint and only pad when
+  // num bytes to copy == destination array size == source string
+  // which is a constant.
+  DEBUG(dbgs() << "Number of bytes to copy is: " << NumBytesToCopy << "\n");
+  DEBUG(dbgs() << "Size of destination array is: " << DzSize << "\n");
+  DEBUG(dbgs() << "Size of source array is: " << SzSize << "\n");
+  if (NumBytesToCopy != DzSize || DzSize != SzSize) {
+    DEBUG(dbgs() << "Size of number of bytes to copy, destination "
+                    "array and source string don't match, so "
+                    "skipping\n");
+    return false;
+  }
+
+  // Compare before adding so a huge length cannot wrap around the limit.
+  const uint64_t NumBytesToPad = 4 - (NumBytesToCopy % 4);
+  if (NumBytesToCopy > MemcpyInliningLimit - NumBytesToPad) {
+    DEBUG(dbgs() << "Not going to pad because the padded size would be "
+                    "greater than the inlining limit for memcpy which is "
+                 << MemcpyInliningLimit << "\n");
+    return false;
+  }
+  const uint64_t TotalBytes = NumBytesToCopy + NumBytesToPad;
+
+  // Every user of the old alloca has to be redirected to the new one. Only
+  // two-index GEPs can be rebuilt on the wider array type, so give up before
+  // changing anything if there is any other kind of user.
+  SmallVector<GetElementPtrInst *, 4> DestinationUsers;
+  for (User *U : Alloca->users()) {
+    auto *GEP = dyn_cast<GetElementPtrInst>(U);
+    if (!GEP || GEP->getNumIndices() != 2) {
+      DEBUG(dbgs() << "Destination has a user that can't be rewritten\n");
+      return false;
+    }
+    DestinationUsers.push_back(GEP);
+  }
+
+  DEBUG(dbgs() << "Going to pad.\n");
+  DEBUG(dbgs() << "Number of bytes to pad by is " << NumBytesToPad << "\n");
+
+  // update destination char array to be word aligned (memcpy(X,...,...))
+  IRBuilder<> BuildAlloca(Alloca);
+  AllocaInst *NewAlloca = BuildAlloca.CreateAlloca(
+      ArrayType::get(DestinationTy->getArrayElementType(), TotalBytes));
+  NewAlloca->takeName(Alloca);
+  NewAlloca->setAlignment(Alloca->getAlignment());
+
+  DEBUG(dbgs()
+        << "Updating users of destination stack object to use new size\n");
+  for (GetElementPtrInst *GEP : DestinationUsers) {
+    Value *Indices[] = {GEP->getOperand(1), GEP->getOperand(2)};
+    GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+        NewAlloca->getAllocatedType(), NewAlloca, Indices, "", GEP);
+    NewGEP->takeName(GEP);
+    NewGEP->setIsInBounds(GEP->isInBounds());
+    NewGEP->setDebugLoc(GEP->getDebugLoc());
+    GEP->replaceAllUsesWith(NewGEP);
+    // Markers on the start of the object should cover the padded object.
+    if (NewGEP->hasAllZeroIndices())
+      UpdateLifetimeMarkers(NewGEP, DzSize, TotalBytes);
+    GEP->eraseFromParent();
+  }
+  Alloca->eraseFromParent();
+
+  // update source to be word aligned (memcpy(...,X,...))
+  // create replacement string with padded null bytes.
+  StringRef Data = SourceDataArray->getRawDataValues();
+  SmallVector<uint8_t, 64> StrData(Data.begin(), Data.end());
+  StrData.resize(TotalBytes, 0);
+  ArrayRef<uint8_t> Arr(StrData);
+
+  // create new padded version of global variable string.
+  Constant *SourceReplace = ConstantDataArray::get(MCI.getContext(), Arr);
+  GlobalVariable *NewGV = new GlobalVariable(
+      *MCI.getModule(), SourceReplace->getType(), /*isConstant=*/true,
+      SourceVar->getLinkage(), SourceReplace);
+
+  // copy any other attributes from original global variable string
+  // e.g. unnamed_addr
+  NewGV->copyAttributesFrom(SourceVar);
+  NewGV->takeName(SourceVar);
+
+  // create new expression for memcpy intrinsic to reference new global
+  // variable string. The indices are constant zeros (checked above).
+  Value *SourceIndices[] = {SourcePtr->getOperand(1),
+                            SourcePtr->getOperand(2)};
+  Constant *Replace = ConstantExpr::getInBoundsGetElementPtr(
+      NewGV->getValueType(), NewGV, SourceIndices);
+
+  // replace intrinsic source and number of bytes to copy.
+  MCI.setSource(Replace);
+  MCI.setLength(ConstantInt::get(BytesToCopy->getType(), TotalBytes));
+
+  // Drop the unpadded string if this was its last use so the module doesn't
+  // keep both copies. SourcePtr may be destroyed here; don't use it afterwards.
+  SourceVar->removeDeadConstantUsers();
+  if (SourceVar->use_empty())
+    SourceVar->eraseFromParent();
+
+  DEBUG(dbgs() << "Padded dest/source and increased number of bytes.\n");
+  return true;
+}
+
+/// Pads every eligible memcpy in \p F; returns true if anything changed.
+bool ARMPadMemcpyPass::runOnFunction(Function &F) {
+  if (DisableARMPadMemcpy || skipFunction(F))
+    return false;
+  DEBUG(dbgs() << "Running ARMPadMemcpy on function " << F.getName() << "\n");
+
+  // Collect the candidates first: padMemcpy() inserts and erases instructions,
+  // which must not happen while the instruction lists are being walked. The
+  // memcpy calls themselves are never erased, so the pointers stay valid.
+  SmallVector<MemCpyInst *, 8> Memcpys;
+  for (BasicBlock &BB : F)
+    for (Instruction &I : BB)
+      if (auto *MCI = dyn_cast<MemCpyInst>(&I))
+        Memcpys.push_back(MCI);
+
+  bool Modified = false;
+  for (MemCpyInst *MCI : Memcpys)
+    Modified |= padMemcpy(*MCI);
+  return Modified;
+}
+
+} // end of anonymous namespace
+
+/// Creates a new instance of the memcpy padding pass.
+FunctionPass *llvm::createARMPadMemcpyPass() { return new ARMPadMemcpyPass; }
Index: lib/Target/ARM/ARMTargetMachine.cpp
===================================================================
--- lib/Target/ARM/ARMTargetMachine.cpp
+++ lib/Target/ARM/ARMTargetMachine.cpp
@@ -369,6 +369,10 @@
   else
     addPass(createAtomicExpandPass(TM));
 
+  if (TM->getOptLevel() != CodeGenOpt::None) {
+    addPass(createARMPadMemcpyPass());
+  }
+
   // Cmpxchg instructions are often used with a subsequent comparison to
   // determine whether it succeeded. We can exploit existing control-flow in
   // ldrex/strex loops to simplify this, but it needs tidying up.
Index: lib/Target/ARM/CMakeLists.txt
===================================================================
--- lib/Target/ARM/CMakeLists.txt
+++ lib/Target/ARM/CMakeLists.txt
@@ -45,6 +45,7 @@
   Thumb2InstrInfo.cpp
   Thumb2SizeReduction.cpp
   ARMComputeBlockSize.cpp
+  ARMPadMemcpyPass.cpp
   )
 
 add_subdirectory(TargetInfo)
Index: test/CodeGen/ARM/arm-pad-memcpy-lengths-dont-match.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/arm-pad-memcpy-lengths-dont-match.ll
@@ -0,0 +1,36 @@
+; Test for padding memcpy's. This tests the simple heuristic that decides
+; whether to pad or not. The heuristic says to not pad when the destination
+; of the memcpy isn't the same length as the source string to copy and the
+; number of bytes to copy.
+
+; RUN: llc < %s -mtriple=arm-arm-none-eabi -O3 | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-arm-none-eabi"
+@.str = private unnamed_addr constant [17 x i8] c"aaaaaaaaaaaaaaaa\00", align 1
+
+; Function Attrs: nounwind
+define hidden void @foo() local_unnamed_addr #0 {
+entry:
+  %something = alloca [20 x i8], align 1
+  %0 = getelementptr inbounds [20 x i8], [20 x i8]* %something, i32 0, i32 0
+  call void @llvm.lifetime.start(i64 20, i8* nonnull %0) #3
+  ; CHECK: ldm
+  ; CHECK: stm
+  ; CHECK-NEXT: ldrb
+  ; CHECK-NEXT: strb
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* nonnull %0, i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str, i32 0, i32 0), i32 17, i32 1, i1 false)
+  %call2 = call i32 bitcast (i32 (...)* @bar to i32 (i8*)*)(i8* nonnull %0) #3
+  call void @llvm.lifetime.end(i64 20, i8* nonnull %0) #3
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+declare i32 @bar(...) local_unnamed_addr #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32, i1) #1
Index: test/CodeGen/ARM/arm-pad-memcpy-more-than-64-bytes.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/arm-pad-memcpy-more-than-64-bytes.ll
@@ -0,0 +1,33 @@
+; This tests the ARM pad memcpy pass. This test checks that the padding
+; pass doesn't pad when the copy is >64 bytes and instead calls the memcpy
+; library function.
+
+; RUN: llc < %s -mtriple=arm-arm-none-eabi -O3 | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-arm-none-eabi"
+
+@.str = private unnamed_addr constant [65 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzz\00", align 1
+
+; Function Attrs: nounwind
+define hidden void @foo() local_unnamed_addr #0 {
+entry:
+  %something = alloca [65 x i8], align 1
+  %0 = getelementptr inbounds [65 x i8], [65 x i8]* %something, i32 0, i32 0
+  call void @llvm.lifetime.start(i64 65, i8* nonnull %0) #3
+  ; CHECK: __aeabi_memcpy
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* nonnull %0, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i32 65, i32 1, i1 false)
+  %call2 = call i32 bitcast (i32 (...)* @bar to i32 (i8*)*)(i8* nonnull %0) #3
+  call void @llvm.lifetime.end(i64 65, i8* nonnull %0) #3
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+declare i32 @bar(...) local_unnamed_addr #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32, i1) #1
Index: test/CodeGen/ARM/arm-pad-memcpy-strings-test1.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/arm-pad-memcpy-strings-test1.ll
@@ -0,0 +1,44 @@
+; This tests the ARM memcpy padding pass. This test checks that a 22 byte
+; string is padded to 24 bytes, which allows the instruction selector to
+; select ldm/stm instructions for all of the memcpy instead of using
+; ldrh / strh to copy the last 2 bytes. This also tests that when the pass
+; is turned off with option '-disable-arm-pad-memcpy', ldrh / strh is
+; used to copy the last 2 bytes.
+
+; RUN: llc < %s -mtriple thumbv6m-arm-none-eabi -O3 | FileCheck %s
+; RUN: llc < %s -mtriple thumbv6m-arm-none-eabi -O3 -disable-arm-pad-memcpy | FileCheck %s --check-prefix=TURNED-OFF
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-none--eabi"
+
+@.str = private unnamed_addr constant [22 x i8] c"aaaaaaaaaaaaaaaaaaaaa\00", align 1
+
+; Function Attrs: nounwind
+define i32 @main() local_unnamed_addr #0 {
+entry:
+  %a = alloca [22 x i8], align 1
+  %0 = getelementptr inbounds [22 x i8], [22 x i8]* %a, i32 0, i32 0
+  call void @llvm.lifetime.start(i64 22, i8* nonnull %0) #3
+  ; CHECK: ldm
+  ; CHECK: stm
+  ; CHECK: ldm
+  ; CHECK: stm
+  ; CHECK-NOT: ldrh
+  ; CHECK-NOT: strh
+  ; TURNED-OFF: ldrh
+  ; TURNED-OFF: strh
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* nonnull %0, i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str, i32 0, i32 0), i32 22, i32 1, i1 false)
+  %call2 = call i32 bitcast (i32 (...)* @foo to i32 (i8*)*)(i8* nonnull %0) #3
+  call void @llvm.lifetime.end(i64 22, i8* nonnull %0) #3
+  ret i32 0
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+declare i32 @foo(...) local_unnamed_addr #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32, i1) #1
Index: test/CodeGen/ARM/arm-pad-memcpy-strings-test2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/arm-pad-memcpy-strings-test2.ll
@@ -0,0 +1,44 @@
+; This tests the ARM memcpy padding pass. When the string to copy is 62
+; bytes the padding pass should pad by 2 bytes to make the copy use the
+; full word, allowing the instruction selector to use ldm / stm to copy the
+; full string. If padding wasn't enabled the copy would use ldrh/strh to
+; copy the last 2 bytes.
+
+; RUN: llc < %s -mtriple thumbv6m-arm-none-eabi -O3 | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-none--eabi"
+
+@.str = private unnamed_addr constant [62 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1
+
+; Function Attrs: nounwind
+define i32 @main() local_unnamed_addr #0 {
+entry:
+  %a = alloca [62 x i8], align 1
+  %0 = getelementptr inbounds [62 x i8], [62 x i8]* %a, i32 0, i32 0
+  call void @llvm.lifetime.start(i64 62, i8* nonnull %0) #3
+  ; CHECK: ldm
+  ; CHECK-NEXT: stm
+  ; CHECK-NEXT: ldm
+  ; CHECK-NEXT: stm
+  ; CHECK-NEXT: ldm
+  ; CHECK-NEXT: stm
+  ; CHECK-NEXT: ldm
+  ; CHECK-NEXT: stm
+  ; CHECK-NOT: ldrh
+  ; CHECK-NOT: strh
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* nonnull %0, i8* getelementptr inbounds ([62 x i8], [62 x i8]* @.str, i32 0, i32 0), i32 62, i32 1, i1 false)
+  %call2 = call i32 bitcast (i32 (...)* @foo to i32 (i8*)*)(i8* nonnull %0) #3
+  call void @llvm.lifetime.end(i64 62, i8* nonnull %0) #3
+  ret i32 0
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+declare i32 @foo(...) local_unnamed_addr #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32, i1) #1