Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -790,6 +790,44 @@
   /// remove the old.
   bool useWideIRMemcpyLoopLowering() const;
 
+  /// Information about the desired sizes for an inlined fast-path mem library
+  /// function.
+  struct MemOpFastPathSizeInfo {
+    /// The size of the operation within the loop, in bytes.
+    ///
+    /// This must always be a power of two.
+    int OpByteSize;
+
+    /// The maximum number of iterations for which the loop remains profitable.
+    int MaxIterations;
+  };
+
+  /// Computes the size threshold and best granularity for emitting a fast-path
+  /// loop to bypass a memset library call.
+  ///
+  /// For small sizes, a raw loop may be substantially faster than calling
+  /// memset. This routine tells LLVM up to what size this is profitable and
+  /// what the step size of the loop should be. The \p MaxOpByteSize is
+  /// provided by analyzing the alignment and size passed to the memset.
+  ///
+  /// A zero for `MaxIterations` in the returned struct effectively disables
+  /// inline fast-paths for the target.
+  MemOpFastPathSizeInfo
+  getMemsetInlineFastPathSizeInfo(int MaxOpByteSize) const;
+
+  /// Computes the size threshold and best granularity for emitting a fast-path
+  /// loop to bypass a memcpy library call.
+  ///
+  /// For small sizes, a raw loop may be substantially faster than calling
+  /// memcpy. This routine tells LLVM up to what size this is profitable and
+  /// what the step size of the loop should be. The \p MaxOpByteSize is
+  /// provided by analyzing the alignment and size passed to the memcpy.
+  ///
+  /// A zero for `MaxIterations` in the returned struct effectively disables
+  /// inline fast-paths for the target.
+  MemOpFastPathSizeInfo
+  getMemcpyInlineFastPathSizeInfo(int MaxOpByteSize) const;
+
   /// \returns True if the two functions have compatible attributes for inlining
   /// purposes.
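  // For illustration only: a transform consuming these hooks might query them
  // roughly as follows, where the value 8 for MaxOpByteSize is an assumed
  // example derived from the call's alignment and size expression:
  //
  //   TargetTransformInfo::MemOpFastPathSizeInfo Info =
  //       TTI.getMemsetInlineFastPathSizeInfo(/*MaxOpByteSize=*/8);
  //   if (Info.MaxIterations == 0)
  //     return false; // The target has disabled inline fast paths.
  //   // Otherwise, emit a loop of Info.OpByteSize-wide stores guarded by a
  //   // check that the op count does not exceed Info.MaxIterations.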
bool areInlineCompatible(const Function *Caller, @@ -999,6 +1037,10 @@ virtual void getMemcpyLoopResidualLoweringType( SmallVectorImpl &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const = 0; + virtual MemOpFastPathSizeInfo + getMemsetInlineFastPathSizeInfo(int MaxOpByteSize) const = 0; + virtual MemOpFastPathSizeInfo + getMemcpyInlineFastPathSizeInfo(int MaxOpByteSize) const = 0; virtual bool areInlineCompatible(const Function *Caller, const Function *Callee) const = 0; virtual unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const = 0; @@ -1332,6 +1374,14 @@ Impl.getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes, SrcAlign, DestAlign); } + MemOpFastPathSizeInfo + getMemsetInlineFastPathSizeInfo(int MaxOpByteSize) const override { + return Impl.getMemsetInlineFastPathSizeInfo(MaxOpByteSize); + } + MemOpFastPathSizeInfo + getMemcpyInlineFastPathSizeInfo(int MaxOpByteSize) const override { + return Impl.getMemcpyInlineFastPathSizeInfo(MaxOpByteSize); + } bool areInlineCompatible(const Function *Caller, const Function *Callee) const override { return Impl.areInlineCompatible(Caller, Callee); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -464,6 +464,14 @@ OpsOut.push_back(Type::getInt8Ty(Context)); } + TTI::MemOpFastPathSizeInfo getMemsetInlineFastPathSizeInfo(int) const { + return {0, 0}; + } + + TTI::MemOpFastPathSizeInfo getMemcpyInlineFastPathSizeInfo(int) const { + return {0, 0}; + } + bool areInlineCompatible(const Function *Caller, const Function *Callee) const { return (Caller->getFnAttribute("target-cpu") == Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -131,6 +131,7 @@ void initializeExpandReductionsPass(PassRegistry&); void initializeExternalAAWrapperPassPass(PassRegistry&); void initializeFEntryInserterPass(PassRegistry&); +void initializeFastPathLibCallsLegacyPassPass(PassRegistry&); void initializeFinalizeMachineBundlesPass(PassRegistry&); void initializeFlattenCFGPassPass(PassRegistry&); void initializeFloat2IntLegacyPassPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -45,6 +45,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/ObjCARC.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/FastPathLibCalls.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" @@ -99,6 +100,7 @@ (void) llvm::createPGOIndirectCallPromotionLegacyPass(); (void) llvm::createPGOMemOPSizeOptLegacyPass(); (void) llvm::createInstrProfilingLegacyPass(); + (void) llvm::createFastPathLibCallsLegacyPass(); (void) llvm::createFunctionImportPass(); (void) llvm::createFunctionInliningPass(); (void) llvm::createAlwaysInlinerLegacyPass(); Index: include/llvm/Transforms/Scalar/FastPathLibCalls.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Scalar/FastPathLibCalls.h @@ -0,0 +1,39 @@ +//===- FastPathLibCalls.h - Insert fast-path code for lib calls -*- C++ -*-===// +// +// The LLVM 
Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_FASTPATHLIBCALLS_H
+#define LLVM_TRANSFORMS_SCALAR_FASTPATHLIBCALLS_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+/// Pass that injects fast-path code for calls to known library functions.
+///
+/// Inject a fast-path code sequence for known library functions where
+/// profitable. This pass specifically targets library functions with common
+/// code paths that can be profitably "inlined", potentially behind a dynamic
+/// test, rather than calling the library function.
+///
+/// For example, rather than call `memcpy` with a size of 16 bytes, if the size
+/// is known to be a multiple of 8 and the pointers suitably aligned, we can
+/// emit a simple loop for short sizes that will run substantially faster than
+/// calling out to a library function. With profile information, we can even
+/// adjust thresholds and emit weights on the branches.
+class FastPathLibCallsPass : public PassInfoMixin<FastPathLibCallsPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+/// Create a legacy pass analogous to `FastPathLibCallsPass` above.
+Pass *createFastPathLibCallsLegacyPass();
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_FASTPATHLIBCALLS_H
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -517,6 +517,16 @@
   return UseWideMemcpyLoopLowering;
 }
 
+TargetTransformInfo::MemOpFastPathSizeInfo
+TargetTransformInfo::getMemsetInlineFastPathSizeInfo(int MaxOpByteSize) const {
+  return TTIImpl->getMemsetInlineFastPathSizeInfo(MaxOpByteSize);
+}
+
+TargetTransformInfo::MemOpFastPathSizeInfo
+TargetTransformInfo::getMemcpyInlineFastPathSizeInfo(int MaxOpByteSize) const {
+  return TTIImpl->getMemcpyInlineFastPathSizeInfo(MaxOpByteSize);
+}
+
 bool TargetTransformInfo::areInlineCompatible(const Function *Caller,
                                               const Function *Callee) const {
   return TTIImpl->areInlineCompatible(Caller, Callee);
Index: lib/Passes/PassBuilder.cpp
===================================================================
--- lib/Passes/PassBuilder.cpp
+++ lib/Passes/PassBuilder.cpp
@@ -93,6 +93,7 @@
 #include "llvm/Transforms/Scalar/DCE.h"
 #include "llvm/Transforms/Scalar/DeadStoreElimination.h"
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/Transforms/Scalar/FastPathLibCalls.h"
 #include "llvm/Transforms/Scalar/Float2Int.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Scalar/GuardWidening.h"
@@ -743,6 +744,9 @@
   // alignment information, try to re-derive it here.
   OptimizePM.addPass(AlignmentFromAssumptionsPass());
 
+  // Insert fast-path bypasses for library functions.
+  OptimizePM.addPass(FastPathLibCallsPass());
+
   // LoopSink pass sinks instructions hoisted by LICM, which serves as a
   // canonicalization pass that enables other optimizations.
As a result, // LoopSink pass needs to be a very late IR pass to avoid undoing LICM Index: lib/Passes/PassRegistry.def =================================================================== --- lib/Passes/PassRegistry.def +++ lib/Passes/PassRegistry.def @@ -151,6 +151,7 @@ FUNCTION_PASS("instcombine", InstCombinePass()) FUNCTION_PASS("instsimplify", InstSimplifierPass()) FUNCTION_PASS("invalidate", InvalidateAllAnalysesPass()) +FUNCTION_PASS("fast-path-lib-calls", FastPathLibCallsPass()) FUNCTION_PASS("float2int", Float2IntPass()) FUNCTION_PASS("no-op-function", NoOpFunctionPass()) FUNCTION_PASS("libcalls-shrinkwrap", LibCallsShrinkWrapPass()) Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -108,6 +108,10 @@ bool isLegalMaskedStore(Type *DataType); bool isLegalMaskedGather(Type *DataType); bool isLegalMaskedScatter(Type *DataType); + TargetTransformInfo::MemOpFastPathSizeInfo + getMemsetInlineFastPathSizeInfo(int MaxOpByteSize) const; + TargetTransformInfo::MemOpFastPathSizeInfo + getMemcpyInlineFastPathSizeInfo(int MaxOpByteSize) const; bool areInlineCompatible(const Function *Caller, const Function *Callee) const; bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize); Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -2221,6 +2221,31 @@ return isLegalMaskedGather(DataType); } +TargetTransformInfo::MemOpFastPathSizeInfo +X86TTIImpl::getMemsetInlineFastPathSizeInfo(int MaxOpByteSize) const { + TTI::MemOpFastPathSizeInfo Result; + + // Cap the size at the word size. + // FIXME: If we teach the x86 backend to lower 128-bit (and wider) integer + // loads and stores using vectors, we may be able to grow this to encompass + // vector sizes and we'll need to adjust the iteration count below as well. + Result.OpByteSize = std::min(MaxOpByteSize, ST->is64Bit() ? 8 : 4); + + // On x86, a loop of up to six iterations remains profitable compared to the + // overhead of a library call unless more than 16 bytes are being touched at + // which point the vector based code in the library call is advantageous even + // after the overhead of the call itself. + Result.MaxIterations = std::min(6, 16 / Result.OpByteSize); + + return Result; +} + +TargetTransformInfo::MemOpFastPathSizeInfo +X86TTIImpl::getMemcpyInlineFastPathSizeInfo(int MaxOpByteSize) const { + // The heuristics for memset and memcpy are the same for x86. 
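+  // For example (the numbers follow directly from the memset heuristic
+  // above): on x86-64 a MaxOpByteSize of 8 yields {OpByteSize = 8,
+  // MaxIterations = min(6, 16 / 8) = 2}, while a MaxOpByteSize of 1 yields
+  // {OpByteSize = 1, MaxIterations = 6}.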
+  return getMemsetInlineFastPathSizeInfo(MaxOpByteSize);
+}
+
 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
                                      const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
Index: lib/Transforms/Scalar/CMakeLists.txt
===================================================================
--- lib/Transforms/Scalar/CMakeLists.txt
+++ lib/Transforms/Scalar/CMakeLists.txt
@@ -8,6 +8,7 @@
   DCE.cpp
   DeadStoreElimination.cpp
   EarlyCSE.cpp
+  FastPathLibCalls.cpp
   FlattenCFGPass.cpp
   Float2Int.cpp
   GuardWidening.cpp
Index: lib/Transforms/Scalar/FastPathLibCalls.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/Scalar/FastPathLibCalls.cpp
@@ -0,0 +1,492 @@
+//===-- FastPathLibCalls.cpp ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/FastPathLibCalls.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "fastpathlibcalls"
+
+STATISTIC(NumFastPaths, "Number of inserted fast paths");
+
+static cl::opt<int> ForceMaxOpByteSize(
+    "fast-path-force-max-op-byte-size", cl::init(0),
+    cl::desc("Forces a specific max operation byte size for library function "
+             "fast paths, overriding the target."),
+    cl::Hidden);
+
+static cl::opt<int> ForceMaxIterations(
+    "fast-path-force-max-iterations", cl::init(0),
+    cl::desc("Forces a specific max iteration count for library "
+             "function fast paths, overriding the target. Only has an effect "
+             "if the max operation byte size is also forced."),
+    cl::Hidden);
+
+namespace {
+class LibCallVisitor : private InstVisitor<LibCallVisitor, bool> {
+  using BaseT = InstVisitor<LibCallVisitor, bool>;
+  friend BaseT;
+
+  const DataLayout &DL;
+  AssumptionCache &AC;
+  DominatorTree &DT;
+  TargetLibraryInfo &TLI;
+  TargetTransformInfo &TTI;
+
+public:
+  LibCallVisitor(const DataLayout &DL, AssumptionCache &AC, DominatorTree &DT,
+                 TargetLibraryInfo &TLI, TargetTransformInfo &TTI)
+      : DL(DL), AC(AC), DT(DT), TLI(TLI), TTI(TTI) {}
+
+  /// Visit every instruction in the function and return whether any changes
+  /// were made.
+  ///
+  /// This hides the various entry points of the base class so that we can
+  /// implement our desired visit and return semantics.
+  bool visit(Function &F) {
+    bool Changed = false;
+
+    // Loop somewhat carefully over the instructions as we will be moving them
+    // when making changes.
+    // FIXME: This is the worst possible way to iterate over instructions, but
+    // it doesn't crash when the instruction list mutates.
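+    // Snapshotting the instruction list up front keeps this traversal stable:
+    // the visit callbacks below split blocks and move the intrinsic calls, but
+    // they never erase the instructions collected here, so the pointers stay
+    // valid and newly inserted instructions are not revisited.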
+ SmallVector Insts; + for (Instruction &I : instructions(F)) + Insts.push_back(&I); + for (Instruction *I : Insts) + Changed |= BaseT::visit(*I); + + return Changed; + } + +private: + // Base case implementation. + bool visitInstruction(Instruction &) { return false; } + + /// Checks whether a value is known non-zero at a particular location. + bool isKnownNonZero(Value *V, Instruction &I) { + // If value tracking knows enough, we're done. + if (llvm::isKnownNonZero(V, DL, /*Depth*/ 0, &AC, &I, &DT)) + return true; + + // Otherwise we implement a really lame version of PredicateInfo. + // FIXME: We should actually use PredicateInfo or some other more advanced + // mechanism to analyze predicates. + // + // The lame version simply walks up the dominator tree looking for branches + // on a test against zero where the non-zero edge dominates the location. + int Depth = 0; + for (DomTreeNode *N = DT.getNode(I.getParent()); N && Depth < 10; + N = N->getIDom()) { + auto *BB = N->getBlock(); + auto *BI = dyn_cast(BB->getTerminator()); + if (!BI || !BI->isConditional()) + continue; + + auto *Cmp = dyn_cast(BI->getCondition()); + if (!Cmp || Cmp->getOperand(0) != V || + Cmp->getOperand(1) != ConstantInt::get(V->getType(), 0)) + continue; + + BasicBlock *NonZeroBB; + switch (Cmp->getPredicate()) { + default: + llvm_unreachable("Invalid integer comparison predicate!"); + + case ICmpInst::ICMP_NE: + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_SLT: + // Predicates where a match precludes equality with zero. + NonZeroBB = BI->getSuccessor(0); + break; + + case ICmpInst::ICMP_EQ: + case ICmpInst::ICMP_UGE: + case ICmpInst::ICMP_ULE: + case ICmpInst::ICMP_SGE: + case ICmpInst::ICMP_SLE: + // Predicates where failing to match precludes equality with zero. + NonZeroBB = BI->getSuccessor(1); + break; + } + + // If the non-zero edge dominates the instruction given, we have + // a non-zero predicate. + if (DT.dominates({BB, NonZeroBB}, I.getParent())) + return true; + } + + return false; + } + + struct FastPathMemOpFramework { + BasicBlock *HeadBB; + BasicBlock *IfBB; + BasicBlock *ThenBB; + BasicBlock *ElseBB; + BasicBlock *TailBB; + + // The memory operation byte size to use. + int OpByteSize; + + // The `count`, scaled by the op byte size used in the loop, and available + // within the `if` block. + Value *Count; + + // The `then` basic block contains a loop, and we make the index of that + // loop available here for use when populating a particular fast path. + PHINode *Index; + }; + + template + Optional + buildFastPathMemOpFramework(MemIntrinsic &I, CallableT GetSizeInfo) { + Optional FastPath = None; + + // First we analyze the IR looking for a good fastpath. + + // Try to match a scaling operation so we can use a coarser fast path. + Value *Count = I.getLength(); + auto *CountTy = Count->getType(); + int ShiftScale = 0; + ConstantInt *ShiftScaleC; + using namespace PatternMatch; + if (match(I.getLength(), + m_NUWShl(m_Value(Count), m_ConstantInt(ShiftScaleC)))) { + // Don't bother with shifts wider than the number of bits in count. + if (ShiftScaleC->getValue().uge(CountTy->getIntegerBitWidth())) + return None; + assert(I.getLength()->getType() == Count->getType() && + "Cannot change type with a shift!"); + ShiftScale = (int)ShiftScaleC->getValue().getZExtValue(); + } + + // Compute the alignment, mapping zero to the actual resulting alignment. 
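+    // A zero alignment on the intrinsic means no alignment guarantee, so it is
+    // treated as one below. For example, a memset whose length is
+    // `shl nuw i64 %n, 2` and whose alignment is 4 has ShiftScale == 2, so the
+    // resulting MaxOpByteSize is min(1 << 2, 4) == 4.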
+ int Alignment = std::max(1, I.getAlignment()); + int MaxOpByteSize = std::min(1 << ShiftScale, Alignment); + + auto SizeInfo = GetSizeInfo(MaxOpByteSize); + + // For testing, we may have overrides for the TTI selected parameters. + if (ForceMaxOpByteSize.getNumOccurrences() > 0) { + SizeInfo.OpByteSize = std::min(MaxOpByteSize, ForceMaxOpByteSize); + SizeInfo.MaxIterations = ForceMaxIterations; + } + + // If we won't fast-path any iterations, bail. + if (SizeInfo.MaxIterations == 0) + return FastPath; + + assert(SizeInfo.OpByteSize <= Alignment && "Stores would be underaligned!"); + + // Otherwise build an actual fast path. + ++NumFastPaths; + FastPath = {}; + FastPath->OpByteSize = SizeInfo.OpByteSize; + FastPath->HeadBB = I.getParent(); + IRBuilder<> IRB(&I); + + // If necessary, check for zero and bypass everything. + if (!isKnownNonZero(Count, I)) { + auto *ZeroCond = cast( + IRB.CreateICmpNE(Count, ConstantInt::get(CountTy, 0), "zero_cond")); + TerminatorInst *IfTerm = SplitBlockAndInsertIfThen( + ZeroCond, &I, /*Unreachable*/ false, /*BranchWeights*/ nullptr, &DT); + FastPath->IfBB = IfTerm->getParent(); + FastPath->IfBB->setName(Twine(FastPath->HeadBB->getName()) + ".if"); + FastPath->TailBB = I.getParent(); + // Lift the operation into its basic block. + I.moveBefore(IfTerm); + } else { + FastPath->IfBB = FastPath->HeadBB; + FastPath->TailBB = SplitBlock(FastPath->HeadBB, + &*std::next(BasicBlock::iterator(I)), &DT); + } + FastPath->TailBB->setName(Twine(FastPath->HeadBB->getName()) + ".tail"); + IRB.SetInsertPoint(&I); + + // Adjust the count based on the op size we want for the loop. + auto AdjustCountAndShiftScaleForOpSize = + [&](Value *Count, Value *ByteSize, int ShiftScale, + int OpByteSize) -> std::pair { + assert(OpByteSize > 0 && isPowerOf2_32(OpByteSize) && + "Invalid operation byte size!"); + + // For one byte stores simply reset to the original byte size. + if (OpByteSize == 1) + return {ByteSize, 0}; + + // When the op shift scale matches, we don't need to adjust anything. + int OpShiftScale = Log2_32(OpByteSize); + if (ShiftScale == OpShiftScale) + return {Count, ShiftScale}; + + assert(ShiftScale > OpShiftScale && "Cannot have a wider op than shift!"); + + return {IRB.CreateShl( + Count, ShiftScale - OpShiftScale, "loop_count", + /*HasNUW*/ true, + /*HasNSW*/ cast(ByteSize)->hasNoSignedWrap()), + OpShiftScale}; + }; + std::tie(Count, ShiftScale) = AdjustCountAndShiftScaleForOpSize( + Count, I.getLength(), ShiftScale, SizeInfo.OpByteSize); + FastPath->Count = Count; + + // Now create the condition for using the fast path. + auto *Cond = cast(IRB.CreateICmpULE( + Count, ConstantInt::get(CountTy, SizeInfo.MaxIterations), + "count_cond")); + + // Split into an if-then-else FastPath based on the condition. + // FIXME: We should use profile information about the count (if available) + // to guide the metadata on this branch. + auto *ThenTerm = cast(SplitBlockAndInsertIfThen( + Cond, &I, /*Unreachable*/ false, /*BranchWeights*/ nullptr, &DT)); + + FastPath->ThenBB = ThenTerm->getParent(); + FastPath->ThenBB->setName(Twine(FastPath->HeadBB->getName()) + + ".fast_path_then"); + FastPath->ElseBB = I.getParent(); + FastPath->ElseBB->setName(Twine(FastPath->HeadBB->getName()) + + ".fast_path_else"); + + // Build the fast-path loop in the then block and save the index. 
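+    // The resulting control flow, sketched with the block names used in the
+    // tests, is:
+    //
+    //   head: zero check (if needed)                           -> if / tail
+    //   if:   count <= MaxIterations?                          -> then / else
+    //   then: single-block loop of OpByteSize-wide operations  -> tail
+    //   else: the original library call                        -> tail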
+    ThenTerm->eraseFromParent();
+    IRB.SetInsertPoint(FastPath->ThenBB);
+    FastPath->Index = IRB.CreatePHI(CountTy, /*NumReservedValues*/ 2, "index");
+    FastPath->Index->addIncoming(ConstantInt::get(CountTy, 0), FastPath->IfBB);
+    auto *NextIndex = IRB.CreateAdd(FastPath->Index,
+                                    ConstantInt::get(CountTy, 1), "next_index");
+    auto *LoopCond = IRB.CreateICmpEQ(NextIndex, Count, "loop_cond");
+    IRB.CreateCondBr(LoopCond, FastPath->TailBB, FastPath->ThenBB);
+    FastPath->Index->addIncoming(NextIndex, FastPath->ThenBB);
+
+    // If the tail's current IDom is the else, we need to update it now that
+    // the then block directly connects to it.
+    DomTreeNode *TailN = DT.getNode(FastPath->TailBB);
+    if (TailN->getIDom()->getBlock() == FastPath->ElseBB)
+      DT.changeImmediateDominator(TailN, DT.getNode(FastPath->HeadBB));
+
+    return FastPath;
+  }
+
+  bool visitMemCpyInst(MemCpyInst &I) {
+    if (I.isVolatile())
+      return false;
+    Value *ByteSize = I.getLength();
+    // Constant sizes don't need a fast path; we can generate an optimal
+    // lowering for them directly.
+    if (isa<ConstantInt>(ByteSize))
+      return false;
+
+    auto FastPath = buildFastPathMemOpFramework(I, [&](int MaxOpByteSize) {
+      return TTI.getMemcpyInlineFastPathSizeInfo(MaxOpByteSize);
+    });
+    if (!FastPath)
+      return false;
+
+    // Now build the inner part of the fast path for this routine.
+    IRBuilder<> IRB(FastPath->ThenBB->getFirstNonPHI());
+
+    // Cast the pointers to the desired type.
+    IntegerType *ValTy = IRB.getIntNTy(FastPath->OpByteSize * 8);
+    PointerType *PtrTy = ValTy->getPointerTo();
+    Value *Dst = PtrTy == I.getRawDest()->getType()
+                     ? I.getRawDest()
+                     : IRB.CreatePointerCast(I.getDest(), PtrTy, "dst.cast");
+    Value *Src = PtrTy == I.getRawSource()->getType()
+                     ? I.getRawSource()
+                     : IRB.CreatePointerCast(I.getSource(), PtrTy, "src.cast");
+
+    // Build the load and store for the copy loop in the then block.
+    Value *Indices[] = {FastPath->Index};
+    auto *IndexedDst =
+        IRB.CreateInBoundsGEP(ValTy, Dst, Indices, "indexed_dst");
+    auto *IndexedSrc =
+        IRB.CreateInBoundsGEP(ValTy, Src, Indices, "indexed_src");
+    IRB.CreateAlignedStore(IRB.CreateAlignedLoad(IndexedSrc, I.getAlignment()),
+                           IndexedDst, I.getAlignment());
+
+    // If the scaled byte size is only used by the library call, sink its
+    // computation into the slow-path block so the fast path never computes it.
+    if (FastPath->Count != ByteSize)
+      if (auto *ByteSizeI = dyn_cast<Instruction>(ByteSize))
+        if (ByteSizeI->hasOneUse())
+          ByteSizeI->moveBefore(&I);
+
+    // Return that we changed the function.
+    return true;
+  }
+
+  bool visitMemSetInst(MemSetInst &I) {
+    if (I.isVolatile())
+      return false;
+    Value *ByteSize = I.getLength();
+    // Constant sizes don't need a fast path; we can generate an optimal
+    // lowering for them directly.
+    if (isa<ConstantInt>(ByteSize))
+      return false;
+
+    Value *V = I.getValue();
+    assert(V->getType()->isIntegerTy(8) && "Non-i8 value in memset!");
+    auto *CV = dyn_cast<ConstantInt>(V);
+
+    auto FastPath = buildFastPathMemOpFramework(I, [&](int MaxOpByteSize) {
+      // If we don't have a constant value, forcibly cap the size to one so we
+      // don't need to scale it.
+      if (!CV)
+        MaxOpByteSize = 1;
+
+      return TTI.getMemsetInlineFastPathSizeInfo(MaxOpByteSize);
+    });
+    if (!FastPath)
+      return false;
+
+    // Now build the inner part of the fast path for this routine.
+    IRBuilder<> IRB(FastPath->ThenBB->getFirstNonPHI());
+
+    // Scale up our value if necessary.
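+    // For example, a memset of the constant byte 15 (0x0F) widened to a
+    // 4-byte operation stores the splat value 0x0F0F0F0F (252645135), which is
+    // what the accompanying tests check for.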
+ if (FastPath->OpByteSize > 1) { + assert(CV && "Cannot scale up non-constant value!"); + + IntegerType *ScaledValTy = IRB.getIntNTy(FastPath->OpByteSize * 8); + APInt RawV = CV->getValue(); + if (RawV.getBitWidth() > 8) + RawV = RawV.trunc(8); + V = ConstantInt::get(ScaledValTy, + APInt::getSplat(FastPath->OpByteSize * 8, RawV)); + } + + // Cast the pointer to the desired type. + PointerType *DstTy = V->getType()->getPointerTo(); + Value *Dst = DstTy == I.getRawDest()->getType() + ? I.getRawDest() + : IRB.CreatePointerCast(I.getDest(), DstTy, "dst.cast"); + + // Add the store to the loop in the then block. + Value *Indices[] = {FastPath->Index}; + auto *IndexedDst = + IRB.CreateInBoundsGEP(V->getType(), Dst, Indices, "indexed_dst"); + IRB.CreateAlignedStore(V, IndexedDst, I.getAlignment()); + + // Compute the byte size within that block to avoid computing it when + // possible. + if (FastPath->Count != ByteSize) + if (auto *ByteSizeI = dyn_cast(ByteSize)) + if (ByteSizeI->hasOneUse()) + ByteSizeI->moveBefore(&I); + + // Return that we changed the function. + return true; + } + + bool visitCallSite(CallSite CS) { + LibFunc F; + if (!TLI.getLibFunc(CS, F)) + return false; + + switch (F) { + default: + // No fast-path logic. + return false; + } + } +}; +} // namespace + +static bool injectLibCallFastPaths(Function &F, AssumptionCache &AC, + DominatorTree &DT, TargetLibraryInfo &TLI, + TargetTransformInfo &TTI) { + if (F.optForSize()) + return false; + + return LibCallVisitor(F.getParent()->getDataLayout(), AC, DT, TLI, TTI) + .visit(F); +} + +PreservedAnalyses FastPathLibCallsPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &AC = AM.getResult(F); + auto &DT = AM.getResult(F); + auto &TLI = AM.getResult(F); + auto &TTI = AM.getResult(F); + + if (!injectLibCallFastPaths(F, AC, DT, TLI, TTI)) + return PreservedAnalyses::all(); + + DT.verifyDomTree(); + + PreservedAnalyses PA; + PA.preserve(); + return PA; +} + +namespace { +struct FastPathLibCallsLegacyPass : public FunctionPass { + static char ID; + FastPathLibCallsLegacyPass() : FunctionPass(ID) { + initializeFastPathLibCallsLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + auto &AC = getAnalysis().getAssumptionCache(F); + auto &DT = getAnalysis().getDomTree(); + auto &TLI = getAnalysis().getTLI(); + auto &TTI = getAnalysis().getTTI(F); + + return injectLibCallFastPaths(F, AC, DT, TLI, TTI); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + + AU.addPreserved(); + } +}; +} // namespace + +char FastPathLibCallsLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(FastPathLibCallsLegacyPass, "fast-path-lib-calls", + "Fast Path Lib Calls", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(FastPathLibCallsLegacyPass, "fast-path-lib-calls", + "Fast Path Lib Calls", false, false) + +Pass *llvm::createFastPathLibCallsLegacyPass() { + return new FastPathLibCallsLegacyPass(); +} Index: test/Other/new-pm-defaults.ll =================================================================== --- test/Other/new-pm-defaults.ll +++ test/Other/new-pm-defaults.ll @@ -202,6 +202,7 @@ ; CHECK-O-NEXT: Running pass: 
RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass ; CHECK-O-NEXT: Running pass: AlignmentFromAssumptionsPass +; CHECK-O-NEXT: Running pass: FastPathLibCallsPass ; CHECK-O-NEXT: Running pass: LoopSinkPass ; CHECK-O-NEXT: Running pass: InstSimplifierPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass Index: test/Other/new-pm-thinlto-defaults.ll =================================================================== --- test/Other/new-pm-thinlto-defaults.ll +++ test/Other/new-pm-thinlto-defaults.ll @@ -189,6 +189,7 @@ ; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis ; CHECK-POSTLINK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass ; CHECK-POSTLINK-O-NEXT: Running pass: AlignmentFromAssumptionsPass +; CHECK-POSTLINK-O-NEXT: Running pass: FastPathLibCallsPass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopSinkPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstSimplifierPass ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass Index: test/Transforms/FastPathLibCalls/X86/lit.local.cfg =================================================================== --- /dev/null +++ test/Transforms/FastPathLibCalls/X86/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'X86' in config.root.targets: + config.unsupported = True + Index: test/Transforms/FastPathLibCalls/X86/memops.ll =================================================================== --- /dev/null +++ test/Transforms/FastPathLibCalls/X86/memops.ll @@ -0,0 +1,275 @@ +; RUN: opt -S < %s -mtriple=x86_64-unknown-linux-gnu -passes=fast-path-lib-calls | FileCheck %s + +define void @set1(i8* %ptr, i64 %size) { +; CHECK-LABEL: define void @set1( +entry: + call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) + ret void +; CHECK: entry: +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]] +; +; CHECK: [[IF]]: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 6 +; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i8, i8* %ptr, i64 %[[INDEX]] +; CHECK-NEXT: store i8 15, i8* %[[INDEXED_DST]], align 1 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: ret void +} + +define void @set2(i16* %ptr, i64 %size) { +; CHECK-LABEL: define void @set2( +entry: + %ptr.i8 = bitcast i16* %ptr to i8* + %size.scaled = shl nuw nsw i64 %size, 1 + call void @llvm.memset.p0i8.i64(i8* %ptr.i8, i8 15, i64 %size.scaled, i32 2, i1 false) + ret void +; CHECK: entry: +; CHECK-NEXT: %[[PTR_I8:.*]] = bitcast i16* %ptr to i8* +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]] +; +; CHECK: [[IF]]: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 6 +; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: 
%[[INDEXED_DST:.*]] = getelementptr inbounds i16, i16* %ptr, i64 %[[INDEX]] +; CHECK-NEXT: store i16 3855, i16* %[[INDEXED_DST]], align 2 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: %[[SIZE_SCALED:.*]] = shl nuw nsw i64 %size, 1 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %[[PTR_I8]], i8 15, i64 %[[SIZE_SCALED]], i32 2, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: ret void +} + +define void @set4(i32* %ptr, i64 %size) { +; CHECK-LABEL: define void @set4( +entry: + %ptr.i8 = bitcast i32* %ptr to i8* + %size.scaled = shl nuw nsw i64 %size, 2 + call void @llvm.memset.p0i8.i64(i8* %ptr.i8, i8 15, i64 %size.scaled, i32 4, i1 false) + ret void +; CHECK: entry: +; CHECK-NEXT: %[[PTR_I8:.*]] = bitcast i32* %ptr to i8* +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]] +; +; CHECK: [[IF]]: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 4 +; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i32, i32* %ptr, i64 %[[INDEX]] +; CHECK-NEXT: store i32 252645135, i32* %[[INDEXED_DST]], align 4 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: %[[SIZE_SCALED:.*]] = shl nuw nsw i64 %size, 2 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %[[PTR_I8]], i8 15, i64 %[[SIZE_SCALED]], i32 4, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: ret void +} + +define void @set8(i64* %ptr, i64 %size) { +; CHECK-LABEL: define void @set8( +entry: + %ptr.i8 = bitcast i64* %ptr to i8* + %size.scaled = shl nuw nsw i64 %size, 3 + call void @llvm.memset.p0i8.i64(i8* %ptr.i8, i8 15, i64 %size.scaled, i32 8, i1 false) + ret void +; CHECK: entry: +; CHECK-NEXT: %[[PTR_I8:.*]] = bitcast i64* %ptr to i8* +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]] +; +; CHECK: [[IF]]: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 2 +; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i64, i64* %ptr, i64 %[[INDEX]] +; CHECK-NEXT: store i64 1085102592571150095, i64* %[[INDEXED_DST]], align 8 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: %[[SIZE_SCALED:.*]] = shl nuw nsw i64 %size, 3 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %[[PTR_I8]], i8 15, i64 %[[SIZE_SCALED]], i32 8, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: ret void +} + +define void @copy1(i8* noalias %dst, i8* noalias %src, i64 %size) { +; CHECK-LABEL: define void @copy1( +entry: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 
%size, i32 1, i1 false) + ret void +; CHECK: entry: +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]] +; +; CHECK: [[IF]]: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 6 +; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i8, i8* %dst, i64 %[[INDEX]] +; CHECK-NEXT: %[[INDEXED_SRC:.*]] = getelementptr inbounds i8, i8* %src, i64 %[[INDEX]] +; CHECK-NEXT: %[[LOAD:.*]] = load i8, i8* %[[INDEXED_SRC]], align 1 +; CHECK-NEXT: store i8 %[[LOAD]], i8* %[[INDEXED_DST]], align 1 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %size, i32 1, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: ret void +} + +define void @copy2(i16* noalias %dst, i16* noalias %src, i64 %size) { +; CHECK-LABEL: define void @copy2( +entry: + %dst.i8 = bitcast i16* %dst to i8* + %src.i8 = bitcast i16* %src to i8* + %size.scaled = shl nuw nsw i64 %size, 1 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst.i8, i8* %src.i8, i64 %size.scaled, i32 2, i1 false) + ret void +; CHECK: entry: +; CHECK-NEXT: %[[DST_I8:.*]] = bitcast i16* %dst to i8* +; CHECK-NEXT: %[[SRC_I8:.*]] = bitcast i16* %src to i8* +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]] +; +; CHECK: [[IF]]: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 6 +; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i16, i16* %dst, i64 %[[INDEX]] +; CHECK-NEXT: %[[INDEXED_SRC:.*]] = getelementptr inbounds i16, i16* %src, i64 %[[INDEX]] +; CHECK-NEXT: %[[LOAD:.*]] = load i16, i16* %[[INDEXED_SRC]], align 2 +; CHECK-NEXT: store i16 %[[LOAD]], i16* %[[INDEXED_DST]], align 2 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: %[[SIZE_SCALED:.*]] = shl nuw nsw i64 %size, 1 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[DST_I8]], i8* %[[SRC_I8]], i64 %[[SIZE_SCALED]], i32 2, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: ret void +} + +define void @copy4(i32* noalias %dst, i32* noalias %src, i64 %size) { +; CHECK-LABEL: define void @copy4( +entry: + %dst.i8 = bitcast i32* %dst to i8* + %src.i8 = bitcast i32* %src to i8* + %size.scaled = shl nuw nsw i64 %size, 2 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst.i8, i8* %src.i8, i64 %size.scaled, i32 4, i1 false) + ret void +; CHECK: entry: +; CHECK-NEXT: %[[DST_I8:.*]] = bitcast i32* %dst to i8* +; CHECK-NEXT: %[[SRC_I8:.*]] = bitcast i32* %src to i8* +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]] +; +; CHECK: [[IF]]: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 4 +; CHECK-NEXT: br 
i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i32, i32* %dst, i64 %[[INDEX]] +; CHECK-NEXT: %[[INDEXED_SRC:.*]] = getelementptr inbounds i32, i32* %src, i64 %[[INDEX]] +; CHECK-NEXT: %[[LOAD:.*]] = load i32, i32* %[[INDEXED_SRC]], align 4 +; CHECK-NEXT: store i32 %[[LOAD]], i32* %[[INDEXED_DST]], align 4 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: %[[SIZE_SCALED:.*]] = shl nuw nsw i64 %size, 2 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[DST_I8]], i8* %[[SRC_I8]], i64 %[[SIZE_SCALED]], i32 4, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: ret void +} + +define void @copy8(i64* noalias %dst, i64* noalias %src, i64 %size) { +; CHECK-LABEL: define void @copy8( +entry: + %dst.i8 = bitcast i64* %dst to i8* + %src.i8 = bitcast i64* %src to i8* + %size.scaled = shl nuw nsw i64 %size, 3 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst.i8, i8* %src.i8, i64 %size.scaled, i32 8, i1 false) + ret void +; CHECK: entry: +; CHECK-NEXT: %[[DST_I8:.*]] = bitcast i64* %dst to i8* +; CHECK-NEXT: %[[SRC_I8:.*]] = bitcast i64* %src to i8* +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]] +; +; CHECK: [[IF]]: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 2 +; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i64, i64* %dst, i64 %[[INDEX]] +; CHECK-NEXT: %[[INDEXED_SRC:.*]] = getelementptr inbounds i64, i64* %src, i64 %[[INDEX]] +; CHECK-NEXT: %[[LOAD:.*]] = load i64, i64* %[[INDEXED_SRC]], align 8 +; CHECK-NEXT: store i64 %[[LOAD]], i64* %[[INDEXED_DST]], align 8 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: %[[SIZE_SCALED:.*]] = shl nuw nsw i64 %size, 3 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[DST_I8]], i8* %[[SRC_I8]], i64 %[[SIZE_SCALED]], i32 8, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* writeonly, i8*, i64, i32, i1) + +declare void @llvm.memset.p0i8.i64(i8* writeonly, i8, i64, i32, i1) Index: test/Transforms/FastPathLibCalls/basic.ll =================================================================== --- /dev/null +++ test/Transforms/FastPathLibCalls/basic.ll @@ -0,0 +1,54 @@ +; RUN: opt -S < %s -passes='fast-path-lib-calls,verify' -fast-path-force-max-op-byte-size=4 -fast-path-force-max-iterations=3 | FileCheck %s + +define void @baseline(i8* %ptr, i64 %size) { +; CHECK-LABEL: define void @baseline( +entry: + call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) + ret void +; CHECK: entry: +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]] +; +; CHECK: [[IF]]: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 
%size, 3 +; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i8, i8* %ptr, i64 %[[INDEX]] +; CHECK-NEXT: store i8 15, i8* %[[INDEXED_DST]], align 1 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: ret void +} + +; Negative tests where we shouldn't do anything. + +define void @optsize(i8* %ptr, i64 %size) optsize { +; CHECK-LABEL: define void @optsize( +entry: + call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) + ret void +; CHECK: entry: +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) +; CHECK-NEXT: ret void +} + +define void @minsize(i8* %ptr, i64 %size) minsize { +; CHECK-LABEL: define void @minsize( +entry: + call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) + ret void +; CHECK: entry: +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) +; CHECK-NEXT: ret void +} + +declare void @llvm.memset.p0i8.i64(i8* writeonly, i8, i64, i32, i1) Index: test/Transforms/FastPathLibCalls/memops.ll =================================================================== --- /dev/null +++ test/Transforms/FastPathLibCalls/memops.ll @@ -0,0 +1,535 @@ +; RUN: opt -S < %s -passes='fast-path-lib-calls,verify' -fast-path-force-max-op-byte-size=4 -fast-path-force-max-iterations=3 | FileCheck %s + +define void @set1(i8* %ptr, i64 %size) { +; CHECK-LABEL: define void @set1( +entry: + call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) + ret void +; CHECK: entry: +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]] +; +; CHECK: [[IF]]: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 3 +; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i8, i8* %ptr, i64 %[[INDEX]] +; CHECK-NEXT: store i8 15, i8* %[[INDEXED_DST]], align 1 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: ret void +} + +define void @set1_nonzero1(i8* %ptr, i64 %size) { +; CHECK-LABEL: define void @set1_nonzero1( +entry: + %zero_cond = icmp eq i64 %size, 0 + br i1 %zero_cond, label %exit, label %call +; CHECK: entry: +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp eq i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %exit, label %call + +call: + call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) + ret void +; CHECK: call: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 3 +; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label 
%[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %call ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i8, i8* %ptr, i64 %[[INDEX]] +; CHECK-NEXT: store i8 15, i8* %[[INDEXED_DST]], align 1 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: ret void + +exit: + ret void +} + +define void @set1_nonzero2(i8* %ptr, i64 %size) { +; CHECK-LABEL: define void @set1_nonzero2( +entry: + %zero_cond = icmp sgt i64 %size, 0 + br i1 %zero_cond, label %call, label %exit +; CHECK: entry: +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp sgt i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %call, label %exit + +call: + call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) + ret void +; CHECK: call: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 3 +; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %call ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i8, i8* %ptr, i64 %[[INDEX]] +; CHECK-NEXT: store i8 15, i8* %[[INDEXED_DST]], align 1 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: ret void + +exit: + ret void +} + +; Somewhat silly degenerate case -- the 'call' block is dead technically, but, +; indeed, size will not be zero in that block. 
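+; Here `icmp uge i64 %size, 0` is always true, so the %call block is never
+; reached; the point of the test is that the pass's dominating-branch walk
+; still treats %size as non-zero along that edge and omits the zero check.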
+define void @set1_nonzero3(i8* %ptr, i64 %size) { +; CHECK-LABEL: define void @set1_nonzero3( +entry: + %zero_cond = icmp uge i64 %size, 0 + br i1 %zero_cond, label %exit, label %call +; CHECK: entry: +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp uge i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %exit, label %call + +call: + call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) + ret void +; CHECK: call: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 3 +; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %call ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i8, i8* %ptr, i64 %[[INDEX]] +; CHECK-NEXT: store i8 15, i8* %[[INDEXED_DST]], align 1 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: ret void + +exit: + ret void +} + +define void @set1_nonzero4(i8* %ptr, i64 %size, i1* %flag.ptr) { +; CHECK-LABEL: define void @set1_nonzero4( +entry: + %zero_cond = icmp ne i64 %size, 0 + br i1 %zero_cond, label %loop.ph, label %exit +; CHECK: entry: +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %loop.ph, label %exit + +loop.ph: + br label %loop.header + +loop.header: + %flag = load i1, i1* %flag.ptr + br i1 %flag, label %call, label %loop.exit + +call: + call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) + br label %loop.header +; CHECK: call: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 3 +; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %call ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i8, i8* %ptr, i64 %[[INDEX]] +; CHECK-NEXT: store i8 15, i8* %[[INDEXED_DST]], align 1 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: br label %loop.header + +loop.exit: + ret void + +exit: + ret void +} + +define void @set2(i16* %ptr, i64 %size) { +; CHECK-LABEL: define void @set2( +entry: + %ptr.i8 = bitcast i16* %ptr to i8* + %size.scaled = shl nuw nsw i64 %size, 1 + call void @llvm.memset.p0i8.i64(i8* %ptr.i8, i8 15, i64 %size.scaled, i32 2, i1 false) + ret void +; CHECK: entry: +; CHECK-NEXT: %[[PTR_I8:.*]] = bitcast i16* %ptr to i8* +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]] +; +; CHECK: [[IF]]: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 3 +; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i16, i16* %ptr, i64 %[[INDEX]] +; CHECK-NEXT: store i16 3855, 
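+; With alignment 0 the pass can assume nothing better than byte alignment, so
+; it falls back to a byte-wide loop: the scaled byte count becomes the loop
+; count and plain i8 stores are emitted.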
i16* %[[INDEXED_DST]], align 2 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: %[[SIZE_SCALED:.*]] = shl nuw nsw i64 %size, 1 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %[[PTR_I8]], i8 15, i64 %[[SIZE_SCALED]], i32 2, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: ret void +} + +define void @set2_align1(i16* %ptr, i64 %size) { +; CHECK-LABEL: define void @set2_align1( +entry: + %ptr.i8 = bitcast i16* %ptr to i8* + %size.scaled = shl nuw nsw i64 %size, 1 + call void @llvm.memset.p0i8.i64(i8* %ptr.i8, i8 15, i64 %size.scaled, i32 1, i1 false) + ret void +; CHECK: entry: +; CHECK-NEXT: %[[PTR_I8:.*]] = bitcast i16* %ptr to i8* +; CHECK-NEXT: %[[COUNT:.*]] = shl nuw nsw i64 %size, 1 +; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0 +; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]] +; +; CHECK: [[IF]]: +; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %[[COUNT]], 3 +; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; +; CHECK: [[THEN]]: +; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ] +; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i8, i8* %[[PTR_I8]], i64 %[[INDEX]] +; CHECK-NEXT: store i8 15, i8* %[[INDEXED_DST]], align 1 +; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1 +; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %[[COUNT]] +; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]] +; +; CHECK: [[ELSE]]: +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %[[PTR_I8]], i8 15, i64 %[[COUNT]], i32 1, i1 false) +; CHECK-NEXT: br label %[[TAIL]] +; +; CHECK: [[TAIL]]: +; CHECK-NEXT: ret void +} + +; The memory intrinsics are weird and allow a 'zero' alignment that we need to +; handle correctly. 
+define void @set2_align0(i16* %ptr, i64 %size) {
+; CHECK-LABEL: define void @set2_align0(
+entry:
+  %ptr.i8 = bitcast i16* %ptr to i8*
+  %size.scaled = shl nuw nsw i64 %size, 1
+  call void @llvm.memset.p0i8.i64(i8* %ptr.i8, i8 15, i64 %size.scaled, i32 0, i1 false)
+  ret void
+; CHECK: entry:
+; CHECK-NEXT: %[[PTR_I8:.*]] = bitcast i16* %ptr to i8*
+; CHECK-NEXT: %[[COUNT:.*]] = shl nuw nsw i64 %size, 1
+; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0
+; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]]
+;
+; CHECK: [[IF]]:
+; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %[[COUNT]], 3
+; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+;
+; CHECK: [[THEN]]:
+; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ]
+; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i8, i8* %[[PTR_I8]], i64 %[[INDEX]]
+; CHECK-NEXT: store i8 15, i8* %[[INDEXED_DST]]
+; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1
+; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %[[COUNT]]
+; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]]
+;
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %[[PTR_I8]], i8 15, i64 %[[COUNT]], i32 0, i1 false)
+; CHECK-NEXT: br label %[[TAIL]]
+;
+; CHECK: [[TAIL]]:
+; CHECK-NEXT: ret void
+}
+
+define void @set4(i32* %ptr, i64 %size) {
+; CHECK-LABEL: define void @set4(
+entry:
+  %ptr.i8 = bitcast i32* %ptr to i8*
+  %size.scaled = shl nuw nsw i64 %size, 2
+  call void @llvm.memset.p0i8.i64(i8* %ptr.i8, i8 15, i64 %size.scaled, i32 4, i1 false)
+  ret void
+; CHECK: entry:
+; CHECK-NEXT: %[[PTR_I8:.*]] = bitcast i32* %ptr to i8*
+; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0
+; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]]
+;
+; CHECK: [[IF]]:
+; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 3
+; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+;
+; CHECK: [[THEN]]:
+; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ]
+; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i32, i32* %ptr, i64 %[[INDEX]]
+; CHECK-NEXT: store i32 252645135, i32* %[[INDEXED_DST]], align 4
+; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1
+; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size
+; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]]
+;
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: %[[SIZE_SCALED:.*]] = shl nuw nsw i64 %size, 2
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %[[PTR_I8]], i8 15, i64 %[[SIZE_SCALED]], i32 4, i1 false)
+; CHECK-NEXT: br label %[[TAIL]]
+;
+; CHECK: [[TAIL]]:
+; CHECK-NEXT: ret void
+}
+
+; Even with 8-byte alignment the fast path is emitted with i32 stores, so the
+; loop trip count is twice the element count.
+define void @set8(i64* %ptr, i64 %size) {
+; CHECK-LABEL: define void @set8(
+entry:
+  %ptr.i8 = bitcast i64* %ptr to i8*
+  %size.scaled = shl nuw nsw i64 %size, 3
+  call void @llvm.memset.p0i8.i64(i8* %ptr.i8, i8 15, i64 %size.scaled, i32 8, i1 false)
+  ret void
+; CHECK: entry:
+; CHECK-NEXT: %[[PTR_I8:.*]] = bitcast i64* %ptr to i8*
+; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0
+; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]]
+;
+; CHECK: [[IF]]:
+; CHECK-NEXT: %[[COUNT:.*]] = shl nuw nsw i64 %size, 1
+; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %[[COUNT]], 3
+; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+;
+; CHECK: [[THEN]]:
+; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ]
+; CHECK-NEXT: %[[PTR_I32:.*]] = bitcast i64* %ptr to i32*
+; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i32, i32* %[[PTR_I32]], i64 %[[INDEX]]
+; CHECK-NEXT: store i32 252645135, i32* %[[INDEXED_DST]], align 8
+; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1
+; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %[[COUNT]]
+; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]]
+;
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: %[[SIZE_SCALED:.*]] = shl nuw nsw i64 %size, 3
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %[[PTR_I8]], i8 15, i64 %[[SIZE_SCALED]], i32 8, i1 false)
+; CHECK-NEXT: br label %[[TAIL]]
+;
+; CHECK: [[TAIL]]:
+; CHECK-NEXT: ret void
+}
+
+; When the scaled byte size has another use it is kept in the entry block and
+; reused by both the fallback call and the return.
+define i64 @set8_reuse_scaled_size(i64* %ptr, i64 %size) {
+; CHECK-LABEL: define i64 @set8_reuse_scaled_size(
+entry:
+  %ptr.i8 = bitcast i64* %ptr to i8*
+  %size.scaled = shl nuw nsw i64 %size, 3
+  call void @llvm.memset.p0i8.i64(i8* %ptr.i8, i8 15, i64 %size.scaled, i32 8, i1 false)
+  ret i64 %size.scaled
+; CHECK: entry:
+; CHECK-NEXT: %[[PTR_I8:.*]] = bitcast i64* %ptr to i8*
+; CHECK-NEXT: %[[SIZE_SCALED:.*]] = shl nuw nsw i64 %size, 3
+; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0
+; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]]
+;
+; CHECK: [[IF]]:
+; CHECK-NEXT: %[[COUNT:.*]] = shl nuw nsw i64 %size, 1
+; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %[[COUNT]], 3
+; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+;
+; CHECK: [[THEN]]:
+; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ]
+; CHECK-NEXT: %[[PTR_I32:.*]] = bitcast i64* %ptr to i32*
+; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i32, i32* %[[PTR_I32]], i64 %[[INDEX]]
+; CHECK-NEXT: store i32 252645135, i32* %[[INDEXED_DST]], align 8
+; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1
+; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %[[COUNT]]
+; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]]
+;
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %[[PTR_I8]], i8 15, i64 %[[SIZE_SCALED]], i32 8, i1 false)
+; CHECK-NEXT: br label %[[TAIL]]
+;
+; CHECK: [[TAIL]]:
+; CHECK-NEXT: ret i64 %[[SIZE_SCALED]]
+}
+
+; The memcpy fast path has the same structure, with a load/store pair in the
+; loop body.
+define void @copy1(i8* noalias %dst, i8* noalias %src, i64 %size) {
+; CHECK-LABEL: define void @copy1(
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %size, i32 1, i1 false)
+  ret void
+; CHECK: entry:
+; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0
+; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]]
+;
+; CHECK: [[IF]]:
+; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 3
+; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+;
+; CHECK: [[THEN]]:
+; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ]
+; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i8, i8* %dst, i64 %[[INDEX]]
+; CHECK-NEXT: %[[INDEXED_SRC:.*]] = getelementptr inbounds i8, i8* %src, i64 %[[INDEX]]
+; CHECK-NEXT: %[[LOAD:.*]] = load i8, i8* %[[INDEXED_SRC]], align 1
+; CHECK-NEXT: store i8 %[[LOAD]], i8* %[[INDEXED_DST]], align 1
+; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1
+; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size
+; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]]
+;
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %size, i32 1, i1 false)
+; CHECK-NEXT: br label %[[TAIL]]
+;
+; CHECK: [[TAIL]]:
+; CHECK-NEXT: ret void
+}
+
+define void @copy2(i16* noalias %dst, i16* noalias %src, i64 %size) {
+; CHECK-LABEL: define void @copy2(
+entry:
+  %dst.i8 = bitcast i16* %dst to i8*
+  %src.i8 = bitcast i16* %src to i8*
+  %size.scaled = shl nuw nsw i64 %size, 1
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst.i8, i8* %src.i8, i64 %size.scaled, i32 2, i1 false)
+  ret void
+; CHECK: entry:
+; CHECK-NEXT: %[[DST_I8:.*]] = bitcast i16* %dst to i8*
+; CHECK-NEXT: %[[SRC_I8:.*]] = bitcast i16* %src to i8*
+; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0
+; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]]
+;
+; CHECK: [[IF]]:
+; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 3
+; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+;
+; CHECK: [[THEN]]:
+; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ]
+; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i16, i16* %dst, i64 %[[INDEX]]
+; CHECK-NEXT: %[[INDEXED_SRC:.*]] = getelementptr inbounds i16, i16* %src, i64 %[[INDEX]]
+; CHECK-NEXT: %[[LOAD:.*]] = load i16, i16* %[[INDEXED_SRC]], align 2
+; CHECK-NEXT: store i16 %[[LOAD]], i16* %[[INDEXED_DST]], align 2
+; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1
+; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size
+; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]]
+;
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: %[[SIZE_SCALED:.*]] = shl nuw nsw i64 %size, 1
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[DST_I8]], i8* %[[SRC_I8]], i64 %[[SIZE_SCALED]], i32 2, i1 false)
+; CHECK-NEXT: br label %[[TAIL]]
+;
+; CHECK: [[TAIL]]:
+; CHECK-NEXT: ret void
+}
+
+define void @copy4(i32* noalias %dst, i32* noalias %src, i64 %size) {
+; CHECK-LABEL: define void @copy4(
+entry:
+  %dst.i8 = bitcast i32* %dst to i8*
+  %src.i8 = bitcast i32* %src to i8*
+  %size.scaled = shl nuw nsw i64 %size, 2
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst.i8, i8* %src.i8, i64 %size.scaled, i32 4, i1 false)
+  ret void
+; CHECK: entry:
+; CHECK-NEXT: %[[DST_I8:.*]] = bitcast i32* %dst to i8*
+; CHECK-NEXT: %[[SRC_I8:.*]] = bitcast i32* %src to i8*
+; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0
+; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]]
+;
+; CHECK: [[IF]]:
+; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %size, 3
+; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+;
+; CHECK: [[THEN]]:
+; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ]
+; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i32, i32* %dst, i64 %[[INDEX]]
+; CHECK-NEXT: %[[INDEXED_SRC:.*]] = getelementptr inbounds i32, i32* %src, i64 %[[INDEX]]
+; CHECK-NEXT: %[[LOAD:.*]] = load i32, i32* %[[INDEXED_SRC]], align 4
+; CHECK-NEXT: store i32 %[[LOAD]], i32* %[[INDEXED_DST]], align 4
+; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1
+; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %size
+; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]]
+;
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: %[[SIZE_SCALED:.*]] = shl nuw nsw i64 %size, 2
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[DST_I8]], i8* %[[SRC_I8]], i64 %[[SIZE_SCALED]], i32 4, i1 false)
+; CHECK-NEXT: br label %[[TAIL]]
+;
+; CHECK: [[TAIL]]:
+; CHECK-NEXT: ret void
+}
+
+define void @copy8(i64* noalias %dst, i64* noalias %src, i64 %size) {
+; CHECK-LABEL: define void @copy8(
+entry:
+  %dst.i8 = bitcast i64* %dst to i8*
+  %src.i8 = bitcast i64* %src to i8*
+  %size.scaled = shl nuw nsw i64 %size, 3
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst.i8, i8* %src.i8, i64 %size.scaled, i32 8, i1 false)
+  ret void
+; CHECK: entry:
+; CHECK-NEXT: %[[DST_I8:.*]] = bitcast i64* %dst to i8*
+; CHECK-NEXT: %[[SRC_I8:.*]] = bitcast i64* %src to i8*
+; CHECK-NEXT: %[[ZERO_COND:.*]] = icmp ne i64 %size, 0
+; CHECK-NEXT: br i1 %[[ZERO_COND]], label %[[IF:.*]], label %[[TAIL:.*]]
+;
+; CHECK: [[IF]]:
+; CHECK-NEXT: %[[COUNT:.*]] = shl nuw nsw i64 %size, 1
+; CHECK-NEXT: %[[COUNT_COND:.*]] = icmp ule i64 %[[COUNT]], 3
+; CHECK-NEXT: br i1 %[[COUNT_COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+;
+; CHECK: [[THEN]]:
+; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %[[IF]] ], [ %[[NEXT_INDEX:.*]], %[[THEN]] ]
+; CHECK-NEXT: %[[DST_I32:.*]] = bitcast i64* %dst to i32*
+; CHECK-NEXT: %[[SRC_I32:.*]] = bitcast i64* %src to i32*
+; CHECK-NEXT: %[[INDEXED_DST:.*]] = getelementptr inbounds i32, i32* %[[DST_I32]], i64 %[[INDEX]]
+; CHECK-NEXT: %[[INDEXED_SRC:.*]] = getelementptr inbounds i32, i32* %[[SRC_I32]], i64 %[[INDEX]]
+; CHECK-NEXT: %[[LOAD:.*]] = load i32, i32* %[[INDEXED_SRC]], align 8
+; CHECK-NEXT: store i32 %[[LOAD]], i32* %[[INDEXED_DST]], align 8
+; CHECK-NEXT: %[[NEXT_INDEX]] = add i64 %[[INDEX]], 1
+; CHECK-NEXT: %[[LOOP_COND:.*]] = icmp eq i64 %[[NEXT_INDEX]], %[[COUNT]]
+; CHECK-NEXT: br i1 %[[LOOP_COND]], label %[[TAIL:.*]], label %[[THEN]]
+;
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: %[[SIZE_SCALED:.*]] = shl nuw nsw i64 %size, 3
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[DST_I8]], i8* %[[SRC_I8]], i64 %[[SIZE_SCALED]], i32 8, i1 false)
+; CHECK-NEXT: br label %[[TAIL]]
+;
+; CHECK: [[TAIL]]:
+; CHECK-NEXT: ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* writeonly, i8*, i64, i32, i1)
+
+declare void @llvm.memset.p0i8.i64(i8* writeonly, i8, i64, i32, i1)