Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
@@ -753,6 +753,28 @@
   Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                            Type *ExpectedType) const;
 
+  /// \returns The type to use in a loop expansion of a memcpy call.
+  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
+                                  unsigned SrcAlign, unsigned DestAlign) const;
+
+  /// \param[out] OpsOut The operand types to copy RemainingBytes of memory.
+  /// \param RemainingBytes The number of bytes to copy.
+  ///
+  /// Calculates the operand types to use when copying \p RemainingBytes of
+  /// memory, where source and destination alignments are \p SrcAlign and
+  /// \p DestAlign respectively.
+  void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
+                                         LLVMContext &Context,
+                                         unsigned RemainingBytes,
+                                         unsigned SrcAlign,
+                                         unsigned DestAlign) const;
+
+  /// \returns True if we want to test the new memcpy lowering functionality in
+  /// Transforms/Utils.
+  /// Temporary. Will be removed once we move to the new functionality and
+  /// remove the old.
+  bool useWideIRMemcpyLoopLowering() const;
+
   /// \returns True if the two functions have compatible attributes for inlining
   /// purposes.
   bool areInlineCompatible(const Function *Caller,
@@ -953,6 +975,12 @@
   virtual unsigned getAtomicMemIntrinsicMaxElementSize() const = 0;
   virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                    Type *ExpectedType) = 0;
+  virtual Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
+                                          unsigned SrcAlign,
+                                          unsigned DestAlign) const = 0;
+  virtual void getMemcpyLoopResidualLoweringType(
+      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
+      unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const = 0;
   virtual bool areInlineCompatible(const Function *Caller,
                                    const Function *Callee) const = 0;
   virtual unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const = 0;
@@ -1266,6 +1294,19 @@
                                                   Type *ExpectedType) override {
     return Impl.getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
   }
+  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
+                                  unsigned SrcAlign,
+                                  unsigned DestAlign) const override {
+    return Impl.getMemcpyLoopLoweringType(Context, Length, SrcAlign, DestAlign);
+  }
+  void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
+                                         LLVMContext &Context,
+                                         unsigned RemainingBytes,
+                                         unsigned SrcAlign,
+                                         unsigned DestAlign) const override {
+    Impl.getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes,
+                                           SrcAlign, DestAlign);
+  }
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const override {
     return Impl.areInlineCompatible(Caller, Callee);
Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -444,6 +444,20 @@
     return nullptr;
   }
 
+  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
+                                  unsigned SrcAlign, unsigned DestAlign) const {
+    return Type::getInt8Ty(Context);
+  }
+
+  void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
+                                         LLVMContext &Context,
+                                         unsigned RemainingBytes,
+                                         unsigned SrcAlign,
+                                         unsigned DestAlign) const {
+    for (unsigned i = 0; i != RemainingBytes; ++i)
+      OpsOut.push_back(Type::getInt8Ty(Context));
+  }
+
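The default implementation above keeps the old byte-wise behaviour: the loop operand type is i8 and the residual is one i8 per remaining byte. As a sketch of how a target could opt into wider accesses (the function names and concrete type choices below are illustrative assumptions, not part of this patch):

// Sketch only: what a target's override of the two TTI hooks could return.
// (Written as free functions to keep the example self-contained; a real
// target would put these bodies in its TargetTransformInfo implementation.)
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DerivedTypes.h"
#include <algorithm>
using namespace llvm;

static Type *exampleMemcpyLoopLoweringType(LLVMContext &Context,
                                           unsigned SrcAlign,
                                           unsigned DestAlign) {
  // Use a 128-bit vector when both sides are at least 4-byte aligned,
  // otherwise keep the byte-wise default.
  if (std::min(SrcAlign, DestAlign) >= 4)
    return VectorType::get(Type::getInt32Ty(Context), 4);
  return Type::getInt8Ty(Context);
}

static void exampleMemcpyLoopResidualLoweringType(
    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
    unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) {
  // Cover the tail with the widest integer loads/stores the alignment allows,
  // then fall back to single bytes. The emitted types must add up to exactly
  // RemainingBytes, because the expansion asserts that every byte was copied.
  unsigned MinAlign = std::min(SrcAlign, DestAlign);
  for (unsigned Size : {8u, 4u, 2u}) {
    while (RemainingBytes >= Size && MinAlign >= Size) {
      OpsOut.push_back(Type::getIntNTy(Context, Size * 8));
      RemainingBytes -= Size;
    }
  }
  for (unsigned I = 0; I != RemainingBytes; ++I)
    OpsOut.push_back(Type::getInt8Ty(Context));
}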
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const {
     return (Caller->getFnAttribute("target-cpu") ==
Index: llvm/trunk/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ llvm/trunk/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -17,21 +17,39 @@
 
 namespace llvm {
 
+class ConstantInt;
 class Instruction;
 class MemCpyInst;
 class MemMoveInst;
 class MemSetInst;
+class TargetTransformInfo;
 class Value;
 
 /// Emit a loop implementing the semantics of llvm.memcpy with the equivalent
 /// arguments at \p InsertBefore.
-void createMemCpyLoop(Instruction *InsertBefore,
-                      Value *SrcAddr, Value *DstAddr, Value *CopyLen,
-                      unsigned SrcAlign, unsigned DestAlign,
+void createMemCpyLoop(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr,
+                      Value *CopyLen, unsigned SrcAlign, unsigned DestAlign,
                       bool SrcIsVolatile, bool DstIsVolatile);
 
+/// Emit a loop implementing the semantics of llvm.memcpy where the size is not
+/// a compile-time constant. The loop will be inserted at \p InsertBefore.
+void createMemCpyLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr,
+                                 Value *DstAddr, Value *CopyLen,
+                                 unsigned SrcAlign, unsigned DestAlign,
+                                 bool SrcIsVolatile, bool DstIsVolatile,
+                                 const TargetTransformInfo &TTI);
+
+/// Emit a loop implementing the semantics of an llvm.memcpy whose size is a
+/// compile-time constant. The loop is inserted at \p InsertBefore.
+void createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
+                               Value *DstAddr, ConstantInt *CopyLen,
+                               unsigned SrcAlign, unsigned DestAlign,
+                               bool SrcIsVolatile, bool DstIsVolatile,
+                               const TargetTransformInfo &TTI);
+
 /// Expand \p MemCpy as a loop. \p MemCpy is not deleted.
-void expandMemCpyAsLoop(MemCpyInst *MemCpy);
+void expandMemCpyAsLoop(MemCpyInst *MemCpy, const TargetTransformInfo &TTI);
 
 /// Expand \p MemMove as a loop. \p MemMove is not deleted.
 void expandMemMoveAsLoop(MemMoveInst *MemMove);
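For reference, a minimal sketch of how a legacy pass would drive the new entry point (the pass skeleton below is a hypothetical example, not part of this change; the AMDGPU and NVPTX updates further down are the real in-tree users). The pass must request TargetTransformInfoWrapperPass, fetch the per-function TTI, hand it to expandMemCpyAsLoop, and erase the call afterwards since the expansion does not delete it:

// Illustrative only (hypothetical pass): expanding every memcpy in a function.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
using namespace llvm;

namespace {
struct ExpandMemCpyExample : public FunctionPass {
  static char ID;
  ExpandMemCpyExample() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The expansion consults TTI, so the pass has to request it.
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }

  bool runOnFunction(Function &F) override {
    const TargetTransformInfo &TTI =
        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

    // Collect first so erasing calls does not invalidate the iterator.
    SmallVector<MemCpyInst *, 8> Worklist;
    for (Instruction &I : instructions(F))
      if (auto *Memcpy = dyn_cast<MemCpyInst>(&I))
        Worklist.push_back(Memcpy);

    for (MemCpyInst *Memcpy : Worklist) {
      expandMemCpyAsLoop(Memcpy, TTI); // Emits the loop; the intrinsic call
      Memcpy->eraseFromParent();       // is not deleted, so erase it here.
    }
    return !Worklist.empty();
  }
};
} // end anonymous namespace
char ExpandMemCpyExample::ID = 0;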
Index: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
@@ -16,6 +16,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <utility>
@@ -23,6 +24,11 @@
 
 #define DEBUG_TYPE "tti"
 
+static cl::opt<bool> UseWideMemcpyLoopLowering(
+    "use-wide-memcpy-loop-lowering", cl::init(false),
+    cl::desc("Enables the new wide memcpy loop lowering in Transforms/Utils."),
+    cl::Hidden);
+
 namespace {
 /// \brief No-op implementation of the TTI interface using the utility base
 /// classes.
@@ -482,6 +488,25 @@
   return TTIImpl->getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
 }
 
+Type *TargetTransformInfo::getMemcpyLoopLoweringType(LLVMContext &Context,
+                                                     Value *Length,
+                                                     unsigned SrcAlign,
+                                                     unsigned DestAlign) const {
+  return TTIImpl->getMemcpyLoopLoweringType(Context, Length, SrcAlign,
+                                            DestAlign);
+}
+
+void TargetTransformInfo::getMemcpyLoopResidualLoweringType(
+    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
+    unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const {
+  TTIImpl->getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes,
+                                             SrcAlign, DestAlign);
+}
+
+bool TargetTransformInfo::useWideIRMemcpyLoopLowering() const {
+  return UseWideMemcpyLoopLowering;
+}
+
 bool TargetTransformInfo::areInlineCompatible(const Function *Caller,
                                               const Function *Callee) const {
   return TTIImpl->areInlineCompatible(Caller, Callee);
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -10,6 +10,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -34,9 +35,14 @@
   AMDGPULowerIntrinsics() : ModulePass(ID) {}
 
   bool runOnModule(Module &M) override;
+  bool expandMemIntrinsicUses(Function &F);
   StringRef getPassName() const override {
     return "AMDGPU Lower Intrinsics";
   }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
 };
 
 }
@@ -55,7 +61,7 @@
   return !CI || (CI->getZExtValue() > MaxStaticSize);
 }
 
-static bool expandMemIntrinsicUses(Function &F) {
+bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) {
   Intrinsic::ID ID = F.getIntrinsicID();
   bool Changed = false;
 
@@ -67,7 +73,10 @@
     case Intrinsic::memcpy: {
       auto *Memcpy = cast<MemCpyInst>(Inst);
       if (shouldExpandOperationWithSize(Memcpy->getLength())) {
-        expandMemCpyAsLoop(Memcpy);
+        Function *ParentFunc = Memcpy->getParent()->getParent();
+        const TargetTransformInfo &TTI =
+            getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc);
+        expandMemCpyAsLoop(Memcpy, TTI);
         Changed = true;
         Memcpy->eraseFromParent();
       }
Index: llvm/trunk/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
===================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ llvm/trunk/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "NVPTXLowerAggrCopies.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/StackProtector.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -42,6 +43,7 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addPreserved<StackProtector>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
   }
 
   bool runOnFunction(Function &F) override;
@@ -61,6 +63,8 @@
 
   const DataLayout &DL = F.getParent()->getDataLayout();
   LLVMContext &Context = F.getParent()->getContext();
+  const TargetTransformInfo &TTI =
+      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
   // Collect all aggregate loads and mem* calls.
   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
@@ -104,15 +108,26 @@
       Value *SrcAddr = LI->getOperand(0);
       Value *DstAddr = SI->getOperand(1);
       unsigned NumLoads = DL.getTypeStoreSize(LI->getType());
-      Value *CopyLen = ConstantInt::get(Type::getInt32Ty(Context), NumLoads);
+      ConstantInt *CopyLen =
+          ConstantInt::get(Type::getInt32Ty(Context), NumLoads);
 
-      createMemCpyLoop(/* ConvertedInst */ SI,
-                       /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
-                       /* CopyLen */ CopyLen,
-                       /* SrcAlign */ LI->getAlignment(),
-                       /* DestAlign */ SI->getAlignment(),
-                       /* SrcIsVolatile */ LI->isVolatile(),
-                       /* DstIsVolatile */ SI->isVolatile());
+      if (!TTI.useWideIRMemcpyLoopLowering()) {
+        createMemCpyLoop(/* ConvertedInst */ SI,
+                         /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
+                         /* CopyLen */ CopyLen,
+                         /* SrcAlign */ LI->getAlignment(),
+                         /* DestAlign */ SI->getAlignment(),
+                         /* SrcIsVolatile */ LI->isVolatile(),
+                         /* DstIsVolatile */ SI->isVolatile());
+      } else {
+        createMemCpyLoopKnownSize(/* ConvertedInst */ SI,
+                                  /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
+                                  /* CopyLen */ CopyLen,
+                                  /* SrcAlign */ LI->getAlignment(),
+                                  /* DestAlign */ SI->getAlignment(),
+                                  /* SrcIsVolatile */ LI->isVolatile(),
+                                  /* DstIsVolatile */ SI->isVolatile(), TTI);
+      }
 
       SI->eraseFromParent();
       LI->eraseFromParent();
@@ -121,7 +136,7 @@
   // Transform mem* intrinsic calls.
   for (MemIntrinsic *MemCall : MemCalls) {
     if (MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(MemCall)) {
-      expandMemCpyAsLoop(Memcpy);
+      expandMemCpyAsLoop(Memcpy, TTI);
     } else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
       expandMemMoveAsLoop(Memmove);
     } else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
Index: llvm/trunk/lib/Transforms/Utils/LowerMemIntrinsics.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ llvm/trunk/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -8,12 +8,256 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
 using namespace llvm;
 
+static unsigned getLoopOperandSizeInBytes(Type *Type) {
+  if (VectorType *VTy = dyn_cast<VectorType>(Type)) {
+    return VTy->getBitWidth() / 8;
+  }
+
+  return Type->getPrimitiveSizeInBits() / 8;
+}
+
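As a quick illustration (not part of the patch; the values follow directly from the arithmetic above), the helper maps the chosen loop operand type to a byte count:

// Byte widths the helper would report for a few plausible loop operand types.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

void loopOperandSizeExamples() {
  LLVMContext Ctx;
  Type *I8 = Type::getInt8Ty(Ctx);                         // 1 byte
  Type *I64 = Type::getInt64Ty(Ctx);                       // 8 bytes
  Type *V4I32 = VectorType::get(Type::getInt32Ty(Ctx), 4); // 16 bytes (getBitWidth() / 8)
  (void)I8; (void)I64; (void)V4I32;
}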
+void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
+                                     Value *DstAddr, ConstantInt *CopyLen,
+                                     unsigned SrcAlign, unsigned DestAlign,
+                                     bool SrcIsVolatile, bool DstIsVolatile,
+                                     const TargetTransformInfo &TTI) {
+  // No need to expand zero length copies.
+  if (CopyLen->isZero())
+    return;
+
+  BasicBlock *PreLoopBB = InsertBefore->getParent();
+  BasicBlock *PostLoopBB = nullptr;
+  Function *ParentFunc = PreLoopBB->getParent();
+  LLVMContext &Ctx = PreLoopBB->getContext();
+
+  Type *TypeOfCopyLen = CopyLen->getType();
+  Type *LoopOpType =
+      TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAlign, DestAlign);
+
+  unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType);
+  uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
+
+  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+  if (LoopEndCount != 0) {
+    // Split
+    PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "memcpy-split");
+    BasicBlock *LoopBB =
+        BasicBlock::Create(Ctx, "load-store-loop", ParentFunc, PostLoopBB);
+    PreLoopBB->getTerminator()->setSuccessor(0, LoopBB);
+
+    IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
+
+    // Cast the Src and Dst pointers to pointers to the loop operand type (if
+    // needed).
+    PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
+    PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
+    if (SrcAddr->getType() != SrcOpType) {
+      SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
+    }
+    if (DstAddr->getType() != DstOpType) {
+      DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
+    }
+
+    IRBuilder<> LoopBuilder(LoopBB);
+    PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
+    LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
+    // Loop Body
+    Value *SrcGEP =
+        LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+    Value *Load = LoopBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+    Value *DstGEP =
+        LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+    LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+    Value *NewIndex =
+        LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U));
+    LoopIndex->addIncoming(NewIndex, LoopBB);
+
+    // Create the loop branch condition.
+    Constant *LoopEndCI = ConstantInt::get(TypeOfCopyLen, LoopEndCount);
+    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopEndCI),
+                             LoopBB, PostLoopBB);
+  }
+
+  uint64_t BytesCopied = LoopEndCount * LoopOpSize;
+  uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
+  if (RemainingBytes) {
+    IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI()
+                                    : InsertBefore);
+
+    // Update the alignment based on the copy size used in the loop body.
+    SrcAlign = std::min(SrcAlign, LoopOpSize);
+    DestAlign = std::min(DestAlign, LoopOpSize);
+
+    SmallVector<Type *, 8> RemainingOps;
+    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+                                          SrcAlign, DestAlign);
+
+    for (auto OpTy : RemainingOps) {
+      // Calculate the new index
+      unsigned OperandSize = getLoopOperandSizeInBytes(OpTy);
+      uint64_t GepIndex = BytesCopied / OperandSize;
+      assert(GepIndex * OperandSize == BytesCopied &&
+             "Division should have no Remainder!");
+      // Cast source to operand type and load
+      PointerType *SrcPtrType = PointerType::get(OpTy, SrcAS);
+      Value *CastedSrc = SrcAddr->getType() == SrcPtrType
+                             ? SrcAddr
+                             : RBuilder.CreateBitCast(SrcAddr, SrcPtrType);
+      Value *SrcGEP = RBuilder.CreateInBoundsGEP(
+          OpTy, CastedSrc, ConstantInt::get(TypeOfCopyLen, GepIndex));
+      Value *Load = RBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+
+      // Cast destination to operand type and store.
+      PointerType *DstPtrType = PointerType::get(OpTy, DstAS);
+      Value *CastedDst = DstAddr->getType() == DstPtrType
+                             ? DstAddr
+                             : RBuilder.CreateBitCast(DstAddr, DstPtrType);
+      Value *DstGEP = RBuilder.CreateInBoundsGEP(
+          OpTy, CastedDst, ConstantInt::get(TypeOfCopyLen, GepIndex));
+      RBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+      BytesCopied += OperandSize;
+    }
+  }
+  assert(BytesCopied == CopyLen->getZExtValue() &&
+         "Bytes copied should match size in the call!");
+}
+
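To make the bookkeeping concrete, a worked example under assumed values (a 21-byte memcpy lowered with an i32 loop operand type; the numbers follow the expressions above and are not taken from the patch):

// Size bookkeeping for CopyLen = 21 with LoopOpSize = 4 (i32 loop type).
#include <cstdint>

constexpr uint64_t CopyLenVal        = 21;
constexpr unsigned LoopOpSizeVal     = 4;
constexpr uint64_t LoopEndCountVal   = CopyLenVal / LoopOpSizeVal;      // 5 main-loop iterations
constexpr uint64_t BytesCopiedVal    = LoopEndCountVal * LoopOpSizeVal; // 20 bytes copied by the loop
constexpr uint64_t RemainingBytesVal = CopyLenVal - BytesCopiedVal;     // 1 residual byte
// With the default TTI hook the residual is a single i8 load/store emitted
// right after the loop, which is what the final assert above checks:
static_assert(BytesCopiedVal + RemainingBytesVal == CopyLenVal,
              "all bytes accounted for");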
+void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
+                                       Value *SrcAddr, Value *DstAddr,
+                                       Value *CopyLen, unsigned SrcAlign,
+                                       unsigned DestAlign, bool SrcIsVolatile,
+                                       bool DstIsVolatile,
+                                       const TargetTransformInfo &TTI) {
+  BasicBlock *PreLoopBB = InsertBefore->getParent();
+  BasicBlock *PostLoopBB =
+      PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion");
+
+  Function *ParentFunc = PreLoopBB->getParent();
+  LLVMContext &Ctx = PreLoopBB->getContext();
+
+  Type *LoopOpType =
+      TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAlign, DestAlign);
+  unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType);
+
+  IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
+
+  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+  PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
+  PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
+  if (SrcAddr->getType() != SrcOpType) {
+    SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
+  }
+  if (DstAddr->getType() != DstOpType) {
+    DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
+  }
+
+  // Calculate the loop trip count, and remaining bytes to copy after the loop.
+  Type *CopyLenType = CopyLen->getType();
+  IntegerType *ILengthType = dyn_cast<IntegerType>(CopyLenType);
+  assert(ILengthType &&
+         "expected size argument to memcpy to be an integer type!");
+  ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
+  Value *RuntimeLoopCount = PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
+  Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
+  Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
+
+  BasicBlock *LoopBB =
+      BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, nullptr);
+  IRBuilder<> LoopBuilder(LoopBB);
+
+  PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
+  LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);
+
+  Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+  Value *Load = LoopBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+  Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+  LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+  Value *NewIndex =
+      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
+  LoopIndex->addIncoming(NewIndex, LoopBB);
+
+  Type *Int8Type = Type::getInt8Ty(Ctx);
+  if (LoopOpType != Int8Type) {
+    // Loop body for the residual copy.
+    BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual",
+                                               PreLoopBB->getParent(), nullptr);
+    // Residual loop header.
+    BasicBlock *ResHeaderBB = BasicBlock::Create(
+        Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr);
+
+    // Need to update the pre-loop basic block to branch to the correct place.
+    // Branch to the main loop if the count is non-zero, branch to the residual
+    // loop if the copy size is smaller than one iteration of the main loop but
+    // non-zero, and finally branch to after the residual loop if the memcpy
+    // size is zero.
+    ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
+    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+                           LoopBB, ResHeaderBB);
+    PreLoopBB->getTerminator()->eraseFromParent();
+
+    LoopBuilder.CreateCondBr(
+        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+        ResHeaderBB);
+
+    // Determine if we need to branch to the residual loop or bypass it.
+    IRBuilder<> RHBuilder(ResHeaderBB);
+    RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidual, Zero),
+                           ResLoopBB, PostLoopBB);
+
+    // Copy the residual with single byte load/store loop.
+    IRBuilder<> ResBuilder(ResLoopBB);
+    PHINode *ResidualIndex =
+        ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index");
+    ResidualIndex->addIncoming(Zero, ResHeaderBB);
+
+    Value *SrcAsInt8 =
+        ResBuilder.CreateBitCast(SrcAddr, PointerType::get(Int8Type, SrcAS));
+    Value *DstAsInt8 =
+        ResBuilder.CreateBitCast(DstAddr, PointerType::get(Int8Type, DstAS));
+    Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex);
+    Value *SrcGEP =
+        ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset);
+    Value *Load = ResBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+    Value *DstGEP =
+        ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset);
+    ResBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+    Value *ResNewIndex =
+        ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U));
+    ResidualIndex->addIncoming(ResNewIndex, ResLoopBB);
+
+    // Create the loop branch condition.
+    ResBuilder.CreateCondBr(
+        ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidual), ResLoopBB,
+        PostLoopBB);
+  } else {
+    // In this case the loop operand type was a byte, and there is no need for
+    // a residual loop to copy the remaining memory after the main loop.
+    // We do, however, need to patch up the control flow by creating the
+    // terminators for the pre-loop block and the memcpy loop.
+    ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
+    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+                           LoopBB, PostLoopBB);
+    PreLoopBB->getTerminator()->eraseFromParent();
+    LoopBuilder.CreateCondBr(
+        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+        PostLoopBB);
+  }
+}
+
 void llvm::createMemCpyLoop(Instruction *InsertBefore,
                             Value *SrcAddr, Value *DstAddr, Value *CopyLen,
                             unsigned SrcAlign, unsigned DestAlign,
@@ -208,15 +452,41 @@
                    NewBB);
 }
 
-void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy) {
-  createMemCpyLoop(/* InsertBefore */ Memcpy,
-                   /* SrcAddr */ Memcpy->getRawSource(),
-                   /* DstAddr */ Memcpy->getRawDest(),
-                   /* CopyLen */ Memcpy->getLength(),
-                   /* SrcAlign */ Memcpy->getAlignment(),
-                   /* DestAlign */ Memcpy->getAlignment(),
-                   /* SrcIsVolatile */ Memcpy->isVolatile(),
-                   /* DstIsVolatile */ Memcpy->isVolatile());
+void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
+                              const TargetTransformInfo &TTI) {
+  // Original implementation
+  if (!TTI.useWideIRMemcpyLoopLowering()) {
+    createMemCpyLoop(/* InsertBefore */ Memcpy,
+                     /* SrcAddr */ Memcpy->getRawSource(),
+                     /* DstAddr */ Memcpy->getRawDest(),
+                     /* CopyLen */ Memcpy->getLength(),
+                     /* SrcAlign */ Memcpy->getAlignment(),
+                     /* DestAlign */ Memcpy->getAlignment(),
+                     /* SrcIsVolatile */ Memcpy->isVolatile(),
+                     /* DstIsVolatile */ Memcpy->isVolatile());
+  } else {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
+      createMemCpyLoopKnownSize(/* InsertBefore */ Memcpy,
+                                /* SrcAddr */ Memcpy->getRawSource(),
+                                /* DstAddr */ Memcpy->getRawDest(),
+                                /* CopyLen */ CI,
+                                /* SrcAlign */ Memcpy->getAlignment(),
+                                /* DestAlign */ Memcpy->getAlignment(),
+                                /* SrcIsVolatile */ Memcpy->isVolatile(),
+                                /* DstIsVolatile */ Memcpy->isVolatile(),
+                                /* TargetTransformInfo */ TTI);
+    } else {
+      createMemCpyLoopUnknownSize(/* InsertBefore */ Memcpy,
+                                  /* SrcAddr */ Memcpy->getRawSource(),
+                                  /* DstAddr */ Memcpy->getRawDest(),
+                                  /* CopyLen */ Memcpy->getLength(),
+                                  /* SrcAlign */ Memcpy->getAlignment(),
+                                  /* DestAlign */ Memcpy->getAlignment(),
+                                  /* SrcIsVolatile */ Memcpy->isVolatile(),
+                                  /* DstIsVolatile */ Memcpy->isVolatile(),
+                                  /* TargetTransformInfo */ TTI);
+    }
+  }
 }
 
 void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
Index: llvm/trunk/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -amdgpu-lower-intrinsics %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -amdgpu-lower-intrinsics -use-wide-memcpy-loop-lowering=true %s | FileCheck -check-prefix=WOPT %s
 
 declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
 declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1
@@ -21,6 +22,17 @@
 ; OPT-NEXT: load i8
 ; OPT: getelementptr
 ; OPT-NEXT: store i8
+
+; WOPT-LABEL: @min_size_large_static_memcpy_caller0(
+; WOPT-NOT: call
+; WOPT: br label %load-store-loop
+; WOPT: [[T1:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %loop-index
+; WOPT-NEXT: [[T2:%[0-9]+]] = load i8, i8 addrspace(1)* [[T1]]
+; WOPT-NEXT: [[T3:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %loop-index
+; WOPT-NEXT: store i8 [[T2]], i8 addrspace(1)* [[T3]]
+; WOPT-NEXT: [[T4:%[0-9]+]] = add i64 %loop-index, 1
+; WOPT-NEXT: [[T5:%[0-9]+]] = icmp ult i64 [[T4]], 1025
+; WOPT-NEXT: br i1 [[T5]], label %load-store-loop, label %memcpy-split
 define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
   ret void
Index: llvm/trunk/test/CodeGen/NVPTX/lower-aggr-copies.ll
===================================================================
--- llvm/trunk/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ llvm/trunk/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX
 ; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR
+; RUN: opt < %s -S -nvptx-lower-aggr-copies -use-wide-memcpy-loop-lowering=true | FileCheck %s --check-prefix WIR
 
 ; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
 ; llvm.mem* intrinsics get lowered to loops.
@@ -32,6 +33,23 @@
 ; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
 ; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
 ; PTX: @%p[[PRED]] bra LBB[[LABEL]]
+
+; WIR-LABEL: @memcpy_caller
+; WIR: entry:
+; WIR: [[LoopCount:%[0-9]+]] = udiv i64 %n, 1
+; WIR: [[ResidualSize:%[0-9]+]] = urem i64 %n, 1
+; WIR: [[Cond:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0
+; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; WIR: loop-memcpy-expansion:
+; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; WIR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
+; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; WIR: store i8 [[Load]], i8* [[DstGep]]
+; WIR: [[IndexInc]] = add i64 %loop-index, 1
+; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]]
+; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
 }
 
 define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
@@ -50,6 +68,23 @@
 ; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
 ; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
 ; PTX: @%p[[PRED]] bra LBB[[LABEL]]
+
+; WIR-LABEL: @memcpy_volatile_caller
+; WIR: entry:
+; WIR: [[LoopCount:%[0-9]+]] = udiv i64 %n, 1
+; WIR: [[ResidualSize:%[0-9]+]] = urem i64 %n, 1
+; WIR: [[Cond:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0
+; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; WIR: loop-memcpy-expansion:
+; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; WIR: [[Load:%[0-9]+]] = load volatile i8, i8* [[SrcGep]]
+; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; WIR: store volatile i8 [[Load]], i8* [[DstGep]]
+; WIR: [[IndexInc]] = add i64 %loop-index, 1
+; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]]
+; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
 }
 
 define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 {
@@ -65,6 +100,32 @@
 ; IR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
 ; IR: getelementptr inbounds i8, i8* [[SRCCAST]]
 ; IR: getelementptr inbounds i8, i8* [[DSTCAST]]
+
+; WIR-LABEL: @memcpy_casting_caller
+; WIR: [[DSTCAST:%[0-9]+]] = bitcast i32* %dst to i8*
+; WIR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
+; WIR: getelementptr inbounds i8, i8* [[SRCCAST]]
+; WIR: getelementptr inbounds i8, i8* [[DSTCAST]]
 }
 
+define i8* @memcpy_known_size(i8* %dst, i8* %src) {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 144, i32 1, i1 false)
+  ret i8* %dst
+
+; Check that calls with compile-time constant size are handled correctly
+; WIR-LABEL: @memcpy_known_size
+; WIR: entry:
+; WIR: br label %load-store-loop
+; WIR: load-store-loop:
+; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
+; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; WIR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
+; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; WIR: store i8 [[Load]], i8* [[DstGep]]
+; WIR: [[IndexInc]] = add i64 %loop-index, 1
+; WIR: [[Cond:%[0-9]+]] = icmp ult i64 %3, 144
+; WIR: br i1 [[Cond]], label %load-store-loop, label %memcpy-split
+}
 
 define i8* @memset_caller(i8* %dst, i32 %c, i64 %n) #0 {