diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1283,9 +1283,11 @@
                                              Type *ExpectedType) const;
 
   /// \returns The type to use in a loop expansion of a memcpy call.
-  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
-                                  unsigned SrcAddrSpace, unsigned DestAddrSpace,
-                                  unsigned SrcAlign, unsigned DestAlign) const;
+  Type *
+  getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
+                            unsigned SrcAddrSpace, unsigned DestAddrSpace,
+                            unsigned SrcAlign, unsigned DestAlign,
+                            Optional<uint32_t> AtomicElementSize = None) const;
 
   /// \param[out] OpsOut The operand types to copy RemainingBytes of memory.
   /// \param RemainingBytes The number of bytes to copy.
@@ -1296,7 +1298,8 @@
   void getMemcpyLoopResidualLoweringType(
       SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
       unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
-      unsigned SrcAlign, unsigned DestAlign) const;
+      unsigned SrcAlign, unsigned DestAlign,
+      Optional<uint32_t> AtomicCpySize = None) const;
 
   /// \returns True if the two functions have compatible attributes for inlining
   /// purposes.
@@ -1734,15 +1737,17 @@
   virtual unsigned getAtomicMemIntrinsicMaxElementSize() const = 0;
   virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                    Type *ExpectedType) = 0;
-  virtual Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
-                                          unsigned SrcAddrSpace,
-                                          unsigned DestAddrSpace,
-                                          unsigned SrcAlign,
-                                          unsigned DestAlign) const = 0;
+  virtual Type *
+  getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
+                            unsigned SrcAddrSpace, unsigned DestAddrSpace,
+                            unsigned SrcAlign, unsigned DestAlign,
+                            Optional<uint32_t> AtomicElementSize) const = 0;
+
   virtual void getMemcpyLoopResidualLoweringType(
       SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
       unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
-      unsigned SrcAlign, unsigned DestAlign) const = 0;
+      unsigned SrcAlign, unsigned DestAlign,
+      Optional<uint32_t> AtomicCpySize) const = 0;
   virtual bool areInlineCompatible(const Function *Caller,
                                    const Function *Callee) const = 0;
   virtual bool areTypesABICompatible(const Function *Caller,
@@ -2298,20 +2303,22 @@
                                            Type *ExpectedType) override {
     return Impl.getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
   }
-  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
-                                  unsigned SrcAddrSpace, unsigned DestAddrSpace,
-                                  unsigned SrcAlign,
-                                  unsigned DestAlign) const override {
+  Type *getMemcpyLoopLoweringType(
+      LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
+      unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
+      Optional<uint32_t> AtomicElementSize) const override {
    return Impl.getMemcpyLoopLoweringType(Context, Length, SrcAddrSpace,
-                                          DestAddrSpace, SrcAlign, DestAlign);
+                                          DestAddrSpace, SrcAlign, DestAlign,
+                                          AtomicElementSize);
   }
   void getMemcpyLoopResidualLoweringType(
       SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
       unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
-      unsigned SrcAlign, unsigned DestAlign) const override {
+      unsigned SrcAlign, unsigned DestAlign,
+      Optional<uint32_t> AtomicCpySize) const override {
     Impl.getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes,
                                            SrcAddrSpace, DestAddrSpace,
-                                           SrcAlign, DestAlign);
+                                           SrcAlign, DestAlign, AtomicCpySize);
   }
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const override {
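For illustration only — not part of the patch, and pickLoopOpType is a hypothetical helper: a caller that already expands plain memcpy can thread the element size of an llvm.memcpy.element.unordered.atomic through the new optional parameter, and keeps the previous behaviour by passing None.

    #include "llvm/ADT/Optional.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Type.h"
    #include "llvm/IR/Value.h"
    using namespace llvm;

    // Sketch: pick the per-iteration operand type for a memcpy expansion.
    // Passing None preserves the pre-patch behaviour; passing the element
    // size of an element-wise atomic memcpy constrains the chosen type.
    static Type *pickLoopOpType(const TargetTransformInfo &TTI,
                                LLVMContext &Ctx, Value *Len, unsigned SrcAS,
                                unsigned DstAS, unsigned SrcAlign,
                                unsigned DstAlign,
                                Optional<uint32_t> AtomicElementSize) {
      return TTI.getMemcpyLoopLoweringType(Ctx, Len, SrcAS, DstAS, SrcAlign,
                                           DstAlign, AtomicElementSize);
    }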
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -698,16 +698,21 @@
 
   Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                                   unsigned SrcAddrSpace, unsigned DestAddrSpace,
-                                  unsigned SrcAlign, unsigned DestAlign) const {
-    return Type::getInt8Ty(Context);
+                                  unsigned SrcAlign, unsigned DestAlign,
+                                  Optional<uint32_t> AtomicElementSize) const {
+    return AtomicElementSize ? Type::getIntNTy(Context, *AtomicElementSize * 8)
+                             : Type::getInt8Ty(Context);
   }
 
   void getMemcpyLoopResidualLoweringType(
       SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
       unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
-      unsigned SrcAlign, unsigned DestAlign) const {
-    for (unsigned i = 0; i != RemainingBytes; ++i)
-      OpsOut.push_back(Type::getInt8Ty(Context));
+      unsigned SrcAlign, unsigned DestAlign,
+      Optional<uint32_t> AtomicCpySize) const {
+    unsigned OpSizeInBytes = AtomicCpySize ? *AtomicCpySize : 1;
+    Type *OpType = Type::getIntNTy(Context, OpSizeInBytes * 8);
+    for (unsigned i = 0; i != RemainingBytes; i += OpSizeInBytes)
+      OpsOut.push_back(OpType);
   }
 
   bool areInlineCompatible(const Function *Caller,
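A concrete sketch of what the default hooks above return — not part of the patch, and it assumes a TargetTransformInfo backed by this base implementation: with a 4-byte element size the loop type becomes i32, and an 8-byte residual is covered by two i32 operations instead of eight i8 ones.

    #include <cassert>
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Type.h"
    using namespace llvm;

    // Sketch: expected results of the default implementations above.
    static void illustrateDefaults(const TargetTransformInfo &TTI,
                                   LLVMContext &Ctx) {
      // Loop operand type: i32 instead of the usual i8 (Length is unused by
      // the default implementation shown above).
      Type *LoopTy = TTI.getMemcpyLoopLoweringType(
          Ctx, /*Length=*/nullptr, /*SrcAddrSpace=*/0, /*DestAddrSpace=*/0,
          /*SrcAlign=*/4, /*DestAlign=*/4, /*AtomicElementSize=*/4);
      assert(LoopTy->isIntegerTy(32));
      (void)LoopTy;

      // Residual: 8 remaining bytes become {i32, i32} rather than eight i8s.
      SmallVector<Type *, 4> OpsOut;
      TTI.getMemcpyLoopResidualLoweringType(
          OpsOut, Ctx, /*RemainingBytes=*/8, /*SrcAddrSpace=*/0,
          /*DestAddrSpace=*/0, /*SrcAlign=*/4, /*DestAlign=*/4,
          /*AtomicCpySize=*/4);
      assert(OpsOut.size() == 2 && OpsOut[0]->isIntegerTy(32));
    }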
diff --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
--- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -14,8 +14,11 @@
 #ifndef LLVM_TRANSFORMS_UTILS_LOWERMEMINTRINSICS_H
 #define LLVM_TRANSFORMS_UTILS_LOWERMEMINTRINSICS_H
 
+#include "llvm/ADT/Optional.h"
+
 namespace llvm {
 
+class AtomicMemCpyInst;
 class ConstantInt;
 class Instruction;
 class MemCpyInst;
@@ -32,7 +35,8 @@
                                Value *DstAddr, Value *CopyLen, Align SrcAlign,
                                Align DestAlign, bool SrcIsVolatile,
                                bool DstIsVolatile, bool CanOverlap,
-                               const TargetTransformInfo &TTI);
+                               const TargetTransformInfo &TTI,
+                               Optional<uint32_t> AtomicSize = None);
 
 /// Emit a loop implementing the semantics of an llvm.memcpy whose size is a
 /// compile time constant. Loop is inserted at \p InsertBefore.
@@ -40,7 +44,8 @@
                              Value *DstAddr, ConstantInt *CopyLen,
                              Align SrcAlign, Align DestAlign,
                              bool SrcIsVolatile, bool DstIsVolatile,
-                             bool CanOverlap, const TargetTransformInfo &TTI);
+                             bool CanOverlap, const TargetTransformInfo &TTI,
+                             Optional<uint32_t> AtomicCpySize = None);
 
 /// Expand \p MemCpy as a loop. \p MemCpy is not deleted.
 void expandMemCpyAsLoop(MemCpyInst *MemCpy, const TargetTransformInfo &TTI,
@@ -52,6 +57,11 @@
 /// Expand \p MemSet as a loop. \p MemSet is not deleted.
 void expandMemSetAsLoop(MemSetInst *MemSet);
 
+/// Expand \p AtomicMemCpy as a loop. \p AtomicMemCpy is not deleted.
+void expandAtomicMemCpyAsLoop(AtomicMemCpyInst *AtomicMemCpy,
+                              const TargetTransformInfo &TTI,
+                              ScalarEvolution *SE);
+
 } // End llvm namespace
 
 #endif
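A minimal usage sketch for the new entry point — a hypothetical caller, not part of the patch. As the comment above states, the helper does not erase the intrinsic, so the caller removes it; ScalarEvolution may be null and is only used to prove that source and destination are distinct.

    #include "llvm/Analysis/ScalarEvolution.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/IntrinsicInst.h"
    #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
    using namespace llvm;

    // Sketch: expand an element-wise atomic memcpy in place, then drop the
    // original call, mirroring how expandMemCpyAsLoop is typically used.
    static void lowerAtomicMemCpy(AtomicMemCpyInst *AMI,
                                  const TargetTransformInfo &TTI,
                                  ScalarEvolution *SE) {
      expandAtomicMemCpyAsLoop(AMI, TTI, SE);
      AMI->eraseFromParent();
    }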
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -973,18 +973,21 @@
 
 Type *TargetTransformInfo::getMemcpyLoopLoweringType(
     LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
-    unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign) const {
+    unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
+    Optional<uint32_t> AtomicElementSize) const {
   return TTIImpl->getMemcpyLoopLoweringType(Context, Length, SrcAddrSpace,
-                                            DestAddrSpace, SrcAlign, DestAlign);
+                                            DestAddrSpace, SrcAlign, DestAlign,
+                                            AtomicElementSize);
 }
 
 void TargetTransformInfo::getMemcpyLoopResidualLoweringType(
     SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
     unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
-    unsigned SrcAlign, unsigned DestAlign) const {
-  TTIImpl->getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes,
-                                             SrcAddrSpace, DestAddrSpace,
-                                             SrcAlign, DestAlign);
+    unsigned SrcAlign, unsigned DestAlign,
+    Optional<uint32_t> AtomicCpySize) const {
+  TTIImpl->getMemcpyLoopResidualLoweringType(
+      OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
+      DestAlign, AtomicCpySize);
 }
 
 bool TargetTransformInfo::areInlineCompatible(const Function *Caller,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -135,15 +135,14 @@
                                   unsigned AddrSpace) const;
 
   Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                                   unsigned SrcAddrSpace, unsigned DestAddrSpace,
-                                  unsigned SrcAlign, unsigned DestAlign) const;
-
-  void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
-                                         LLVMContext &Context,
-                                         unsigned RemainingBytes,
-                                         unsigned SrcAddrSpace,
-                                         unsigned DestAddrSpace,
-                                         unsigned SrcAlign,
-                                         unsigned DestAlign) const;
+                                  unsigned SrcAlign, unsigned DestAlign,
+                                  Optional<uint32_t> AtomicElementSize) const;
+
+  void getMemcpyLoopResidualLoweringType(
+      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
+      unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
+      unsigned SrcAlign, unsigned DestAlign,
+      Optional<uint32_t> AtomicCpySize) const;
 
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -410,11 +410,14 @@
 // unaligned access is legal?
 //
 // FIXME: This could use fine tuning and microbenchmarks.
-Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
-                                            unsigned SrcAddrSpace,
-                                            unsigned DestAddrSpace,
-                                            unsigned SrcAlign,
-                                            unsigned DestAlign) const {
+Type *GCNTTIImpl::getMemcpyLoopLoweringType(
+    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
+    unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
+    Optional<uint32_t> AtomicElementSize) const {
+
+  if (AtomicElementSize)
+    return Type::getIntNTy(Context, *AtomicElementSize * 8);
+
   unsigned MinAlign = std::min(SrcAlign, DestAlign);
 
   // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
@@ -441,9 +444,14 @@
 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
     SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
     unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
-    unsigned SrcAlign, unsigned DestAlign) const {
+    unsigned SrcAlign, unsigned DestAlign,
+    Optional<uint32_t> AtomicCpySize) const {
   assert(RemainingBytes < 16);
 
+  if (AtomicCpySize)
+    BaseT::getMemcpyLoopResidualLoweringType(
+        OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
+        DestAlign, AtomicCpySize);
+
   unsigned MinAlign = std::min(SrcAlign, DestAlign);
 
   if (MinAlign != 2) {
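To make the GCN behaviour concrete — a sketch rather than patch code, queried through the generic TargetTransformInfo interface and assuming TTI comes from a GCN subtarget: an element size pins the per-iteration type to exactly that width, instead of the wider (possibly vector) type the hook would otherwise pick for well-aligned copies.

    #include <cassert>
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Type.h"
    #include "llvm/IR/Value.h"
    using namespace llvm;

    // Sketch: the AMDGPU hook above with a 4-byte atomic element size.
    static void illustrateGCNAtomicType(const TargetTransformInfo &TTI,
                                        LLVMContext &Ctx, Value *Len) {
      Type *Ty = TTI.getMemcpyLoopLoweringType(
          Ctx, Len, /*SrcAddrSpace=*/1, /*DestAddrSpace=*/1, /*SrcAlign=*/4,
          /*DestAlign=*/4, /*AtomicElementSize=*/4);
      // With an element size the result is exactly i32; without one the hook
      // is free to return a wider type for well-aligned copies.
      assert(Ty->isIntegerTy(32));
      (void)Ty;
    }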
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -21,7 +21,8 @@
                                       Align SrcAlign, Align DstAlign,
                                       bool SrcIsVolatile, bool DstIsVolatile,
                                       bool CanOverlap,
-                                      const TargetTransformInfo &TTI) {
+                                      const TargetTransformInfo &TTI,
+                                      Optional<uint32_t> AtomicElementSize) {
   // No need to expand zero length copies.
   if (CopyLen->isZero())
     return;
@@ -41,9 +42,15 @@
 
   Type *TypeOfCopyLen = CopyLen->getType();
   Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
-      Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
+      Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value(),
+      AtomicElementSize);
+  assert(!LoopOpType->isVectorTy() &&
+         "Atomic memcpy lowering is not supported for vector operand type");
 
   unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+  assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
+         "Atomic memcpy lowering is not supported for selected operand size");
+
   uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
 
   if (LoopEndCount != 0) {
@@ -90,6 +97,11 @@
       // Indicate that stores don't overlap loads.
       Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
 
+    if (AtomicElementSize) {
+      Load->setAtomic(AtomicOrdering::Unordered);
+      Store->setAtomic(AtomicOrdering::Unordered);
+    }
+
     Value *NewIndex =
         LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U));
     LoopIndex->addIncoming(NewIndex, LoopBB);
@@ -109,7 +121,7 @@
     SmallVector<Type *, 5> RemainingOps;
     TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                           SrcAS, DstAS, SrcAlign.value(),
-                                          DstAlign.value());
+                                          DstAlign.value(), AtomicElementSize);
 
     for (auto OpTy : RemainingOps) {
       Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
@@ -117,6 +129,9 @@
 
       // Calaculate the new index
       unsigned OperandSize = DL.getTypeStoreSize(OpTy);
+      assert((!AtomicElementSize || OperandSize % *AtomicElementSize == 0) &&
+             "Atomic memcpy lowering is not supported for selected operand size");
+
       uint64_t GepIndex = BytesCopied / OperandSize;
       assert(GepIndex * OperandSize == BytesCopied &&
              "Division should have no Remainder!");
@@ -147,6 +162,11 @@
         // Indicate that stores don't overlap loads.
         Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
 
+      if (AtomicElementSize) {
+        Load->setAtomic(AtomicOrdering::Unordered);
+        Store->setAtomic(AtomicOrdering::Unordered);
+      }
+
       BytesCopied += OperandSize;
     }
   }
@@ -159,7 +179,8 @@
                                         Value *CopyLen, Align SrcAlign,
                                         Align DstAlign, bool SrcIsVolatile,
                                         bool DstIsVolatile, bool CanOverlap,
-                                        const TargetTransformInfo &TTI) {
+                                        const TargetTransformInfo &TTI,
+                                        Optional<uint32_t> AtomicElementSize) {
   BasicBlock *PreLoopBB = InsertBefore->getParent();
   BasicBlock *PostLoopBB =
       PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion");
@@ -176,8 +197,13 @@
   unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
 
   Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
-      Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
+      Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value(),
+      AtomicElementSize);
+  assert(!LoopOpType->isVectorTy() &&
+         "Atomic memcpy lowering is not supported for vector operand type");
   unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+  assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
+         "Atomic memcpy lowering is not supported for selected operand size");
 
   IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
 
@@ -225,14 +251,27 @@
     // Indicate that stores don't overlap loads.
     Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
 
+  if (AtomicElementSize) {
+    Load->setAtomic(AtomicOrdering::Unordered);
+    Store->setAtomic(AtomicOrdering::Unordered);
+  }
+
   Value *NewIndex =
       LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
   LoopIndex->addIncoming(NewIndex, LoopBB);
 
-  if (!LoopOpIsInt8) {
-    // Add in the
-    Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
-    Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
+  bool RequiresResidual =
+      !LoopOpIsInt8 && !(AtomicElementSize && LoopOpSize == AtomicElementSize);
+  if (RequiresResidual) {
+    Type *ResLoopOpType =
+        AtomicElementSize ? Type::getIntNTy(Ctx, *AtomicElementSize * 8) : Int8Type;
+    unsigned ResLoopOpSize = DL.getTypeStoreSize(ResLoopOpType);
+    assert(ResLoopOpSize == (AtomicElementSize ? *AtomicElementSize : 1) &&
+           "Store size is expected to match type size");
+
+    // Split the copy into the bytes handled by the main loop and the residual.
+    Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
+    Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
 
     // Loop body for the residual copy.
     BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual",
@@ -267,30 +306,35 @@
         ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index");
     ResidualIndex->addIncoming(Zero, ResHeaderBB);
 
-    Value *SrcAsInt8 =
-        ResBuilder.CreateBitCast(SrcAddr, PointerType::get(Int8Type, SrcAS));
-    Value *DstAsInt8 =
-        ResBuilder.CreateBitCast(DstAddr, PointerType::get(Int8Type, DstAS));
+    Value *SrcAsResLoopOpType = ResBuilder.CreateBitCast(
+        SrcAddr, PointerType::get(ResLoopOpType, SrcAS));
+    Value *DstAsResLoopOpType = ResBuilder.CreateBitCast(
+        DstAddr, PointerType::get(ResLoopOpType, DstAS));
     Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex);
-    Value *SrcGEP =
-        ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset);
-    LoadInst *Load = ResBuilder.CreateAlignedLoad(Int8Type, SrcGEP,
+    Value *SrcGEP = ResBuilder.CreateInBoundsGEP(
+        ResLoopOpType, SrcAsResLoopOpType, FullOffset);
+    LoadInst *Load = ResBuilder.CreateAlignedLoad(ResLoopOpType, SrcGEP,
                                                   PartSrcAlign, SrcIsVolatile);
     if (!CanOverlap)
       // Set alias scope for loads.
       Load->setMetadata(LLVMContext::MD_alias_scope,
                         MDNode::get(Ctx, NewScope));
 
-    Value *DstGEP =
-        ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset);
+    Value *DstGEP = ResBuilder.CreateInBoundsGEP(
+        ResLoopOpType, DstAsResLoopOpType, FullOffset);
     StoreInst *Store = ResBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign,
                                                      DstIsVolatile);
     if (!CanOverlap)
      // Indicate that stores don't overlap loads.
       Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
 
-    Value *ResNewIndex =
-        ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U));
+    if (AtomicElementSize) {
+      Load->setAtomic(AtomicOrdering::Unordered);
+      Store->setAtomic(AtomicOrdering::Unordered);
+    }
+
+    Value *ResNewIndex = ResBuilder.CreateAdd(
+        ResidualIndex, ConstantInt::get(CopyLenType, ResLoopOpSize));
     ResidualIndex->addIncoming(ResNewIndex, ResLoopBB);
 
     // Create the loop branch condition.
@@ -465,17 +509,21 @@
                     NewBB);
 }
 
-void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
-                              const TargetTransformInfo &TTI,
-                              ScalarEvolution *SE) {
-  bool CanOverlap = true;
+template <typename T>
+static bool canOverlap(MemTransferBase<T> *Memcpy, ScalarEvolution *SE) {
   if (SE) {
     auto *SrcSCEV = SE->getSCEV(Memcpy->getRawSource());
     auto *DestSCEV = SE->getSCEV(Memcpy->getRawDest());
     if (SE->isKnownPredicateAt(CmpInst::ICMP_NE, SrcSCEV, DestSCEV, Memcpy))
-      CanOverlap = false;
+      return false;
   }
+  return true;
+}
+void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
+                              const TargetTransformInfo &TTI,
+                              ScalarEvolution *SE) {
+  bool CanOverlap = canOverlap(Memcpy, SE);
 
   if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
     createMemCpyLoopKnownSize(
        /* InsertBefore */ Memcpy,
@@ -522,3 +570,36 @@
       /* Alignment */ Memset->getDestAlign().valueOrOne(),
       Memset->isVolatile());
 }
+
+void llvm::expandAtomicMemCpyAsLoop(AtomicMemCpyInst *AtomicMemcpy,
+                                    const TargetTransformInfo &TTI,
+                                    ScalarEvolution *SE) {
+  bool CanOverlap = canOverlap(AtomicMemcpy, SE);
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(AtomicMemcpy->getLength())) {
+    createMemCpyLoopKnownSize(
+        /* InsertBefore */ AtomicMemcpy,
+        /* SrcAddr */ AtomicMemcpy->getRawSource(),
+        /* DstAddr */ AtomicMemcpy->getRawDest(),
+        /* CopyLen */ CI,
+        /* SrcAlign */ AtomicMemcpy->getSourceAlign().valueOrOne(),
+        /* DestAlign */ AtomicMemcpy->getDestAlign().valueOrOne(),
+        /* SrcIsVolatile */ AtomicMemcpy->isVolatile(),
+        /* DstIsVolatile */ AtomicMemcpy->isVolatile(),
+        /* CanOverlap */ CanOverlap,
+        /* TargetTransformInfo */ TTI,
+        /* AtomicCpySize */ AtomicMemcpy->getElementSizeInBytes());
+  } else {
+    createMemCpyLoopUnknownSize(
+        /* InsertBefore */ AtomicMemcpy,
+        /* SrcAddr */ AtomicMemcpy->getRawSource(),
+        /* DstAddr */ AtomicMemcpy->getRawDest(),
+        /* CopyLen */ AtomicMemcpy->getLength(),
+        /* SrcAlign */ AtomicMemcpy->getSourceAlign().valueOrOne(),
+        /* DestAlign */ AtomicMemcpy->getDestAlign().valueOrOne(),
+        /* SrcIsVolatile */ AtomicMemcpy->isVolatile(),
+        /* DstIsVolatile */ AtomicMemcpy->isVolatile(),
+        /* CanOverlap */ CanOverlap,
+        /* TargetTransformInfo */ TTI,
+        /* AtomicCpySize */ AtomicMemcpy->getElementSizeInBytes());
+  }
+}
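Worked numbers for the runtime-residual path above, under the assumption of a hypothetical target whose loop type is wider than the atomic element (AMDGPU never hits this case, since its hook returns the element type itself):

    #include <cassert>
    #include <cstdint>

    // Sketch: CopyLen = 72 bytes, 16-byte main loop type, 4-byte element.
    static void residualArithmetic() {
      uint64_t CopyLen = 72;       // total bytes to copy
      uint64_t LoopOpSize = 16;    // from getMemcpyLoopLoweringType
      uint64_t ResLoopOpSize = 4;  // element size drives the residual loop
      uint64_t RuntimeResidual = CopyLen % LoopOpSize;          // 8
      uint64_t RuntimeBytesCopied = CopyLen - RuntimeResidual;  // 64
      // Main loop: 4 iterations of 16 bytes. Residual loop: 2 iterations of
      // 4 bytes, the residual index advancing by ResLoopOpSize, not by 1.
      assert(RuntimeBytesCopied / LoopOpSize == 4);
      assert(RuntimeResidual / ResLoopOpSize == 2);
    }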
diff --git a/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp b/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp
--- a/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp
+++ b/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp
@@ -127,4 +127,94 @@
   // llvm::setCurrentDebugType("loop-vectorize");
   MPM.run(*M, MAM);
 }
+
+TEST_F(MemTransferLowerTest, AtomicMemCpyKnownLength) {
+  ParseAssembly("declare void "
+                "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(i32*, "
+                "i32*, i64, i32)\n"
+                "define void @foo(i32* %dst, i32* %src, i64 %n) optsize {\n"
+                "entry:\n"
+                "  %is_not_equal = icmp ne i32* %dst, %src\n"
+                "  br i1 %is_not_equal, label %memcpy, label %exit\n"
+                "memcpy:\n"
+                "  call void "
+                "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(i32* "
+                "%dst, i32* %src, "
+                "i64 1024, i32 4)\n"
+                "  br label %exit\n"
+                "exit:\n"
+                "  ret void\n"
+                "}\n");
+
+  FunctionPassManager FPM;
+  FPM.addPass(ForwardingPass(
+      [=](Function &F, FunctionAnalysisManager &FAM) -> PreservedAnalyses {
+        TargetTransformInfo TTI(M->getDataLayout());
+        auto *MemCpyBB = getBasicBlockByName(F, "memcpy");
+        Instruction *Inst = &MemCpyBB->front();
+        assert(isa<AtomicMemCpyInst>(Inst) &&
+               "Expecting llvm.memcpy.element.unordered.atomic instruction");
+        AtomicMemCpyInst *MemCpyI = cast<AtomicMemCpyInst>(Inst);
+        auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
+        expandAtomicMemCpyAsLoop(MemCpyI, TTI, &SE);
+        return PreservedAnalyses::none();
+      }));
+  FPM.addPass(ForwardingPass(
+      [=](Function &F, FunctionAnalysisManager &FAM) -> PreservedAnalyses {
+        // The known-size expansion emits the "load-store-loop" block.
+        auto *TargetBB = getBasicBlockByName(F, "load-store-loop");
+        EXPECT_NE(TargetBB, nullptr);
+        return PreservedAnalyses::all();
+      }));
+  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+
+  MPM.run(*M, MAM);
+}
+
+TEST_F(MemTransferLowerTest, AtomicMemCpyUnknownLength) {
+  ParseAssembly("declare void "
+                "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(i32*, "
+                "i32*, i64, i32)\n"
+                "define void @foo(i32* %dst, i32* %src, i64 %n) optsize {\n"
+                "entry:\n"
+                "  %is_not_equal = icmp ne i32* %dst, %src\n"
+                "  br i1 %is_not_equal, label %memcpy, label %exit\n"
+                "memcpy:\n"
+                "  call void "
+                "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(i32* "
+                "%dst, i32* %src, "
+                "i64 %n, i32 4)\n"
+                "  br label %exit\n"
+                "exit:\n"
+                "  ret void\n"
+                "}\n");
+
+  FunctionPassManager FPM;
+  FPM.addPass(ForwardingPass(
+      [=](Function &F, FunctionAnalysisManager &FAM) -> PreservedAnalyses {
+        TargetTransformInfo TTI(M->getDataLayout());
+        auto *MemCpyBB = getBasicBlockByName(F, "memcpy");
+        Instruction *Inst = &MemCpyBB->front();
+        assert(isa<AtomicMemCpyInst>(Inst) &&
+               "Expecting llvm.memcpy.element.unordered.atomic instruction");
+        AtomicMemCpyInst *MemCpyI = cast<AtomicMemCpyInst>(Inst);
+        auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
+        expandAtomicMemCpyAsLoop(MemCpyI, TTI, &SE);
+        return PreservedAnalyses::none();
+      }));
+  FPM.addPass(ForwardingPass(
+      [=](Function &F, FunctionAnalysisManager &FAM) -> PreservedAnalyses {
+        // The runtime-size expansion emits the "loop-memcpy-expansion" block.
+        auto *TargetBB = getBasicBlockByName(F, "loop-memcpy-expansion");
+        EXPECT_NE(TargetBB, nullptr);
+        return PreservedAnalyses::all();
+      }));
+  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+
+  MPM.run(*M, MAM);
+}
 }