Index: include/llvm/IR/IntrinsicInst.h
===================================================================
--- include/llvm/IR/IntrinsicInst.h
+++ include/llvm/IR/IntrinsicInst.h
@@ -152,6 +152,31 @@
     }
   };
 
+  /// This class handles the atomic memcpy intrinsic.
+  /// TODO: Integrate this class into the MemIntrinsic hierarchy.
+  class ElementAtomicMemCpyInst : public IntrinsicInst {
+  public:
+    Value *getRawDest() const { return getArgOperand(0); }
+    Value *getRawSource() const { return getArgOperand(1); }
+    Value *getNumElements() const { return getArgOperand(2); }
+
+    uint64_t getSrcAlignment() const { return getParamAlignment(1); }
+    uint64_t getDstAlignment() const { return getParamAlignment(2); }
+
+    uint64_t getElementSizeInBytes() const {
+      Value *Arg = getArgOperand(3);
+      assert(isa<ConstantInt>(Arg));
+      return cast<ConstantInt>(Arg)->getZExtValue();
+    }
+
+    static inline bool classof(const IntrinsicInst *I) {
+      return I->getIntrinsicID() == Intrinsic::memcpy_element_atomic;
+    }
+    static inline bool classof(const Value *V) {
+      return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+    }
+  };
+
   /// This is the common base class for memset/memcpy/memmove.
   class MemIntrinsic : public IntrinsicInst {
   public:
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -60,6 +60,12 @@
 
 STATISTIC(NumSimplified, "Number of library calls simplified");
 
+static cl::opt<unsigned> UnfoldElementAtomicMemcpyThreshold(
+    "element-atomic-memcpy-threshold",
+    cl::init(16),
+    cl::desc("Threshold to determine when optimizer is allowed to unfold "
+             "atomic memcpy into sequence of atomic loads and stores"));
+
 /// Return the specified type promoted as it would be to pass though a va_arg
 /// area.
 static Type *getPromotedType(Type *Ty) {
@@ -108,6 +114,72 @@
   return ConstantVector::get(BoolVec);
 }
 
+/// Try to simplify an element atomic memcpy intrinsic.
+/// Return true if changes were made and false otherwise.
+/// All newly inserted instructions will be added to the worklist.
+bool
+InstCombiner::SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI) {
+  // Try to unfold this intrinsic into a sequence of explicit atomic loads and
+  // stores.
+  // First check that the number of elements is a compile-time constant.
+  ConstantInt *NumElementsCI = dyn_cast<ConstantInt>(AMI->getNumElements());
+  if (!NumElementsCI)
+    return false;
+
+  // Check that there are not too many elements.
+  uint64_t NumElements = NumElementsCI->getZExtValue();
+  if (NumElements >= UnfoldElementAtomicMemcpyThreshold)
+    return false;
+
+  // Don't unfold into illegal integers.
+  uint64_t ElementSize = AMI->getElementSizeInBytes() * 8;
+  if (!AMI->getModule()->getDataLayout().isLegalInteger(ElementSize))
+    return false;
+
+  // Cast source and destination to the correct type. Intrinsic input arguments
+  // are usually represented as i8*.
+  // Often operands will be explicitly cast to i8* and we can just strip
+  // those casts instead of inserting new ones. However, it's easier to rely on
+  // other InstCombine rules, which will cover trivial cases anyway.
+  Value *Src = AMI->getRawSource();
+  Value *Dst = AMI->getRawDest();
+  Type *ElementPointerType = Type::getIntNPtrTy(
+      AMI->getContext(), ElementSize, Src->getType()->getPointerAddressSpace());
+
+  Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType,
+                                                "memcpy_unfold.src_casted");
+  Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType,
+                                                "memcpy_unfold.dst_casted");
+
+  for (uint64_t i = 0; i < NumElements; ++i) {
+    // Get the current element addresses.
+    ConstantInt *ElementIdxCI =
+        ConstantInt::get(AMI->getContext(), APInt(64, i));
+    Value *SrcElementAddr =
+        Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
+    Value *DstElementAddr =
+        Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
+
+    // Load from the source. Transfer alignment information and mark the load
+    // as unordered atomic.
+    LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val");
+    Load->setOrdering(AtomicOrdering::Unordered);
+    Load->setAlignment(AMI->getSrcAlignment());
+    Load->setDebugLoc(AMI->getDebugLoc());
+
+    // Store the loaded value via an unordered atomic store.
+    StoreInst *Store = Builder->CreateStore(Load, DstElementAddr);
+    Store->setOrdering(AtomicOrdering::Unordered);
+    Store->setAlignment(AMI->getDstAlignment());
+    Store->setDebugLoc(AMI->getDebugLoc());
+  }
+
+  // Explicitly remove the stale intrinsic because we can't DCE it.
+  eraseInstFromFunction(*AMI);
+
+  return true;
+}
+
 Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
   unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
   unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
@@ -1462,6 +1534,13 @@
     if (Changed) return II;
   }
 
+  if (auto *AMI = dyn_cast<ElementAtomicMemCpyInst>(II)) {
+    if (SimplifyElementAtomicMemCpy(AMI))
+      // SimplifyElementAtomicMemCpy already added all relevant instructions
+      // to the worklist.
+      return nullptr;
+  }
+
   auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width,
                                               unsigned DemandedWidth) {
     APInt UndefElts(Width, 0);
Index: lib/Transforms/InstCombine/InstCombineInternal.h
===================================================================
--- lib/Transforms/InstCombine/InstCombineInternal.h
+++ lib/Transforms/InstCombine/InstCombineInternal.h
@@ -638,6 +638,8 @@
   Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
   Instruction *MatchBSwap(BinaryOperator &I);
   bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
+
+  bool SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI);
  Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
  Instruction *SimplifyMemSet(MemSetInst *MI);
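
Note (not part of the patch): below is a minimal sketch of how other code could query the
new ElementAtomicMemCpyInst wrapper once this lands. The classof overloads added in
IntrinsicInst.h are what let dyn_cast recognize calls to the intrinsic; the helper
function, its name, and the surrounding loop are hypothetical and only illustrate the
new accessors.

  #include "llvm/IR/Function.h"
  #include "llvm/IR/InstIterator.h"
  #include "llvm/IR/IntrinsicInst.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  // Hypothetical helper: report every element-wise atomic memcpy in a function.
  static void dumpElementAtomicMemCpys(Function &F) {
    for (Instruction &I : instructions(F)) {
      // dyn_cast works here because of the classof(const Value *) overload above.
      if (auto *AMI = dyn_cast<ElementAtomicMemCpyInst>(&I))
        errs() << "element atomic memcpy: element size "
               << AMI->getElementSizeInBytes() << " bytes, element count "
               << *AMI->getNumElements() << "\n";
    }
  }

For exercising the combine itself, opt's -instcombine pass picks it up automatically, and
the element-atomic-memcpy-threshold flag introduced above can be lowered (for example,
-element-atomic-memcpy-threshold=4) to check that larger copies are left untouched.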