Index: include/llvm/IR/IntrinsicInst.h
===================================================================
--- include/llvm/IR/IntrinsicInst.h
+++ include/llvm/IR/IntrinsicInst.h
@@ -152,6 +152,33 @@
     }
   };
 
+  /// This class represents the atomic memcpy intrinsic.
+  /// TODO: Integrate this class into the MemIntrinsic hierarchy.
+  class ElementAtomicMemCpyInst : public IntrinsicInst {
+  public:
+    Value *getRawDest() const { return getArgOperand(0); }
+    Value *getRawSource() const { return getArgOperand(1); }
+
+    Value *getNumElements() const { return getArgOperand(2); }
+    void setNumElements(Value *V) { setArgOperand(2, V); }
+
+    uint64_t getSrcAlignment() const { return getParamAlignment(2); }
+    uint64_t getDstAlignment() const { return getParamAlignment(1); }
+
+    uint64_t getElementSizeInBytes() const {
+      Value *Arg = getArgOperand(3);
+      assert(isa<ConstantInt>(Arg));
+      return cast<ConstantInt>(Arg)->getZExtValue();
+    }
+
+    static inline bool classof(const IntrinsicInst *I) {
+      return I->getIntrinsicID() == Intrinsic::memcpy_element_atomic;
+    }
+    static inline bool classof(const Value *V) {
+      return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+    }
+  };
+
   /// This is the common base class for memset/memcpy/memmove.
   class MemIntrinsic : public IntrinsicInst {
   public:
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -60,6 +60,13 @@
 
 STATISTIC(NumSimplified, "Number of library calls simplified");
 
+static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements(
+    "unfold-element-atomic-memcpy-max-elements",
+    cl::init(16),
+    cl::desc("Maximum number of elements in an atomic memcpy that the "
+             "optimizer is allowed to unfold into a sequence of explicit "
+             "loads and stores"));
+
 /// Return the specified type promoted as it would be to pass though a va_arg
 /// area.
 static Type *getPromotedType(Type *Ty) {
@@ -108,6 +115,76 @@
   return ConstantVector::get(BoolVec);
 }
 
+Instruction *
+InstCombiner::SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI) {
+  // Try to unfold this intrinsic into a sequence of explicit atomic loads
+  // and stores.
+  // First, check that the number of elements is a compile-time constant.
+  ConstantInt *NumElementsCI = dyn_cast<ConstantInt>(AMI->getNumElements());
+  if (!NumElementsCI)
+    return nullptr;
+
+  // Check that there are not too many elements.
+  uint64_t NumElements = NumElementsCI->getZExtValue();
+  if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
+    return nullptr;
+
+  // Don't unfold into illegal integers.
+  uint64_t ElementSize = AMI->getElementSizeInBytes() * 8;
+  if (!AMI->getModule()->getDataLayout().isLegalInteger(ElementSize))
+    return nullptr;
+
+  // Cast source and destination to the correct type. Intrinsic input
+  // arguments are usually represented as i8*.
+  // Often operands will be explicitly cast to i8*, and we could simply strip
+  // those casts instead of inserting new ones. However, it is easier to rely
+  // on other InstCombine rules, which cover such trivial cases anyway.
+  Value *Src = AMI->getRawSource();
+  Value *Dst = AMI->getRawDest();
+  Type *ElementPointerType = Type::getIntNPtrTy(
+      AMI->getContext(), ElementSize, Src->getType()->getPointerAddressSpace());
+
+  Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType,
+                                                "memcpy_unfold.src_casted");
+  Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType,
+                                                "memcpy_unfold.dst_casted");
+
+  for (uint64_t i = 0; i < NumElements; ++i) {
+    // Get current element addresses.
+    ConstantInt *ElementIdxCI =
+        ConstantInt::get(AMI->getContext(), APInt(64, i));
+    Value *SrcElementAddr =
+        Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
+    Value *DstElementAddr =
+        Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
+
+    // Load from the source. Transfer alignment information and mark the load
+    // as unordered atomic.
+    LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val");
+    Load->setOrdering(AtomicOrdering::Unordered);
+    // We know the alignment of the first element. The verifier also
+    // guarantees that the element size is less than or equal to the first
+    // element's alignment and that both of these values are powers of two.
+    // This means that all subsequent accesses are at least element-size
+    // aligned.
+    Load->setAlignment(i == 0 ? AMI->getSrcAlignment()
+                              : AMI->getElementSizeInBytes());
+    Load->setDebugLoc(AMI->getDebugLoc());
+
+    // Store the loaded value via an unordered atomic store.
+    StoreInst *Store = Builder->CreateStore(Load, DstElementAddr);
+    Store->setOrdering(AtomicOrdering::Unordered);
+    Store->setAlignment(i == 0 ? AMI->getDstAlignment()
+                               : AMI->getElementSizeInBytes());
+    Store->setDebugLoc(AMI->getDebugLoc());
+  }
+
+  // Set the number of elements of the copy to 0; it will be deleted on the
+  // next iteration.
+  AMI->setNumElements(Constant::getNullValue(NumElementsCI->getType()));
+  return AMI;
+}
+
 Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
   unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
   unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
@@ -1462,6 +1539,15 @@
     if (Changed) return II;
   }
 
+  if (auto *AMI = dyn_cast<ElementAtomicMemCpyInst>(II)) {
+    if (Constant *C = dyn_cast<Constant>(AMI->getNumElements()))
+      if (C->isNullValue())
+        return eraseInstFromFunction(*AMI);
+
+    if (Instruction *I = SimplifyElementAtomicMemCpy(AMI))
+      return I;
+  }
+
   auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width,
                                               unsigned DemandedWidth) {
     APInt UndefElts(Width, 0);
Index: lib/Transforms/InstCombine/InstCombineInternal.h
===================================================================
--- lib/Transforms/InstCombine/InstCombineInternal.h
+++ lib/Transforms/InstCombine/InstCombineInternal.h
@@ -638,6 +638,8 @@
   Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
   Instruction *MatchBSwap(BinaryOperator &I);
   bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
+
+  Instruction *SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI);
   Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
   Instruction *SimplifyMemSet(MemSetInst *MI);
Index: test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll
@@ -0,0 +1,92 @@
+; RUN: opt -instcombine -unfold-element-atomic-memcpy-max-elements=8 -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test basic unfolding.
+define void @test1(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test1
+; CHECK-NOT: llvm.memcpy.element.atomic
+
+; CHECK-DAG: %memcpy_unfold.src_casted = bitcast i8* %Src to i32*
+; CHECK-DAG: %memcpy_unfold.dst_casted = bitcast i8* %Dst to i32*
+
+; CHECK-DAG: [[VAL1:%[^\s]+]] = load atomic i32, i32* %memcpy_unfold.src_casted unordered, align 8
+; CHECK-DAG: store atomic i32 [[VAL1]], i32* %memcpy_unfold.dst_casted unordered, align 4
+
+; CHECK-DAG: [[VAL2:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
+; CHECK-DAG: store atomic i32 [[VAL2]], i32* %{{[^\s]+}} unordered, align 4
+
+; CHECK-DAG: [[VAL3:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
+; CHECK-DAG: store atomic i32 [[VAL3]], i32* %{{[^\s]+}} unordered, align 4
+
+; CHECK-DAG: [[VAL4:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
+; CHECK-DAG: store atomic i32 [[VAL4]], i32* %{{[^\s]+}} unordered, align 4
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 8 %Src, i64 4, i32 4)
+  ret void
+}
+
+; Test that we don't unfold too much.
+define void @test2(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test2
+
+; CHECK-NOT: load
+; CHECK-NOT: store
+; CHECK: llvm.memcpy.element.atomic
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 4 %Src, i64 1000, i32 4)
+  ret void
+}
+
+; Test that we will not unfold into non-native integers.
+define void @test3(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test3
+
+; CHECK-NOT: load
+; CHECK-NOT: store
+; CHECK: llvm.memcpy.element.atomic
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 4, i32 64)
+  ret void
+}
+
+; Test that we will eliminate redundant bitcasts.
+define void @test4(i64* %Src, i64* %Dst) {
+; CHECK-LABEL: test4
+; CHECK-NOT: llvm.memcpy.element.atomic
+
+; CHECK-NOT: bitcast
+
+; CHECK-DAG: [[VAL1:%[^\s]+]] = load atomic i64, i64* %Src unordered, align 16
+; CHECK-DAG: store atomic i64 [[VAL1]], i64* %Dst unordered, align 16
+
+; CHECK-DAG: [[SRC_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Src, i64 1
+; CHECK-DAG: [[DST_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 1
+; CHECK-DAG: [[VAL2:%[^\s]+]] = load atomic i64, i64* [[SRC_ADDR2]] unordered, align 8
+; CHECK-DAG: store atomic i64 [[VAL2]], i64* [[DST_ADDR2]] unordered, align 8
+
+; CHECK-DAG: [[SRC_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Src, i64 2
+; CHECK-DAG: [[DST_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 2
+; CHECK-DAG: [[VAL3:%[^ ]+]] = load atomic i64, i64* [[SRC_ADDR3]] unordered, align 8
+; CHECK-DAG: store atomic i64 [[VAL3]], i64* [[DST_ADDR3]] unordered, align 8
+
+; CHECK-DAG: [[SRC_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Src, i64 3
+; CHECK-DAG: [[DST_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 3
+; CHECK-DAG: [[VAL4:%[^ ]+]] = load atomic i64, i64* [[SRC_ADDR4]] unordered, align 8
+; CHECK-DAG: store atomic i64 [[VAL4]], i64* [[DST_ADDR4]] unordered, align 8
entry:
+  %Src.casted = bitcast i64* %Src to i8*
+  %Dst.casted = bitcast i64* %Dst to i8*
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 16 %Dst.casted, i8* align 16 %Src.casted, i64 4, i32 8)
+  ret void
+}
+
+define void @test5(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test5
+
+; CHECK-NOT: llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64)
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64)
+  ret void
+}
+
+declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* nocapture, i8* nocapture, i64, i32)
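
For reference, a hand-written before/after sketch of the unfolding for a two-element copy with 4-byte elements and 4-byte-aligned pointers. This is illustrative only: the function name, value names, and exact instruction order are made up rather than verbatim opt output, and the extra GEPs emitted for element 0 are assumed to be cleaned up by InstCombine itself. The datalayout line matters because the transform bails out unless the element type is a legal (native) integer.

Before -instcombine:

  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

  define void @copy2(i8* %Dst, i8* %Src) {
  entry:
    call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 4 %Src, i64 2, i32 4)
    ret void
  }

  declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* nocapture, i8* nocapture, i64, i32)

After -instcombine (roughly):

  define void @copy2(i8* %Dst, i8* %Src) {
  entry:
    %src = bitcast i8* %Src to i32*
    %dst = bitcast i8* %Dst to i32*
    %v0 = load atomic i32, i32* %src unordered, align 4
    store atomic i32 %v0, i32* %dst unordered, align 4
    %src1 = getelementptr i32, i32* %src, i64 1
    %dst1 = getelementptr i32, i32* %dst, i64 1
    %v1 = load atomic i32, i32* %src1 unordered, align 4
    store atomic i32 %v1, i32* %dst1 unordered, align 4
    ret void
  }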