Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -73,19 +73,12 @@
 STATISTIC(NumSimplified, "Number of library calls simplified");
-static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements(
-    "unfold-element-atomic-memcpy-max-elements",
-    cl::init(16),
-    cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
-             "allowed to unfold"));
-
 static cl::opt<unsigned> GuardWideningWindow(
     "instcombine-guard-widening-window", cl::init(3),
     cl::desc("How wide an instruction window to bypass looking for "
              "another guard"));
-
 /// Return the specified type promoted as it would be to pass though a va_arg
 /// area.
 static Type *getPromotedType(Type *Ty) {
@@ -113,84 +106,7 @@
   return ConstantVector::get(BoolVec);
 }
-Instruction *
-InstCombiner::SimplifyElementUnorderedAtomicMemCpy(AtomicMemCpyInst *AMI) {
-  // Try to unfold this intrinsic into sequence of explicit atomic loads and
-  // stores.
-  // First check that number of elements is compile time constant.
-  auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength());
-  if (!LengthCI)
-    return nullptr;
-
-  // Check that there are not too many elements.
-  uint64_t LengthInBytes = LengthCI->getZExtValue();
-  uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes();
-  uint64_t NumElements = LengthInBytes / ElementSizeInBytes;
-  if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
-    return nullptr;
-
-  // Only expand if there are elements to copy.
-  if (NumElements > 0) {
-    // Don't unfold into illegal integers
-    uint64_t ElementSizeInBits = ElementSizeInBytes * 8;
-    if (!getDataLayout().isLegalInteger(ElementSizeInBits))
-      return nullptr;
-
-    // Cast source and destination to the correct type. Intrinsic input
-    // arguments are usually represented as i8*. Often operands will be
-    // explicitly casted to i8* and we can just strip those casts instead of
-    // inserting new ones. However it's easier to rely on other InstCombine
-    // rules which will cover trivial cases anyway.
-    Value *Src = AMI->getRawSource();
-    Value *Dst = AMI->getRawDest();
-    Type *ElementPointerType =
-        Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
-                           Src->getType()->getPointerAddressSpace());
-
-    Value *SrcCasted = Builder.CreatePointerCast(Src, ElementPointerType,
-                                                 "memcpy_unfold.src_casted");
-    Value *DstCasted = Builder.CreatePointerCast(Dst, ElementPointerType,
-                                                 "memcpy_unfold.dst_casted");
-
-    for (uint64_t i = 0; i < NumElements; ++i) {
-      // Get current element addresses
-      ConstantInt *ElementIdxCI =
-          ConstantInt::get(AMI->getContext(), APInt(64, i));
-      Value *SrcElementAddr =
-          Builder.CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
-      Value *DstElementAddr =
-          Builder.CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
-
-      // Load from the source. Transfer alignment information and mark load as
-      // unordered atomic.
-      LoadInst *Load = Builder.CreateLoad(SrcElementAddr, "memcpy_unfold.val");
-      Load->setOrdering(AtomicOrdering::Unordered);
-      // We know alignment of the first element. It is also guaranteed by the
-      // verifier that element size is less or equal than first element
-      // alignment and both of this values are powers of two. This means that
-      // all subsequent accesses are at least element size aligned.
-      // TODO: We can infer better alignment but there is no evidence that this
-      // will matter.
-      Load->setAlignment(i == 0 ? AMI->getParamAlignment(1)
-                                : ElementSizeInBytes);
-      Load->setDebugLoc(AMI->getDebugLoc());
-
-      // Store loaded value via unordered atomic store.
-      StoreInst *Store = Builder.CreateStore(Load, DstElementAddr);
-      Store->setOrdering(AtomicOrdering::Unordered);
-      Store->setAlignment(i == 0 ? AMI->getParamAlignment(0)
-                                 : ElementSizeInBytes);
-      Store->setDebugLoc(AMI->getDebugLoc());
-    }
-  }
-
-  // Set the number of elements of the copy to 0, it will be deleted on the
-  // next iteration.
-  AMI->setLength(Constant::getNullValue(LengthCI->getType()));
-  return AMI;
-}
-
-Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
+Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
   unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
   unsigned CopyDstAlign = MI->getDestAlignment();
   if (CopyDstAlign < DstAlign){
@@ -198,17 +114,16 @@
     return MI;
   }
-  auto* MTI = cast<MemTransferInst>(MI);
-  unsigned SrcAlign = getKnownAlignment(MTI->getRawSource(), DL, MI, &AC, &DT);
-  unsigned CopySrcAlign = MTI->getSourceAlignment();
+  unsigned SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
+  unsigned CopySrcAlign = MI->getSourceAlignment();
   if (CopySrcAlign < SrcAlign) {
-    MTI->setSourceAlignment(SrcAlign);
+    MI->setSourceAlignment(SrcAlign);
     return MI;
   }
   // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
   // load/store.
-  ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2));
+  ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
   if (!MemOpLength) return nullptr;
   // Source and destination pointer types are always "i8*" for intrinsic. See
@@ -250,7 +165,7 @@
   Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
   Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
-  LoadInst *L = Builder.CreateLoad(Src, MI->isVolatile());
+  LoadInst *L = Builder.CreateLoad(Src);
   // Alignment from the mem intrinsic will be better, so use it.
   L->setAlignment(CopySrcAlign);
   if (CopyMD)
@@ -260,7 +175,7 @@
   if (LoopMemParallelMD)
     L->setMetadata(LLVMContext::MD_mem_parallel_loop_access,
                    LoopMemParallelMD);
-  StoreInst *S = Builder.CreateStore(L, Dest, MI->isVolatile());
+  StoreInst *S = Builder.CreateStore(L, Dest);
   // Alignment from the mem intrinsic will be better, so use it.
   S->setAlignment(CopyDstAlign);
   if (CopyMD)
@@ -268,8 +183,19 @@
   if (LoopMemParallelMD)
     S->setMetadata(LLVMContext::MD_mem_parallel_loop_access,
                    LoopMemParallelMD);
+  if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
+    // non-atomics can be volatile
+    L->setVolatile(MT->isVolatile());
+    S->setVolatile(MT->isVolatile());
+  }
+  if (isa<AtomicMemTransferInst>(MI)) {
+    // atomics have to be unordered
+    L->setOrdering(AtomicOrdering::Unordered);
+    S->setOrdering(AtomicOrdering::Unordered);
+  }
+
   // Set the size of the copy to 0, it will be deleted on the next iteration.
-  MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
+  MI->setLength(Constant::getNullValue(MemOpLength->getType()));
   return MI;
 }
@@ -1781,7 +1707,7 @@
   // Intrinsics cannot occur in an invoke, so handle them here instead of in
   // visitCallSite.
-  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {
+  if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) {
     bool Changed = false;
     // memmove/cpy/set of zero bytes is a noop.
@@ -1798,17 +1724,21 @@
     }
     // No other transformations apply to volatile transfers.
-    if (MI->isVolatile())
-      return nullptr;
+    if (auto *M = dyn_cast<MemIntrinsic>(MI))
+      if (M->isVolatile())
+        return nullptr;
     // If we have a memmove and the source operation is a constant global,
     // then the source and dest pointers can't alias, so we can change this
     // into a call to memcpy.
-    if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) {
+    if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) {
       if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
        if (GVSrc->isConstant()) {
          Module *M = CI.getModule();
-          Intrinsic::ID MemCpyID = Intrinsic::memcpy;
+          Intrinsic::ID MemCpyID =
+              isa<AtomicMemMoveInst>(MMI)
+                  ? Intrinsic::memcpy_element_unordered_atomic
+                  : Intrinsic::memcpy;
          Type *Tys[3] = { CI.getArgOperand(0)->getType(),
                           CI.getArgOperand(1)->getType(),
                           CI.getArgOperand(2)->getType() };
@@ -1817,7 +1747,7 @@
        }
     }
-    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+    if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
      // memmove(x,x,size) -> noop.
      if (MTI->getSource() == MTI->getDest())
        return eraseInstFromFunction(CI);
@@ -1825,8 +1755,8 @@
    // If we can determine a pointer alignment that is bigger than currently
    // set, update the alignment.
-    if (isa<MemTransferInst>(MI)) {
-      if (Instruction *I = SimplifyMemTransfer(MI))
+    if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
+      if (Instruction *I = SimplifyAnyMemTransfer(MTI))
        return I;
    } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {
      if (Instruction *I = SimplifyMemSet(MSI))
@@ -1836,15 +1766,6 @@
    if (Changed) return II;
  }
-  if (auto *AMI = dyn_cast<AtomicMemCpyInst>(II)) {
-    if (Constant *C = dyn_cast<Constant>(AMI->getLength()))
-      if (C->isNullValue())
-        return eraseInstFromFunction(*AMI);
-
-    if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI))
-      return I;
-  }
-
  if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
    return I;
Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineInternal.h
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineInternal.h
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -824,8 +824,7 @@
   Instruction *MatchBSwap(BinaryOperator &I);
   bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
-  Instruction *SimplifyElementUnorderedAtomicMemCpy(AtomicMemCpyInst *AMI);
-  Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
+  Instruction *SimplifyAnyMemTransfer(AnyMemTransferInst *MI);
   Instruction *SimplifyMemSet(MemSetInst *MI);
   Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned);
Index: llvm/trunk/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll
+++ llvm/trunk/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll
@@ -1,94 +0,0 @@
-; RUN: opt -instcombine -unfold-element-atomic-memcpy-max-elements=8 -S < %s | FileCheck %s
-; Temporarily an expected failure until inst combine is updated in the next patch
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; Test basic unfolding -- unordered load & store
-define void @test1a(i8* %Src, i8* %Dst) {
-; CHECK-LABEL: test1a
-; CHECK-NOT: llvm.memcpy.element.unordered.atomic
-
-; CHECK-DAG: %memcpy_unfold.src_casted = bitcast i8* %Src to i32*
-; CHECK-DAG: %memcpy_unfold.dst_casted = bitcast i8* %Dst to i32*
-
-; CHECK-DAG: [[VAL1:%[^\s]+]] = load atomic i32, i32* %memcpy_unfold.src_casted unordered, align 4
-; CHECK-DAG: store atomic i32 [[VAL1]], i32* %memcpy_unfold.dst_casted unordered, align 8
-
-; CHECK-DAG: [[VAL2:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
-; CHECK-DAG: store atomic i32 [[VAL2]], i32* %{{[^\s]+}} unordered, align 4
-
-; CHECK-DAG: [[VAL3:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
-; CHECK-DAG: store atomic i32 [[VAL3]], i32* %{{[^\s]+}} unordered, align 4
-
-; CHECK-DAG: [[VAL4:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
-; CHECK-DAG: store atomic i32 [[VAL4]], i32* %{{[^\s]+}} unordered, align 4
-entry:
-  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %Dst, i8* align 4 %Src, i32 16, i32 4)
-  ret void
-}
-
-; Test that we don't unfold too much
-define void @test2(i8* %Src, i8* %Dst) {
-; CHECK-LABEL: test2
-
-; CHECK-NOT: load
-; CHECK-NOT: store
-; CHECK: llvm.memcpy.element.unordered.atomic
-entry:
-  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %Dst, i8* align 4 %Src, i32 256, i32 4)
-  ret void
-}
-
-; Test that we will not unfold into non native integers
-define void @test3(i8* %Src, i8* %Dst) {
-; CHECK-LABEL: test3
-
-; CHECK-NOT: load
-; CHECK-NOT: store
-; CHECK: llvm.memcpy.element.unordered.atomic
-entry:
-  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 64 %Dst, i8* align 64 %Src, i32 64, i32 64)
-  ret void
-}
-
-; Test that we will eliminate redundant bitcasts
-define void @test4(i64* %Src, i64* %Dst) {
-; CHECK-LABEL: test4
-; CHECK-NOT: llvm.memcpy.element.unordered.atomic
-
-; CHECK-NOT: bitcast
-
-; CHECK-DAG: [[VAL1:%[^\s]+]] = load atomic i64, i64* %Src unordered, align 16
-; CHECK-DAG: store atomic i64 [[VAL1]], i64* %Dst unordered, align 16
-
-; CHECK-DAG: [[SRC_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Src, i64 1
-; CHECK-DAG: [[DST_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 1
-; CHECK-DAG: [[VAL2:%[^\s]+]] = load atomic i64, i64* [[SRC_ADDR2]] unordered, align 8
-; CHECK-DAG: store atomic i64 [[VAL2]], i64* [[DST_ADDR2]] unordered, align 8
-
-; CHECK-DAG: [[SRC_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Src, i64 2
-; CHECK-DAG: [[DST_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 2
-; CHECK-DAG: [[VAL3:%[^ ]+]] = load atomic i64, i64* [[SRC_ADDR3]] unordered, align 8
-; CHECK-DAG: store atomic i64 [[VAL3]], i64* [[DST_ADDR3]] unordered, align 8
-
-; CHECK-DAG: [[SRC_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Src, i64 3
-; CHECK-DAG: [[DST_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 3
-; CHECK-DAG: [[VAL4:%[^ ]+]] = load atomic i64, i64* [[SRC_ADDR4]] unordered, align 8
-; CHECK-DAG: store atomic i64 [[VAL4]], i64* [[DST_ADDR4]] unordered, align 8
-entry:
-  %Src.casted = bitcast i64* %Src to i8*
-  %Dst.casted = bitcast i64* %Dst to i8*
-  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %Dst.casted, i8* align 16 %Src.casted, i32 32, i32 8)
-  ret void
-}
-
-; Test that 0-length unordered atomic memcpy gets removed.
-define void @test5(i8* %Src, i8* %Dst) {
-; CHECK-LABEL: test5
-
-; CHECK-NOT: llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 64 %Dst, i8* align 64 %Src, i32 0, i32 8)
-entry:
-  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 64 %Dst, i8* align 64 %Src, i32 0, i32 8)
-  ret void
-}
-
-declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
Index: llvm/trunk/test/Transforms/InstCombine/element-atomic-memintrins.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/element-atomic-memintrins.ll
+++ llvm/trunk/test/Transforms/InstCombine/element-atomic-memintrins.ll
@@ -1,33 +1,32 @@
-;; Placeholder tests that will fail once element atomic @llvm.mem[move|set] instrinsics have
-;; been added to the MemIntrinsic class hierarchy. These will act as a reminder to
-;; verify that inst combine handles these intrinsics properly once they have been
-;; added to that class hierarchy.
-
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -instcombine -S < %s | FileCheck %s
 ;; ---- memset -----
-; Ensure 0-length memset isn't removed
+; Ensure 0-length memset is removed
 define void @test_memset_zero_length(i8* %dest) {
-  ; CHECK-LABEL: test_memset_zero_length
-  ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1)
-  ; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_memset_zero_length(
+; CHECK-NEXT: ret void
+;
   call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1)
   ret void
 }
-; Ensure that small-sized memsets don't convert to stores
+; Placeholder test. This will change once support for lowering atomic memsets is added to instcombine.
 define void @test_memset_to_store(i8* %dest) {
-  ; CHECK-LABEL: test_memset_to_store
-  ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1)
-  ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1)
-  ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1)
-  ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1)
-  ; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_memset_to_store(
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST:%.*]], i8 1, i32 1, i32 1)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 2, i32 1)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 4, i32 1)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 8, i32 1)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 16, i32 1)
+; CHECK-NEXT: ret void
+;
   call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1)
   call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1)
   call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1)
   call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1)
+  call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 16, i32 1)
   ret void
 }
@@ -37,41 +36,35 @@
 ;; =========================================
 ;; ----- memmove ------
-; memmove from a global constant source does not become memcpy
-@gconst = constant [8 x i8] c"0123456\00"
+
+@gconst = constant [32 x i8] c"0123456789012345678901234567890\00"
+; Check that a memmove from a global constant is converted into a memcpy
 define void @test_memmove_to_memcpy(i8* %dest) {
-  ; CHECK-LABEL: test_memmove_to_memcpy
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1)
-  ; CHECK-NEXT: ret void
-  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1)
+; CHECK-LABEL: @test_memmove_to_memcpy(
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST:%.*]], i8* align 16 getelementptr inbounds ([32 x i8], [32 x i8]* @gconst, i64 0, i64 0), i32 32, i32 1)
+; CHECK-NEXT: ret void
+;
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([32 x i8], [32 x i8]* @gconst, i64 0, i64 0), i32 32, i32 1)
   ret void
 }
 define void @test_memmove_zero_length(i8* %dest, i8* %src) {
-  ; CHECK-LABEL: test_memmove_zero_length
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16)
-  ; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_memmove_zero_length(
+; CHECK-NEXT: ret void
+;
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16)
-  ret void
+  ret void
 }
 ; memmove with src==dest is removed
 define void @test_memmove_removed(i8* %srcdest, i32 %sz) {
-  ; CHECK-LABEL: test_memmove_removed
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16)
-  ; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_memmove_removed(
+; CHECK-NEXT: ret void
+;
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4)
@@ -82,17 +75,220 @@
 ; memmove with a small constant length is converted to a load/store pair
 define void @test_memmove_loadstore(i8* %dest, i8* %src) {
-  ; CHECK-LABEL: test_memmove_loadstore
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1)
-  ; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_memmove_loadstore(
+; CHECK-NEXT: [[TMP1:%.*]] = load atomic i8, i8* [[SRC:%.*]] unordered, align 1
+; CHECK-NEXT: store atomic i8 [[TMP1]], i8* [[DEST:%.*]] unordered, align 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC]] to i16*
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[DEST]] to i16*
+; CHECK-NEXT: [[TMP4:%.*]] = load atomic i16, i16* [[TMP2]] unordered, align 1
+; CHECK-NEXT: store atomic i16 [[TMP4]], i16* [[TMP3]] unordered, align 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[SRC]] to i32*
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[DEST]] to i32*
+; CHECK-NEXT: [[TMP7:%.*]] = load atomic i32, i32* [[TMP5]] unordered, align 1
+; CHECK-NEXT: store atomic i32 [[TMP7]], i32* [[TMP6]] unordered, align 1
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT: [[TMP10:%.*]] = load atomic i64, i64* [[TMP8]] unordered, align 1
+; CHECK-NEXT: store atomic i64 [[TMP10]], i64* [[TMP9]] unordered, align 1
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 16, i32 1)
+; CHECK-NEXT: ret void
+;
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1)
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 16, i32 1)
+  ret void
+}
+
+define void @test_memmove_loadstore_2(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memmove_loadstore_2(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i16*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i16*
+; CHECK-NEXT: [[TMP3:%.*]] = load atomic i16, i16* [[TMP1]] unordered, align 2
+; CHECK-NEXT: store atomic i16 [[TMP3]], i16* [[TMP2]] unordered, align 2
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i32*
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i32*
+; CHECK-NEXT: [[TMP6:%.*]] = load atomic i32, i32* [[TMP4]] unordered, align 2
+; CHECK-NEXT: store atomic i32 [[TMP6]], i32* [[TMP5]] unordered, align 2
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT: [[TMP9:%.*]] = load atomic i64, i64* [[TMP7]] unordered, align 2
+; CHECK-NEXT: store atomic i64 [[TMP9]], i64* [[TMP8]] unordered, align 2
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 [[DEST]], i8* align 2 [[SRC]], i32 16, i32 2)
+; CHECK-NEXT: ret void
+;
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 2, i32 2)
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 4, i32 2)
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 8, i32 2)
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 16, i32 2)
+  ret void
+}
+
+define void @test_memmove_loadstore_4(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memmove_loadstore_4(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i32*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i32*
+; CHECK-NEXT: [[TMP3:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
+; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[TMP2]] unordered, align 4
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT: [[TMP6:%.*]] = load atomic i64, i64* [[TMP4]] unordered, align 4
+; CHECK-NEXT: store atomic i64 [[TMP6]], i64* [[TMP5]] unordered, align 4
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 [[DEST]], i8* align 4 [[SRC]], i32 16, i32 4)
+; CHECK-NEXT: ret void
+;
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 4, i32 4)
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 8, i32 4)
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 16, i32 4)
+  ret void
+}
+
+define void @test_memmove_loadstore_8(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memmove_loadstore_8(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i64*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i64*
+; CHECK-NEXT: [[TMP3:%.*]] = load atomic i64, i64* [[TMP1]] unordered, align 8
+; CHECK-NEXT: store atomic i64 [[TMP3]], i64* [[TMP2]] unordered, align 8
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 [[DEST]], i8* align 8 [[SRC]], i32 16, i32 8)
+; CHECK-NEXT: ret void
+;
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 8, i32 8)
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 16, i32 8)
+  ret void
+}
+
+define void @test_memmove_loadstore_16(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memmove_loadstore_16(
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[DEST:%.*]], i8* align 16 [[SRC:%.*]], i32 16, i32 16)
+; CHECK-NEXT: ret void
+;
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 16, i32 16)
   ret void
 }
 declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32) nounwind argmemonly
+
+;; =========================================
+;; ----- memcpy ------
+
+define void @test_memcpy_zero_length(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_zero_length(
+; CHECK-NEXT: ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16)
+  ret void
+}
+
+; memcpy with src==dest is removed
+define void @test_memcpy_removed(i8* %srcdest, i32 %sz) {
+; CHECK-LABEL: @test_memcpy_removed(
+; CHECK-NEXT: ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16)
+  ret void
+}
+
+; memcpy with a small constant length is converted to a load/store pair
+define void @test_memcpy_loadstore(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_loadstore(
+; CHECK-NEXT: [[TMP1:%.*]] = load atomic i8, i8* [[SRC:%.*]] unordered, align 1
+; CHECK-NEXT: store atomic i8 [[TMP1]], i8* [[DEST:%.*]] unordered, align 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC]] to i16*
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[DEST]] to i16*
+; CHECK-NEXT: [[TMP4:%.*]] = load atomic i16, i16* [[TMP2]] unordered, align 1
+; CHECK-NEXT: store atomic i16 [[TMP4]], i16* [[TMP3]] unordered, align 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[SRC]] to i32*
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[DEST]] to i32*
+; CHECK-NEXT: [[TMP7:%.*]] = load atomic i32, i32* [[TMP5]] unordered, align 1
+; CHECK-NEXT: store atomic i32 [[TMP7]], i32* [[TMP6]] unordered, align 1
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT: [[TMP10:%.*]] = load atomic i64, i64* [[TMP8]] unordered, align 1
+; CHECK-NEXT: store atomic i64 [[TMP10]], i64* [[TMP9]] unordered, align 1
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 16, i32 1)
+; CHECK-NEXT: ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 16, i32 1)
+  ret void
+}
+
+define void @test_memcpy_loadstore_2(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_loadstore_2(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i16*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i16*
+; CHECK-NEXT: [[TMP3:%.*]] = load atomic i16, i16* [[TMP1]] unordered, align 2
+; CHECK-NEXT: store atomic i16 [[TMP3]], i16* [[TMP2]] unordered, align 2
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i32*
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i32*
+; CHECK-NEXT: [[TMP6:%.*]] = load atomic i32, i32* [[TMP4]] unordered, align 2
+; CHECK-NEXT: store atomic i32 [[TMP6]], i32* [[TMP5]] unordered, align 2
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT: [[TMP9:%.*]] = load atomic i64, i64* [[TMP7]] unordered, align 2
+; CHECK-NEXT: store atomic i64 [[TMP9]], i64* [[TMP8]] unordered, align 2
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 [[DEST]], i8* align 2 [[SRC]], i32 16, i32 2)
+; CHECK-NEXT: ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 2, i32 2)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 4, i32 2)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 8, i32 2)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 16, i32 2)
+  ret void
+}
+
+define void @test_memcpy_loadstore_4(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_loadstore_4(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i32*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i32*
+; CHECK-NEXT: [[TMP3:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
+; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[TMP2]] unordered, align 4
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT: [[TMP6:%.*]] = load atomic i64, i64* [[TMP4]] unordered, align 4
+; CHECK-NEXT: store atomic i64 [[TMP6]], i64* [[TMP5]] unordered, align 4
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 [[DEST]], i8* align 4 [[SRC]], i32 16, i32 4)
+; CHECK-NEXT: ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 4, i32 4)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 8, i32 4)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 16, i32 4)
+  ret void
+}
+
+define void @test_memcpy_loadstore_8(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_loadstore_8(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i64*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i64*
+; CHECK-NEXT: [[TMP3:%.*]] = load atomic i64, i64* [[TMP1]] unordered, align 8
+; CHECK-NEXT: store atomic i64 [[TMP3]], i64* [[TMP2]] unordered, align 8
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 [[DEST]], i8* align 8 [[SRC]], i32 16, i32 8)
+; CHECK-NEXT: ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 8, i32 8)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 16, i32 8)
+  ret void
+}
+
+define void @test_memcpy_loadstore_16(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_loadstore_16(
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[DEST:%.*]], i8* align 16 [[SRC:%.*]], i32 16, i32 16)
+; CHECK-NEXT: ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 16, i32 16)
+  ret void
+}
+
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32) nounwind argmemonly
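
The patch above funnels both the plain and the element-unordered-atomic transfer intrinsics through a single AnyMemTransferInst code path: volatility is only queried on the non-atomic subclass, and the atomic subclass is what forces the emitted load/store pair to be atomic with unordered ordering. A minimal sketch of that dispatch idiom follows; it is illustrative only and not part of the patch, and the free function and its name are hypothetical.

// Sketch only: mirrors the subclass checks used by SimplifyAnyMemTransfer.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/AtomicOrdering.h"
using namespace llvm;

// Hypothetical helper: given the load/store pair that replaces a small
// transfer, apply the semantics implied by the intrinsic's subclass.
static void applyTransferSemantics(AnyMemTransferInst *MI, LoadInst *L,
                                   StoreInst *S) {
  if (auto *Plain = dyn_cast<MemTransferInst>(MI)) {
    // Plain memcpy/memmove may be volatile; the atomic variants never are.
    L->setVolatile(Plain->isVolatile());
    S->setVolatile(Plain->isVolatile());
  }
  if (isa<AtomicMemTransferInst>(MI)) {
    // Element-wise atomic transfers require the replacement accesses to stay
    // atomic; unordered is the ordering the intrinsic guarantees per element.
    L->setOrdering(AtomicOrdering::Unordered);
    S->setOrdering(AtomicOrdering::Unordered);
  }
}

Keeping these two properties on separate subclasses is what lets one SimplifyAnyMemTransfer routine emit a single load/store pair for every flavor of small constant-length transfer, which the element-atomic-memintrins.ll checks above exercise for element sizes 1 through 16.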