Index: include/llvm/Transforms/Scalar/MemCpyOptimizer.h
===================================================================
--- include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -30,9 +30,11 @@
 class Function;
 class Instruction;
 class MemCpyInst;
-class MemMoveInst;
+class AnyMemCpyInst;
+class AnyMemMoveInst;
 class MemoryDependenceResults;
 class MemSetInst;
+class AnyMemSetInst;
 class StoreInst;
 class TargetLibraryInfo;
 class Value;
@@ -60,13 +62,13 @@
   // Helper functions
   bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
   bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
-  bool processMemCpy(MemCpyInst *M);
-  bool processMemMove(MemMoveInst *M);
+  bool processAnyMemCpy(AnyMemCpyInst *M);
+  bool processAnyMemMove(AnyMemMoveInst *M);
   bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
                             uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
-  bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
+  bool processMemCpyMemCpyDependence(AnyMemCpyInst *M, AnyMemCpyInst *MDep);
   bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
-  bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep);
+  bool performMemCpyToMemSetOptzn(AnyMemCpyInst *M, AnyMemSetInst *MDep);
   bool processByValArgument(CallSite CS, unsigned ArgNo);
   Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                     Value *ByteVal);
Index: lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1005,11 +1005,22 @@
 
 /// We've found that the (upward scanning) memory dependence of memcpy 'M' is
 /// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
-bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
-                                                  MemCpyInst *MDep) {
+bool MemCpyOptPass::processMemCpyMemCpyDependence(AnyMemCpyInst *M,
+                                                  AnyMemCpyInst *MDep) {
+  auto *MI = dyn_cast<MemCpyInst>(M);
+  auto *AMI = dyn_cast<AtomicMemCpyInst>(M);
+  auto *MIDep = dyn_cast<MemCpyInst>(MDep);
+  auto *AMIDep = dyn_cast<AtomicMemCpyInst>(MDep);
+
+  // Atomicity of the memcpy & dependent memcpy must be the same.
+  // FIXME: It is probably okay to do this replacement if M is atomic, and
+  // MDep is not.
+  if (!((MI && MIDep) || (AMI && AMIDep)))
+    return false;
+
   // We can only transforms memcpy's where the dest of one is the source of the
   // other.
-  if (M->getSource() != MDep->getDest() || MDep->isVolatile())
+  if (M->getSource() != MDep->getDest() || (MIDep && MIDep->isVolatile()))
     return false;
 
   // If dep instruction is reading from our current input, then it is a noop
@@ -1027,6 +1038,12 @@
   if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
     return false;
 
+  // If we have an atomic memcpy, then the element sizes of M and MDep
+  // must be the same.
+  // FIXME: It may be possible to relax this restriction in some way.
+  if (AMI && AMIDep->getElementSizeInBytes() != AMI->getElementSizeInBytes())
+    return false;
+
   AliasAnalysis &AA = LookupAliasAnalysis();
 
   // Verify that the copied-from memory doesn't change in between the two
@@ -1060,14 +1077,27 @@
   // TODO: Is this worth it if we're creating a less aligned memcpy? For
   // example we could be moving from movaps -> movq on x86.
   IRBuilder<> Builder(M);
-  if (UseMemMove)
-    Builder.CreateMemMove(M->getRawDest(), M->getDestAlignment(),
-                          MDep->getRawSource(), MDep->getSourceAlignment(),
-                          M->getLength(), M->isVolatile());
-  else
-    Builder.CreateMemCpy(M->getRawDest(), M->getDestAlignment(),
-                         MDep->getRawSource(), MDep->getSourceAlignment(),
-                         M->getLength(), M->isVolatile());
+  if (MI) {
+    // Non-atomic operation
+    if (UseMemMove)
+      Builder.CreateMemMove(M->getRawDest(), M->getDestAlignment(),
+                            MDep->getRawSource(), MDep->getSourceAlignment(),
+                            M->getLength(), M->isVolatile());
+    else
+      Builder.CreateMemCpy(M->getRawDest(), M->getDestAlignment(),
+                           MDep->getRawSource(), MDep->getSourceAlignment(),
+                           M->getLength(), M->isVolatile());
+  } else {
+    // Atomic operation
+    if (UseMemMove)
+      Builder.CreateElementUnorderedAtomicMemMove(M->getRawDest(), M->getDestAlignment(),
+                                                  MDep->getRawSource(), MDep->getSourceAlignment(),
+                                                  M->getLength(), AMI->getElementSizeInBytes());
+    else
+      Builder.CreateElementUnorderedAtomicMemCpy(M->getRawDest(), M->getDestAlignment(),
+                                                 MDep->getRawSource(), MDep->getSourceAlignment(),
+                                                 M->getLength(), AMI->getElementSizeInBytes());
+  }
 
   // Remove the instruction we're replacing.
   MD->removeInstruction(M);
@@ -1155,8 +1185,8 @@
 /// When dst2_size <= dst1_size.
 ///
 /// The \p MemCpy must have a Constant length.
-bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
-                                               MemSetInst *MemSet) {
+bool MemCpyOptPass::performMemCpyToMemSetOptzn(AnyMemCpyInst *MemCpy,
+                                               AnyMemSetInst *MemSet) {
   AliasAnalysis &AA = LookupAliasAnalysis();
 
   // Make sure that memcpy(..., memset(...), ...), that is we are memsetting and
@@ -1172,8 +1202,13 @@
     return false;
 
   IRBuilder<> Builder(MemCpy);
-  Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
-                       CopySize, MemCpy->getDestAlignment());
+  if (auto *AMC = dyn_cast<AtomicMemCpyInst>(MemCpy))
+    Builder.CreateElementUnorderedAtomicMemSet(MemCpy->getRawDest(), MemSet->getValue(),
+                                               CopySize, MemCpy->getDestAlignment(),
+                                               AMC->getElementSizeInBytes());
+  else
+    Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
+                         CopySize, MemCpy->getDestAlignment());
 
   return true;
 }
@@ -1182,9 +1217,12 @@
 /// B to be a memcpy from X to Z (or potentially a memmove, depending on
 /// circumstances). This allows later passes to remove the first memcpy
 /// altogether.
-bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
+bool MemCpyOptPass::processAnyMemCpy(AnyMemCpyInst *M) {
+  auto *MI = dyn_cast<MemCpyInst>(M);
+  auto *AMI = dyn_cast<AtomicMemCpyInst>(M);
+
   // We can only optimize non-volatile memcpy's.
-  if (M->isVolatile()) return false;
+  if (MI && MI->isVolatile()) return false;
 
   // If the source and destination of the memcpy are the same, then zap it.
   if (M->getSource() == M->getDest()) {
@@ -1198,8 +1236,14 @@
     if (GV->isConstant() && GV->hasDefinitiveInitializer())
       if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
         IRBuilder<> Builder(M);
-        Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
-                             M->getDestAlignment(), false);
+        if (MI)
+          Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
+                               M->getDestAlignment(), false);
+        else
+          Builder.CreateElementUnorderedAtomicMemSet(M->getRawDest(), ByteVal,
+                                                     M->getLength(),
+                                                     M->getDestAlignment(),
+                                                     AMI->getElementSizeInBytes());
         MD->removeInstruction(M);
         M->eraseFromParent();
         ++NumCpyToSet;
@@ -1210,9 +1254,9 @@
 
   // Try to turn a partially redundant memset + memcpy into
   // memcpy + smaller memset.  We don't need the memcpy size for this.
-  if (DepInfo.isClobber())
+  if (MI && DepInfo.isClobber())
     if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
-      if (processMemSetMemCpyDependence(M, MDep))
+      if (processMemSetMemCpyDependence(MI, MDep))
         return true;
 
   // The optimizations after this point require the memcpy size.
@@ -1226,12 +1270,12 @@
   //   lifetime copies undefined data, and we can therefore eliminate the
   //   memcpy in favor of the data that was already at the destination.
   // d) memcpy from a just-memset'd source can be turned into memset.
-  if (DepInfo.isClobber()) {
+  if (MI && DepInfo.isClobber()) {
     if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
       // FIXME: Can we pass in either of dest/src alignment here instead
       // of conservatively taking the minimum?
       unsigned Align = MinAlign(M->getDestAlignment(), M->getSourceAlignment());
-      if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
+      if (performCallSlotOptzn(MI, M->getDest(), M->getSource(),
                                CopySize->getZExtValue(), Align, C)) {
         MD->removeInstruction(M);
@@ -1246,7 +1290,7 @@
       SrcLoc, true, M->getIterator(), M->getParent());
 
   if (SrcDepInfo.isClobber()) {
-    if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
+    if (AnyMemCpyInst *MDep = dyn_cast<AnyMemCpyInst>(SrcDepInfo.getInst()))
       return processMemCpyMemCpyDependence(M, MDep);
   } else if (SrcDepInfo.isDef()) {
     Instruction *I = SrcDepInfo.getInst();
@@ -1270,7 +1314,7 @@
   }
 
   if (SrcDepInfo.isClobber())
-    if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
+    if (AnyMemSetInst *MDep = dyn_cast<AnyMemSetInst>(SrcDepInfo.getInst()))
       if (performMemCpyToMemSetOptzn(M, MDep)) {
         MD->removeInstruction(M);
         M->eraseFromParent();
@@ -1283,7 +1327,7 @@
 
 /// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
 /// not to alias.
-bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
+bool MemCpyOptPass::processAnyMemMove(AnyMemMoveInst *M) {
   AliasAnalysis &AA = LookupAliasAnalysis();
 
   if (!TLI->has(LibFunc_memmove))
@@ -1301,8 +1345,9 @@
   Type *ArgTys[3] = { M->getRawDest()->getType(),
                       M->getRawSource()->getType(),
                       M->getLength()->getType() };
+  auto MemCpy = isa<MemMoveInst>(M) ? Intrinsic::memcpy : Intrinsic::memcpy_element_unordered_atomic;
   M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
-                                                 Intrinsic::memcpy, ArgTys));
+                                                 MemCpy, ArgTys));
 
   // MemDep may have over conservative information about this instruction, just
   // conservatively flush it from the cache.
@@ -1412,10 +1457,10 @@
       MadeChange |= processStore(SI, BI);
     else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
       RepeatInstruction = processMemSet(M, BI);
-    else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
-      RepeatInstruction = processMemCpy(M);
-    else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
-      RepeatInstruction = processMemMove(M);
+    else if (AnyMemCpyInst *M = dyn_cast<AnyMemCpyInst>(I))
+      RepeatInstruction = processAnyMemCpy(M);
+    else if (AnyMemMoveInst *M = dyn_cast<AnyMemMoveInst>(I))
+      RepeatInstruction = processAnyMemMove(M);
     else if (auto CS = CallSite(I)) {
       for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
         if (CS.isByValArgument(i))
Index: test/Transforms/MemCpyOpt/atomic-memcpy.ll
===================================================================
--- /dev/null
+++ test/Transforms/MemCpyOpt/atomic-memcpy.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -memcpyopt -dse -S | FileCheck -enable-var-scope %s
+
+; A copy of the memcpy.ll tests, but modified to test the atomic memcpy intrinsic
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i686-apple-darwin9"
+
+%0 = type { x86_fp80, x86_fp80 }
+%1 = type { i32, i32 }
+
+; Check that the first memcpy is removed
+define void @test1(%0* sret %agg.result, x86_fp80 %z.0, x86_fp80 %z.1) nounwind {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MEMTMP:%.*]] = alloca [[TMP0:%.*]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub x86_fp80 0xK80000000000000000000, [[Z_1:%.*]]
+; CHECK-NEXT:    call void @ccoshl(%0* sret [[MEMTMP]], x86_fp80 [[TMP5]], x86_fp80 [[Z_0:%.*]]) #0
+; CHECK-NEXT:    [[MEMTMP20:%.*]] = bitcast %0* [[MEMTMP]] to i8*
+; CHECK-NEXT:    [[AGG_RESULT21:%.*]] = bitcast %0* [[AGG_RESULT:%.*]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[AGG_RESULT21]], i8* align 16 [[MEMTMP20]], i32 32, i32 1)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp2 = alloca %0
+  %memtmp = alloca %0, align 16
+  %tmp5 = fsub x86_fp80 0xK80000000000000000000, %z.1
+  call void @ccoshl(%0* sret %memtmp, x86_fp80 %tmp5, x86_fp80 %z.0) nounwind
+  %tmp219 = bitcast %0* %tmp2 to i8*
+  %memtmp20 = bitcast %0* %memtmp to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %tmp219, i8* align 16 %memtmp20, i32 32, i32 1)
+  %agg.result21 = bitcast %0* %agg.result to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %agg.result21, i8* align 16 %tmp219, i32 32, i32 1)
+  ret void
+}
+
+declare void @ccoshl(%0* nocapture sret, x86_fp80, x86_fp80) nounwind
+
+
+; The intermediate alloca and one of the memcpy's should be eliminated, the
+; other should be replaced with a memmove.
+define void @test2(i8* %P, i8* %Q) nounwind {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P:%.*]], i32 32, i32 1)
+; CHECK-NEXT:    ret void
+;
+  %memtmp = alloca %0, align 16
+  %R = bitcast %0* %memtmp to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i32 1)
+  ret void
+}
+
+; The element sizes differ between the two memcpy's. We leave these alone.
+define void @test2b(i8* %P, i8* %Q) nounwind {
+; CHECK-LABEL: @test2b(
+; CHECK-NEXT:    [[MEMTMP:%.*]] = alloca [[TMP0:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = bitcast %0* [[MEMTMP]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[R]], i8* align 16 [[P:%.*]], i32 32, i32 1)
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[R]], i32 32, i32 4)
+; CHECK-NEXT:    ret void
+;
+  %memtmp = alloca %0, align 16
+  %R = bitcast %0* %memtmp to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i32 4)
+  ret void
+}
+
+; The intermediate alloca and one of the memcpy's should be eliminated, the
+; other should be replaced with a memcpy.element.unordered.atomic.
+define void @test2_memcpy(i8* noalias %P, i8* noalias %Q) nounwind {
+; CHECK-LABEL: @test2_memcpy(
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P:%.*]], i32 32, i32 1)
+; CHECK-NEXT:    ret void
+;
+  %memtmp = alloca %0, align 16
+  %R = bitcast %0* %memtmp to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i32 1)
+  ret void
+}
+
+@x = external global %0
+
+define void @test3(%0* noalias sret %agg.result) nounwind {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[AGG_RESULT2:%.*]] = bitcast %0* [[AGG_RESULT:%.*]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[AGG_RESULT2]], i8* align 16 bitcast (%0* @x to i8*), i32 32, i32 1)
+; CHECK-NEXT:    ret void
+;
+  %x.0 = alloca %0
+  %x.01 = bitcast %0* %x.0 to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %x.01, i8* align 16 bitcast (%0* @x to i8*), i32 32, i32 1)
+  %agg.result2 = bitcast %0* %agg.result to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %agg.result2, i8* align 16 %x.01, i32 32, i32 1)
+  ret void
+}
+
+
+; PR8644
+define void @test4(i8 *%P) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[TMP1:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = bitcast %1* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 [[B]], i8* align 4 [[P:%.*]], i64 8, i32 1)
+; CHECK-NEXT:    call void @test4a(i8* byval align 1 [[B]])
+; CHECK-NEXT:    ret void
+;
+  %A = alloca %1
+  %B = bitcast %1* %A to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 %B, i8* align 4 %P, i64 8, i32 1)
+  call void @test4a(i8* align 1 byval %B)
+  ret void
+}
+
+; Make sure we don't remove the memcpy if the source address space doesn't match the byval argument
+define void @test4_addrspace(i8 addrspace(1)* %P) {
+; CHECK-LABEL: @test4_addrspace(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[TMP1:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = bitcast %1* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p1i8.i64(i8* align 4 [[B]], i8 addrspace(1)* align 4 [[P:%.*]], i64 8, i32 1)
+; CHECK-NEXT:    call void @test4a(i8* byval align 1 [[B]])
+; CHECK-NEXT:    ret void
+;
+  %A = alloca %1
+  %B = bitcast %1* %A to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p1i8.i64(i8* align 4 %B, i8 addrspace(1)* align 4 %P, i64 8, i32 1)
+  call void @test4a(i8* align 1 byval %B)
+  ret void
+}
+
+declare void @test4a(i8* align 1 byval)
+
+;; Noop memcpy should be zapped.
+define void @test6(i8 *%P) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 %P, i8* align 4 %P, i64 8, i32 1)
+  ret void
+}
+
+
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i32) nounwind
Index: test/Transforms/MemCpyOpt/memcpy-to-memset.ll
===================================================================
--- test/Transforms/MemCpyOpt/memcpy-to-memset.ll
+++ test/Transforms/MemCpyOpt/memcpy-to-memset.ll
@@ -3,6 +3,7 @@
 @cst = internal constant [3 x i32] [i32 -1, i32 -1, i32 -1], align 4
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
 declare void @foo(i32*) nounwind
 
 define void @test1() nounwind {
@@ -17,3 +18,16 @@
 ; CHECK-NOT: call void @llvm.memcpy
 ; CHECK: ret void
 }
+
+define void @test1_atomic() nounwind {
+  %arr = alloca [3 x i32], align 4
+  %arr_i8 = bitcast [3 x i32]* %arr to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 %arr_i8, i8* align 4 bitcast ([3 x i32]* @cst to i8*), i64 12, i32 1)
+  %arraydecay = getelementptr inbounds [3 x i32], [3 x i32]* %arr, i64 0, i64 0
+  call void @foo(i32* %arraydecay) nounwind
+  ret void
+; CHECK-LABEL: @test1_atomic(
+; CHECK: call void @llvm.memset.element.unordered.atomic.{{.*}}, i32 1)
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK: ret void
+}
Index: test/Transforms/MemCpyOpt/memmove.ll
===================================================================
--- test/Transforms/MemCpyOpt/memmove.ll
+++ test/Transforms/MemCpyOpt/memmove.ll
@@ -5,6 +5,7 @@
 target triple = "x86_64-apple-darwin9.0"
 
 declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
+declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
 
 define i8* @test1(i8* nocapture %src) nounwind {
 entry:
@@ -19,6 +20,17 @@
 }
 declare noalias i8* @malloc(i32)
 
+define i8* @test1_atomic(i8* nocapture %src) nounwind {
+entry:
+; CHECK-LABEL: @test1_atomic(
+; CHECK: tail call void @llvm.memcpy.element.unordered.atomic{{.*}}, i32 1)
+
+  %malloccall = tail call i8* @malloc(i32 trunc (i64 mul nuw (i64 ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64), i64 13) to i32))
+  %call3 = bitcast i8* %malloccall to [13 x i8]*
+  %call3.sub = getelementptr inbounds [13 x i8], [13 x i8]* %call3, i64 0, i64 0
+  tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %call3.sub, i8* align 1 %src, i64 13, i32 1)
+  ret i8* %call3.sub
+}
 
 define void @test2(i8* %P) nounwind {
 entry:
Index: test/Transforms/MemCpyOpt/memset-atomicmemcpy-to-2x-memset.ll
===================================================================
--- /dev/null
+++ test/Transforms/MemCpyOpt/memset-atomicmemcpy-to-2x-memset.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -memcpyopt -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @test(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[DST2:%.*]], i8 [[C]], i64 128, i32 8)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 8 %dst2, i8* align 8 %dst1, i64 128, i32 8)
+  ret void
+}
+
+define void @test2(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[DST1:%.*]], i8 [[C:%.*]], i64 128, i32 8)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[DST2:%.*]], i8 [[C]], i64 128, i32 4)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 128, i32 8)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 8 %dst2, i8* align 8 %dst1, i64 128, i32 4)
+  ret void
+}
+
+define void @test_smaller_memcpy(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_smaller_memcpy(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 2 [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 2 [[DST2:%.*]], i8 [[C]], i64 100, i32 2)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* align 2 %dst1, i8 %c, i64 128, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 2 %dst2, i8* align 2 %dst1, i64 100, i32 2)
+  ret void
+}
+
+define void @test_smaller_memset(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_smaller_memset(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[DST1:%.*]], i8 [[C:%.*]], i64 100, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 [[DST2:%.*]], i8* align 4 [[DST1]], i64 128, i32 4)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* align 4 %dst1, i8 %c, i64 100, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 %dst2, i8* align 4 %dst1, i64 128, i32 4)
+  ret void
+}
+
+define void @test_align_memset(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_align_memset(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[DST2:%.*]], i8 [[C]], i64 128, i32 8)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 128, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 8 %dst2, i8* align 8 %dst1, i64 128, i32 8)
+  ret void
+}
+
+define void @test_different_types(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_different_types(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 8 [[DST2:%.*]], i8 [[C]], i32 100, i32 2)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 128, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dst2, i8* align 8 %dst1, i32 100, i32 2)
+  ret void
+}
+
+define void @test_different_types_2(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_different_types_2(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* align 8 [[DST1:%.*]], i8 [[C:%.*]], i32 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[DST2:%.*]], i8 [[C]], i64 100, i32 2)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i32(i8* align 8 %dst1, i8 %c, i32 128, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 8 %dst2, i8* align 8 %dst1, i64 100, i32 2)
+  ret void
+}
+
+define void @test_different_source_gep(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_different_source_gep(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    [[P:%.*]] = getelementptr i8, i8* [[DST1]], i64 64
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 [[DST2:%.*]], i8* align 4 [[P]], i64 64, i32 4)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
+  ; FIXME: We could optimize this as well.
+  %p = getelementptr i8, i8* %dst1, i64 64
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 %dst2, i8* align 4 %p, i64 64, i32 4)
+  ret void
+}
+
+define void @test_variable_size_1(i8* %dst1, i64 %dst1_size, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_variable_size_1(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST1:%.*]], i8 [[C:%.*]], i64 [[DST1_SIZE:%.*]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[DST2:%.*]], i8* align 1 [[DST1]], i64 128, i32 1)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 %dst1_size, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %dst2, i8* align 1 %dst1, i64 128, i32 1)
+  ret void
+}
+
+define void @test_variable_size_2(i8* %dst1, i8* %dst2, i64 %dst2_size, i8 %c) {
+; CHECK-LABEL: @test_variable_size_2(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[DST2:%.*]], i8* align 1 [[DST1]], i64 [[DST2_SIZE:%.*]], i32 1)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %dst2, i8* align 1 %dst1, i64 %dst2_size, i32 1)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
+declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture, i8, i64, i32)
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32)
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i1)
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32)
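
For reference, a condensed sketch of the central rewrite this patch enables, restating what @test2 and @test2_memcpy in atomic-memcpy.ll exercise (the %src/%dst/%tmp names below are illustrative only, not taken from the tests):

  ; before: the element-wise unordered-atomic copy goes through a temporary buffer
  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %tmp, i8* align 16 %src, i32 32, i32 1)
  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dst, i8* align 16 %tmp, i32 32, i32 1)

  ; after: the second copy reads directly from %src; when %src and %dst may alias,
  ; the pass emits the memmove form instead (as in @test2), and DSE can then delete
  ; the now-dead copy into %tmp.
  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dst, i8* align 16 %src, i32 32, i32 1)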