Index: lib/Transforms/Scalar/LoopIdiomRecognize.cpp
===================================================================
--- lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -102,6 +102,7 @@
 
 STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
+STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
 
 static cl::opt<bool> UseLIRCodeSizeHeurs(
     "use-lir-code-size-heurs",
@@ -138,17 +139,18 @@
 
   StoreListMap StoreRefsForMemset;
   StoreListMap StoreRefsForMemsetPattern;
-  StoreList StoreRefsForMemcpy;
+  StoreList StoreRefsForMemcpyOrMemmove;
   bool HasMemset;
   bool HasMemsetPattern;
   bool HasMemcpy;
+  bool HasMemmove;
 
   /// Return code for isLegalStore()
   enum LegalStoreKind {
     None = 0,
     Memset,
     MemsetPattern,
-    Memcpy,
+    MemcpyOrMemmove,
     UnorderedAtomicMemcpy,
     DontUse // Dummy retval never to be used. Allows catching errors in retval
             // handling.
@@ -277,7 +279,7 @@
 
   // Disable loop idiom recognition if the function's name is a common idiom.
   StringRef Name = L->getHeader()->getParent()->getName();
-  if (Name == "memset" || Name == "memcpy")
+  if (Name == "memset" || Name == "memcpy" || Name == "memmove")
     return false;
 
   // Determine if code size heuristics need to be applied.
@@ -287,8 +289,9 @@
   HasMemset = TLI->has(LibFunc_memset);
   HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
   HasMemcpy = TLI->has(LibFunc_memcpy);
+  HasMemmove = TLI->has(LibFunc_memmove);
 
-  if (HasMemset || HasMemsetPattern || HasMemcpy)
+  if (HasMemset || HasMemsetPattern || HasMemcpy || HasMemmove)
     if (SE->hasLoopInvariantBackedgeTakenCount(L))
       return runOnCountableLoop();
 
@@ -446,8 +449,8 @@
     return LegalStoreKind::MemsetPattern;
   }
 
-  // Otherwise, see if the store can be turned into a memcpy.
-  if (HasMemcpy) {
+  // Otherwise, see if the store can be turned into a memcpy or memmove.
+  if (HasMemcpy || HasMemmove) {
     // Check to see if the stride matches the size of the store. If so, then we
     // know that every byte is touched in the loop.
     APInt Stride = getStoreStride(StoreEv);
@@ -477,10 +480,12 @@
     if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
       return LegalStoreKind::None;
 
-    // Success. This store can be converted into a memcpy.
-    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
-    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
-                           : LegalStoreKind::Memcpy;
+    if (UnorderedAtomic || LI->isAtomic())
+      return HasMemcpy ? LegalStoreKind::UnorderedAtomicMemcpy
+                       : LegalStoreKind::None;
+
+    // Success. This store can be converted into a memcpy or memmove.
+    return LegalStoreKind::MemcpyOrMemmove;
   }
   // This store can't be transformed into a memset/memcpy.
   return LegalStoreKind::None;
@@ -489,7 +494,7 @@
 void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
   StoreRefsForMemset.clear();
   StoreRefsForMemsetPattern.clear();
-  StoreRefsForMemcpy.clear();
+  StoreRefsForMemcpyOrMemmove.clear();
   for (Instruction &I : *BB) {
     StoreInst *SI = dyn_cast<StoreInst>(&I);
     if (!SI)
@@ -510,9 +515,9 @@
       Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
       StoreRefsForMemsetPattern[Ptr].push_back(SI);
     } break;
-    case LegalStoreKind::Memcpy:
+    case LegalStoreKind::MemcpyOrMemmove:
     case LegalStoreKind::UnorderedAtomicMemcpy:
-      StoreRefsForMemcpy.push_back(SI);
+      StoreRefsForMemcpyOrMemmove.push_back(SI);
       break;
     default:
       assert(false && "unhandled return value");
@@ -548,7 +553,7 @@
     MadeChange |= processLoopStores(SL.second, BECount, false);
 
   // Optimize the store into a memcpy, if it feeds an similarly strided load.
-  for (auto &SI : StoreRefsForMemcpy)
+  for (auto &SI : StoreRefsForMemcpyOrMemmove)
     MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
 
   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
@@ -949,8 +954,8 @@
 }
 
 /// If the stored value is a strided load in the same loop with the same stride
-/// this may be transformable into a memcpy. This kicks in for stuff like
-/// for (i) A[i] = B[i];
+/// this may be transformable into a memcpy or memmove. This kicks in for
+/// stuff like for (i) A[i] = B[i];
 bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                     const SCEV *BECount) {
   assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
@@ -991,14 +996,19 @@
   // would be unsafe to do if there is anything else in the loop that may read
   // or write the memory region we're storing to. This includes the load that
   // feeds the stores. Check for an alias by generating the base address and
-  // checking everything.
+  // checking everything. If the access is only a load, then perform a memmove
+  // instead of a memcpy.
   Value *StoreBasePtr = Expander.expandCodeFor(
       StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
 
   SmallPtrSet<Instruction *, 1> Stores;
   Stores.insert(SI);
-  if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
-                            StoreSize, *AA, Stores)) {
+  bool StoreIsModified = mayLoopAccessLocation(
+      StoreBasePtr, ModRefInfo::Mod, CurLoop, BECount, StoreSize, *AA, Stores);
+  bool PerformMemmove = mayLoopAccessLocation(
+      StoreBasePtr, ModRefInfo::Ref, CurLoop, BECount, StoreSize, *AA, Stores);
+
+  if (StoreIsModified || (PerformMemmove && !HasMemmove)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
     RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
@@ -1012,8 +1022,8 @@
   if (NegStride)
     LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE);
 
-  // For a memcpy, we have to make sure that the input array is not being
-  // mutated by the loop.
+  // For a memcpy or memmove, we have to make sure that the input array is not
+  // being mutated by the loop.
   Value *LoadBasePtr = Expander.expandCodeFor(
       LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
@@ -1040,9 +1050,19 @@
   unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
   CallInst *NewCall = nullptr;
   // Check whether to generate an unordered atomic memcpy:
-  // If the load or store are atomic, then they must neccessarily be unordered
+  // If the load or store are atomic, then they must necessarily be unordered
   // by previous checks.
-  if (!SI->isAtomic() && !LI->isAtomic())
+  bool IsAtomicLoadOrStore = SI->isAtomic() || LI->isAtomic();
+
+  // FIXME: We should build an atomic memmove lowering like we have for
+  // memcpy.
+  assert((!IsAtomicLoadOrStore || !PerformMemmove) &&
+         "cannot memmove atomic load or store");
+
+  if (PerformMemmove)
+    NewCall = Builder.CreateMemMove(StoreBasePtr, Align, LoadBasePtr, Align,
+                                    NumBytes);
+  else if (!IsAtomicLoadOrStore)
     NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align);
   else {
     // We cannot allow unaligned ops for unordered load/store, so reject
@@ -1066,14 +1086,19 @@
   }
   NewCall->setDebugLoc(SI->getDebugLoc());
-  DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
+
+  DEBUG(dbgs() << "  Formed " << (PerformMemmove ? "memmove: " : "memcpy: ")
+               << *NewCall << "\n"
                << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
                << "    from store ptr=" << *StoreEv << " at: " << *SI << "\n");
 
-  // Okay, the memcpy has been formed. Zap the original store and anything that
-  // feeds into it.
+  // Okay, the memcpy or memmove has been formed. Zap the original store and
+  // anything that feeds into it.
   deleteDeadInstruction(SI);
-  ++NumMemCpy;
+  if (PerformMemmove)
+    ++NumMemMove;
+  else
+    ++NumMemCpy;
   return true;
 }
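A quick source-level illustration for reviewers (a sketch only, not part of the patch; the function names are invented and the transform is assumed to run at -O2 with loop-idiom enabled). Splitting the old ModRef query into separate Mod and Ref queries means a loop whose feeding load may alias the stored-to range is still transformable as long as nothing writes that range: the overlap merely demands memmove semantics instead of memcpy.

void shift_down(char *a, long n) {
  // The load a[i + 1] may overlap the stored range a[0 .. n-2], but each
  // element is read before it is ever written, so the loop is equivalent
  // to memmove(a, a + 1, n - 1). Before this patch we simply gave up here.
  for (long i = 0; i + 1 < n; ++i)
    a[i] = a[i + 1];
}

void plain_copy(char *__restrict d, const char *__restrict s, long n) {
  // Provably disjoint operands: the Ref query comes back clean and this
  // still becomes a plain memcpy, exactly as before.
  for (long i = 0; i < n; ++i)
    d[i] = s[i];
}

@test_memmove_formation in basic.ll below exercises the first shape with possibly-aliasing arguments; with no aliasing information available, the Ref query fires and we must fall back to memmove.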
Index: test/Transforms/LoopIdiom/basic-address-space.ll
===================================================================
--- test/Transforms/LoopIdiom/basic-address-space.ll
+++ test/Transforms/LoopIdiom/basic-address-space.ll
@@ -62,9 +62,10 @@
 
 define i32 @test14() nounwind {
 ; CHECK-LABEL: @test14(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call void @llvm.memmove.p2i8.p2i8.i16
 ; CHECK: for.body:
-; CHECK: load i32
-; CHECK: store i32
+; CHECK-NOT: store
 ; CHECK: br i1 %cmp
 
 entry:
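Both @test14 updates (the address-space variant above and the plain one in basic.ll below) now expect the loop to collapse into a single memmove in the entry block, with no store left in for.body. As a rough source-level picture of the idiom being matched (a hypothetical reconstruction offered only for orientation; the IR in the test is authoritative), basic.ll's @test14 amounts to an in-array overlapping copy walked with a negative stride, so every load still observes the original element:

// Hypothetical C++ analogue; illustrative only. Iterating downward copies
// g_50[4..5] up to g_50[5..6], reading each element before it is
// overwritten, which matches memmove(&g_50[5], &g_50[4], 8) exactly.
int g_50[7];

void test14_like() {
  for (int i = 6; i >= 5; --i)
    g_50[i] = g_50[i - 1];
}

The FileCheck lines only pin down the entry-block call and the absence of a loop-resident store, so unrelated codegen churn stays out of the test.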
Index: test/Transforms/LoopIdiom/basic.ll
===================================================================
--- test/Transforms/LoopIdiom/basic.ll
+++ test/Transforms/LoopIdiom/basic.ll
@@ -381,9 +381,10 @@
   %tmp8 = load i32, i32* getelementptr inbounds ([7 x i32], [7 x i32]* @g_50, i32 0, i64 6), align 4
   ret i32 %tmp8
 ; CHECK-LABEL: @test14(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 4 bitcast (i32* getelementptr inbounds ([7 x i32], [7 x i32]* @g_50, i64 0, i64 5) to i8*), i8* align 4 bitcast (i32* getelementptr inbounds ([7 x i32], [7 x i32]* @g_50, i64 0, i64 4) to i8*), i64 8, i1 false)
 ; CHECK: for.body:
-; CHECK: load i32
-; CHECK: store i32
+; CHECK-NOT: store
 ; CHECK: br i1 %cmp
 }
 
@@ -391,8 +392,8 @@
 
 define void @PR14241(i32* %s, i64 %size) {
 ; Ensure that we don't form a memcpy for strided loops. Briefly, when we taught
 ; LoopIdiom about memmove and strided loops, this got miscompiled into a memcpy
-; instead of a memmove. If we get the memmove transform back, this will catch
-; regressions.
+; instead of a memmove. We now have the memmove transform back. Ensure this
+; still doesn't generate a memcpy, to catch regressions.
 ;
 ; CHECK-LABEL: @PR14241(
@@ -401,23 +402,26 @@
   %end.ptr = getelementptr inbounds i32, i32* %s, i64 %end.idx
   br label %while.body
 ; CHECK-NOT: memcpy
-;
-; FIXME: When we regain the ability to form a memmove here, this test should be
-; reversed and turned into a positive assertion.
-; CHECK-NOT: memmove
+; CHECK: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 4 %s1, i8* align 4 %scevgep2, i64 %4, i1 false)
+; CHECK-NEXT: br label %while.body
+; CHECK-NOT: memcpy
+
 while.body:
+; CHECK: while.body:
+; CHECK-NOT: store
   %phi.ptr = phi i32* [ %s, %entry ], [ %next.ptr, %while.body ]
   %src.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1
   %val = load i32, i32* %src.ptr, align 4
 ; CHECK: load
   %dst.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 0
   store i32 %val, i32* %dst.ptr, align 4
-; CHECK: store
+; CHECK-NOT: store
   %next.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1
   %cmp = icmp eq i32* %next.ptr, %end.ptr
   br i1 %cmp, label %exit, label %while.body
 
+; CHECK: exit:
 exit:
   ret void
 ; CHECK: ret void
@@ -632,6 +636,153 @@
   ret void
 }
 
+; Memmove formation tests
+
+declare void @eat(i8 signext)
+
+define void @test_memmove_formation(i8* nocapture %dest, i8* nocapture readonly %source, i64 %size) nounwind ssp {
+entry:
+  %cmp8 = icmp sgt i64 %size, 0
+  br i1 %cmp8, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i8, i8* %source, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %dest, i64 %indvars.iv
+  store i8 %0, i8* %arrayidx3, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+; CHECK-LABEL: @test_memmove_formation(
+; CHECK: @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dest, i8* align 1 %source, i64 %size, i1 false)
+; CHECK-NOT: store
+; CHECK: ret void
+}
+
+define void @test_memmove_two(i8* readonly %begin, i8* readnone %end, i8* nocapture %out) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i8* %begin, %end
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %dest.i = phi i8* [ %dest.next, %for.body ], [ %out, %for.body.preheader ]
+  %begin.i = phi i8* [ %begin.next, %for.body ], [ %begin, %for.body.preheader ]
+  %0 = load i8, i8* %begin.i, align 1
+  store i8 %0, i8* %dest.i, align 1
+  %begin.next = getelementptr inbounds i8, i8* %begin.i, i64 1
+  %dest.next = getelementptr inbounds i8, i8* %dest.i, i64 1
+  %cmp = icmp eq i8* %begin.next, %end
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+; CHECK-LABEL: @test_memmove_two(
+; CHECK: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %out, i8* align 1 %begin
+; CHECK-NOT: store
+; CHECK: ret void
+}
+
+define void @test_memmove_three(i8* nocapture %arr) nounwind ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i8, i8* %arr, i64 %indvars.iv.next
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, i8* %arr, i64 %indvars.iv
+  store i8 %0, i8* %arrayidx2, align 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1023
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+
+; CHECK-LABEL: @test_memmove_three
+; CHECK: %[[SOURCE:.*]] = getelementptr i8, i8* %arr, i64 1
+; CHECK: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %arr, i8* align 1 %[[SOURCE]], i64 1023, i1 false)
+; CHECK-NOT: store
+; CHECK: ret void
+}
+
+; Test that memmove is not performed, since the source array is accessed during the loop.
+define void @copy_access_source(i8* readonly %begin, i8* readonly %end, i8* nocapture %out) nounwind ssp {
+entry:
+  %add.ptr = getelementptr inbounds i8, i8* %end, i64 -1
+  %cmp7 = icmp eq i8* %begin, %end
+  br i1 %cmp7, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %begin.i = phi i8* [ %begin.next, %for.body ], [ %begin, %for.body.preheader ]
+  %dest.i = phi i8* [ %dest.next, %for.body ], [ %out, %for.body.preheader ]
+  %0 = load i8, i8* %begin.i, align 1
+  store i8 %0, i8* %dest.i, align 1
+  %1 = load i8, i8* %add.ptr, align 1
+  tail call void @eat(i8 signext %1)
+  %begin.next = getelementptr inbounds i8, i8* %begin.i, i64 1
+  %dest.next = getelementptr inbounds i8, i8* %dest.i, i64 1
+  %cmp = icmp eq i8* %begin.next, %end
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+; CHECK-LABEL: @copy_access_source
+; CHECK-NOT: llvm.memmove
+; CHECK-NOT: llvm.memcpy
+; CHECK: ret void
+}
+
+; Test that memmove is not performed, since the destination array is accessed during the loop.
+define void @copy_access_dest(i8* readonly %begin, i8* readnone %end, i8* nocapture %out) nounwind ssp {
+entry:
+  %add.ptr = getelementptr inbounds i8, i8* %out, i64 1
+  %cmp1 = icmp eq i8* %begin, %end
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %begin.i = phi i8* [ %begin.next, %for.body ], [ %begin, %for.body.preheader ]
+  %dest.i = phi i8* [ %dest.next, %for.body ], [ %out, %for.body.preheader ]
+  %0 = load i8, i8* %add.ptr, align 1
+  tail call void @eat(i8 signext %0) #3
+  %1 = load i8, i8* %begin.i, align 1
+  store i8 %1, i8* %dest.i, align 1
+  %begin.next = getelementptr inbounds i8, i8* %begin.i, i64 1
+  %dest.next = getelementptr inbounds i8, i8* %dest.i, i64 1
+  %cmp = icmp eq i8* %begin.next, %end
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+; CHECK-LABEL: @copy_access_dest
+; CHECK-NOT: llvm.memmove
+; CHECK-NOT: llvm.memcpy
+; CHECK: ret void
+}
+
 ; Validate that "memset_pattern" has the proper attributes.
-; CHECK: declare void @memset_pattern16(i8* nocapture, i8* nocapture readonly, i64) [[ATTRS:#[0-9]+]]
-; CHECK: [[ATTRS]] = { argmemonly }
+; CHECK-DIAG: declare void @memset_pattern16(i8* nocapture, i8* nocapture readonly, i64) [[MEMSET_ATTRS:#[0-9]+]]
+; CHECK-DIAG: [[MEMSET_ATTRS]] = { argmemonly }
+
+; Validate that "memmove" has the proper attributes.
+; CHECK-DIAG: declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) [[MEMMOVE_ATTRS:#[0-9]+]]
+; CHECK-DIAG: [[MEMMOVE_ATTRS]] = { argmemonly nounwind }
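Finally, a source-level analogue of the two negative tests (a sketch only, not from the patch; the names mirror the tests but the C++ is invented). Any access to either region inside the loop other than the feeding load must block the transform entirely; in the tests above that access is conservative by construction, because it flows through @eat, which is declared without any memory attributes:

// Sketch mirroring @copy_access_source; illustrative only. The opaque call
// may read or write either buffer, so the alias queries report a possible
// modification of the store range and neither memmove nor memcpy is formed.
extern void eat(char);

void copy_but_touch_source(char *out, const char *begin, const char *end) {
  for (const char *p = begin; p != end; ++p) {
    *out++ = *p;  // the candidate load/store pair
    eat(end[-1]); // extra access during the loop blocks the idiom
  }
}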