Index: lib/Transforms/Scalar/LoopIdiomRecognize.cpp
===================================================================
--- lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -760,14 +760,11 @@
                                  Ev, BECount, NegStride, /*IsLoopMemset=*/true);
 }
 
-/// mayLoopAccessLocation - Return true if the specified loop might access the
-/// specified pointer location, which is a loop-strided access. The 'Access'
-/// argument specifies what the verboten forms of access are (read or write).
-static bool
-mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
-                      const SCEV *BECount, unsigned StoreSize,
-                      AliasAnalysis &AA,
-                      SmallPtrSetImpl<Instruction *> &IgnoredStores) {
+// getLoopAccessLocation - Creates a MemoryLocation for the memory pointed
+// to by Ptr. If BECount is a constant, then the size of the memory is
+// BECount * Size; otherwise it is unknown.
+static MemoryLocation getLoopAccessLocation(Value *Ptr, const SCEV *BECount,
+                                            unsigned Size) {
   // Get the location that may be stored across the loop. Since the access is
   // strided positively through memory, we say that the modified location starts
   // at the pointer and has infinite size.
@@ -776,18 +773,29 @@
   // If the loop iterates a fixed number of times, we can refine the access size
   // to be exactly the size of the memset, which is (BECount+1)*StoreSize
   if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
-    AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;
+    AccessSize = (BECst->getValue()->getZExtValue() + 1) * Size;
 
   // TODO: For this to be really effective, we have to dive into the pointer
   // operand in the store. Store to &A[i] of 100 will always return may alias
   // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
   // which will then no-alias a store to &A[100].
-  MemoryLocation StoreLoc(Ptr, AccessSize);
+  return MemoryLocation(Ptr, AccessSize);
+}
+
+/// mayLoopAccessLocation - Return true if the specified loop might access the
+/// specified pointer location, which is a loop-strided access. The 'Access'
+/// argument specifies what the verboten forms of access are (read or write).
+static bool
+mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
+                      const SCEV *BECount, unsigned StoreSize,
+                      AliasAnalysis &AA,
+                      SmallPtrSetImpl<Instruction *> &IgnoredInstrs) {
+  MemoryLocation StoreLoc = getLoopAccessLocation(Ptr, BECount, StoreSize);
 
   for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
        ++BI)
     for (Instruction &I : **BI)
-      if (IgnoredStores.count(&I) == 0 &&
+      if (IgnoredInstrs.count(&I) == 0 &&
           isModOrRefSet(
               intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
         return true;
@@ -987,18 +995,21 @@
     StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE);
 
   // Okay, we have a strided store "p[i]" of a loaded value. We can turn
-  // this into a memcpy in the loop preheader now if we want. However, this
-  // would be unsafe to do if there is anything else in the loop that may read
-  // or write the memory region we're storing to. This includes the load that
-  // feeds the stores. Check for an alias by generating the base address and
-  // checking everything.
+  // this into a memcpy/memmove in the loop preheader now if we want. However,
+  // this would be unsafe to do if there is anything else in the loop that may
+  // read or write the memory region we're storing to. Check for an alias by
+  // generating the base address and checking everything except the load and
+  // store.
+  // The store may alias the load that feeds the stores; this would be a
+  // memmove.
Value *StoreBasePtr = Expander.expandCodeFor( StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator()); - SmallPtrSet Stores; - Stores.insert(SI); + SmallPtrSet MemoryOps; + MemoryOps.insert(SI); + MemoryOps.insert(LI); if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount, - StoreSize, *AA, Stores)) { + StoreSize, *AA, MemoryOps)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); @@ -1012,13 +1023,14 @@ if (NegStride) LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE); - // For a memcpy, we have to make sure that the input array is not being - // mutated by the loop. + // For a memcpy/memmove, we have to make sure that the input array is not + // being mutated by the loop except by the store. Value *LoadBasePtr = Expander.expandCodeFor( LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator()); - + SmallPtrSet StoreOp; + StoreOp.insert(SI); if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount, - StoreSize, *AA, Stores)) { + StoreSize, *AA, StoreOp)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); @@ -1029,6 +1041,13 @@ if (avoidLIRForMultiBlockLoop()) return false; + // Generate a memmove, instead of memcpy, if the store instruction + // is aliased with the memory region referenced by the load. + MemoryLocation LoadLoc = + getLoopAccessLocation(LoadBasePtr, BECount, StoreSize); + const bool IsMemMove = isModOrRefSet( + intersectModRef(AA->getModRefInfo(SI, LoadLoc), ModRefInfo::Mod)); + // Okay, everything is safe, we can transform this! const SCEV *NumBytesS = @@ -1041,10 +1060,19 @@ // Check whether to generate an unordered atomic memcpy: // If the load or store are atomic, then they must neccessarily be unordered // by previous checks. 
-  if (!SI->isAtomic() && !LI->isAtomic())
-    NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(),
-                                   LoadBasePtr, LI->getAlignment(), NumBytes);
-  else {
+  if (!SI->isAtomic() && !LI->isAtomic()) {
+    if (IsMemMove)
+      NewCall =
+          Builder.CreateMemMove(StoreBasePtr, SI->getAlignment(), LoadBasePtr,
+                                LI->getAlignment(), NumBytes);
+    else
+      NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(),
+                                     LoadBasePtr, LI->getAlignment(), NumBytes);
+  } else {
+    // TODO: Implement for atomic memmove
+    if (IsMemMove)
+      return false;
+
     // We cannot allow unaligned ops for unordered load/store, so reject
     // anything where the alignment isn't at least the element size.
     unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
@@ -1067,7 +1095,8 @@
   }
   NewCall->setDebugLoc(SI->getDebugLoc());
 
-  DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
+  DEBUG(dbgs() << "  Formed " << (IsMemMove ? "memmove: " : "memcpy: ")
+               << *NewCall << "\n"
                << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
                << "    from store ptr=" << *StoreEv << " at: " << *SI << "\n");
 
Index: test/Transforms/LoopIdiom/basic-address-space.ll
===================================================================
--- test/Transforms/LoopIdiom/basic-address-space.ll
+++ test/Transforms/LoopIdiom/basic-address-space.ll
@@ -62,9 +62,10 @@
 
 define i32 @test14() nounwind {
 ; CHECK-LABEL: @test14(
+; CHECK: memmove
 ; CHECK: for.body:
 ; CHECK: load i32
-; CHECK: store i32
+; CHECK-NOT: store i32
 ; CHECK: br i1 %cmp
 
 entry:
Index: test/Transforms/LoopIdiom/basic.ll
===================================================================
--- test/Transforms/LoopIdiom/basic.ll
+++ test/Transforms/LoopIdiom/basic.ll
@@ -456,9 +456,10 @@
   %tmp8 = load i32, i32* getelementptr inbounds ([7 x i32], [7 x i32]* @g_50, i32 0, i64 6), align 4
   ret i32 %tmp8
 ; CHECK-LABEL: @test14(
+; CHECK: memmove
 ; CHECK: for.body:
 ; CHECK: load i32
-; CHECK: store i32
+; CHECK-NOT: store i32
 ; CHECK: br i1 %cmp
 }
 
@@ -479,7 +480,5 @@
 ;
-; FIXME: When we regain the ability to form a memmove here, this test should be
-; reversed and turned into a positive assertion.
-; CHECK-NOT: memmove
+; CHECK: memmove
 
 while.body:
   %phi.ptr = phi i32* [ %s, %entry ], [ %next.ptr, %while.body ]
@@ -488,7 +487,7 @@
 ; CHECK: load
   %dst.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 0
   store i32 %val, i32* %dst.ptr, align 4
-; CHECK: store
+; CHECK-NOT: store
   %next.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1
   %cmp = icmp eq i32* %next.ptr, %end.ptr
   br i1 %cmp, label %exit, label %while.body
Index: test/Transforms/LoopIdiom/memmove.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopIdiom/memmove.ll
@@ -0,0 +1,157 @@
+; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+target triple = "x86_64-apple-darwin10.0.0"
+
+;; memmove formation
+define void @test1(i64 %Size) nounwind ssp {
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = getelementptr inbounds i8, i8* %Base, i32 100
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load i8, i8* %I.0.014, align 1
+  store i8 %V, i8* %DestI, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+; CHECK-LABEL: @test1(
+; CHECK: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %Dest, i8* align 1 %Base, i64 %Size, i1 false)
+; CHECK-NOT: store
+; CHECK: ret void
+}
+
+declare i8* @external(i8*)
+
+;; This cannot be transformed into a memmove, because the read-from location is
+;; mutated by the loop.
+define void @test2(i64 %Size) nounwind ssp {
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = getelementptr inbounds i8, i8* %Base, i32 100
+
+  %BaseAlias = call i8* @external(i8* %Base)
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load i8, i8* %I.0.014, align 1
+  store i8 %V, i8* %DestI, align 1
+
+  ;; This store can clobber the input.
+  store i8 4, i8* %BaseAlias
+
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+; CHECK-LABEL: @test2(
+; CHECK-NOT: llvm.memmove
+; CHECK: ret void
+}
+
+; Handle memmove-able loops with negative stride.
+define noalias i32* @test3(i32* nocapture readonly %a, i32 %c) {
+entry:
+  %0 = getelementptr inbounds i32, i32* %a, i32 %c
+  %tobool.9 = icmp eq i32 %c, 0
+  br i1 %tobool.9, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %dec10.in = phi i32 [ %dec10, %while.body ], [ %c, %while.body.preheader ]
+  %dec10 = add nsw i32 %dec10.in, -1
+  %idxprom = sext i32 %dec10 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %0, i64 %idxprom
+  store i32 %1, i32* %arrayidx2, align 4
+  %tobool = icmp eq i32 %dec10, 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  ret i32* %0
+; CHECK-LABEL: @test3(
+; CHECK: call void @llvm.memmove
+; CHECK: ret i32*
+}
+
+; Handle memmove-able loops with negative stride.
+; void test18(unsigned * a, unsigned * b) {
+;   for (int i = 2047; i >= 0; --i) {
+;     a[i] = b[i];
+;   }
+; }
+define void @test4(i32* nocapture %a, i32* nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 2047, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %0, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %cmp = icmp sgt i64 %indvars.iv, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test4(
+; CHECK: call void @llvm.memmove
+; CHECK-NOT: store
+; CHECK: ret
+}
+
+define void @form_memmove_narrow_size(i64* %dst, i64* %src, i32 %size) {
+; CHECK-LABEL: @form_memmove_narrow_size(
+entry:
+  %cmp1 = icmp sgt i32 %size, 0
+  br i1 %cmp1, label %loop.ph, label %exit
+; CHECK: entry:
+; CHECK: %[[C1:.*]] = icmp sgt i32 %size, 0
+; CHECK-NEXT: br i1 %[[C1]], label %loop.ph, label %exit
+
+loop.ph:
+  br label %loop.body
+; CHECK: loop.ph:
+; CHECK-NEXT: %[[ZEXT_SIZE:.*]] = zext i32 %size to i64
+; CHECK-NEXT: %[[SCALED_SIZE:.*]] = shl i64 %[[ZEXT_SIZE]], 3
+; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 8 %{{.*}}, i8* align 8 %{{.*}}, i64 %[[SCALED_SIZE]], i1 false)
+
+loop.body:
+  %storemerge4 = phi i32 [ 0, %loop.ph ], [ %inc, %loop.body ]
+  %idxprom1 = sext i32 %storemerge4 to i64
+  %arrayidx1 = getelementptr inbounds i64, i64* %src, i64 %idxprom1
+  %v = load i64, i64* %arrayidx1, align 8
+  %idxprom2 = sext i32 %storemerge4 to i64
+  %arrayidx2 = getelementptr inbounds i64, i64* %dst, i64 %idxprom2
+; CHECK-NOT: store i64
+  store i64 %v, i64* %arrayidx2, align 8
+  %inc = add nsw i32 %storemerge4, 1
+  %cmp2 = icmp slt i32 %inc, %size
+  br i1 %cmp2, label %loop.body, label %loop.exit
+
+loop.exit:
+  br label %exit
+
+exit:
+  ret void
+}