Index: lib/Analysis/MemoryDependenceAnalysis.cpp =================================================================== --- lib/Analysis/MemoryDependenceAnalysis.cpp +++ lib/Analysis/MemoryDependenceAnalysis.cpp @@ -1180,34 +1180,36 @@ // that we don't already have conflicting results for these blocks. Check // to ensure that if a block in the results set is in the visited set that // it was for the same pointer query. + bool ConflictingResults = false; if (!Visited.empty()) { for (auto &Entry : *Cache) { DenseMap::iterator VI = Visited.find(Entry.getBB()); - if (VI == Visited.end() || VI->second == Pointer.getAddr()) - continue; - - // We have a pointer mismatch in a block. Just return false, saying - // that something was clobbered in this result. We could also do a - // non-fully cached query, but there is little point in doing this. - return false; + if (VI != Visited.end() && VI->second != Pointer.getAddr()) { + // We have a pointer mismatch in a block. Discard the cached + // information and recalculate. + ConflictingResults = true; + break; + } } } - Value *Addr = Pointer.getAddr(); - for (auto &Entry : *Cache) { - Visited.insert(std::make_pair(Entry.getBB(), Addr)); - if (Entry.getResult().isNonLocal()) { - continue; - } + if (!ConflictingResults) { + Value *Addr = Pointer.getAddr(); + for (auto &Entry : *Cache) { + Visited.insert(std::make_pair(Entry.getBB(), Addr)); + if (Entry.getResult().isNonLocal()) { + continue; + } - if (DT.isReachableFromEntry(Entry.getBB())) { - Result.push_back( + if (DT.isReachableFromEntry(Entry.getBB())) { + Result.push_back( NonLocalDepResult(Entry.getBB(), Entry.getResult(), Addr)); + } } + ++NumCacheCompleteNonLocalPtr; + return true; } - ++NumCacheCompleteNonLocalPtr; - return true; } // Otherwise, either this is a new block, a block with an invalid cache @@ -1219,9 +1221,6 @@ else CacheInfo->Pair = BBSkipFirstBlockPair(); - SmallVector Worklist; - Worklist.push_back(StartBB); - // PredList used inside loop. SmallVector, 16> PredList; @@ -1231,17 +1230,23 @@ // won't get any reuse from currently inserted values, because we don't // revisit blocks after we insert info for them. unsigned NumSortedEntries = Cache->size(); - unsigned WorklistEntries = BlockNumberLimit; - bool GotWorklistLimit = false; LLVM_DEBUG(AssertSorted(*Cache)); - while (!Worklist.empty()) { - BasicBlock *BB = Worklist.pop_back_val(); - + // If we find a NonLocalDepResult without recursing then it will be marked as + // coming from StartBB and no blocks will be marked as visited. Keep track of + // the blocks we've seen so that if we do need to recurse we can then mark + // them as visited. We also need to remember if we've seen a visited block, as + // in that case we can't recurse (as we would need to mark a block as visited + // by two different addresses). + bool SeenVisited = false; + SmallPtrSet SeenBlocks; + + // Follow a chain of blocks starting at StartBB. + for (BasicBlock *BB = StartBB, *NextBB = nullptr; BB; + BB = NextBB, NextBB = nullptr) { // If we do process a large number of blocks it becomes very expensive and // likely it isn't worth worrying about if (Result.size() > NumResultsLimit) { - Worklist.clear(); // Sort it now (if needed) so that recursive invocations of // getNonLocalPointerDepFromBB and other routines that could reuse the // cache value will only see properly sorted cache arrays. @@ -1260,7 +1265,8 @@ if (!SkipFirstBlock) { // Analyze the dependency of *Pointer in FromBB. See if we already have // been here. - assert(Visited.count(BB) && "Should check 'visited' before adding to WL"); + assert((Visited.count(BB) || SeenBlocks.count(BB)) && + "Should check 'visited' before adding to WL"); // Get the dependency info for Pointer in BB. If we have cached // information, we will use it, otherwise we compute it. @@ -1271,56 +1277,70 @@ // If we got a Def or Clobber, add this to the list of results. if (!Dep.isNonLocal()) { if (DT.isReachableFromEntry(BB)) { - Result.push_back(NonLocalDepResult(BB, Dep, Pointer.getAddr())); + // We found Dep in BB, but we mark it as being available from StartBB + // as there may be more than one Dep that can be found in BB but that + // we want available on different paths. + Result.push_back(NonLocalDepResult(StartBB, Dep, Pointer.getAddr())); continue; } } } - // If 'Pointer' is an instruction defined in this block, then we need to do - // phi translation to change it into a value live in the predecessor block. - // If not, we just add the predecessors to the worklist and scan them with - // the same Pointer. - if (!Pointer.NeedsPHITranslationFromBlock(BB)) { - SkipFirstBlock = false; - SmallVector NewBlocks; + // There are three situations in which we need to recurse: + // * We need to phi-translate, as we need to use a different Pointer for + // each incoming value. + // * This is the first block, as results need to come from predecessors to + // that block. + // * We think that we may find a different result for different + // predecessors. + // The first two are easy to detect, but we need to do a little work to + // handle the third. + if (!Pointer.NeedsPHITranslationFromBlock(BB) && !SkipFirstBlock) { + // Check if we have a single predecessor that we haven't already seen. + BasicBlock *SinglePred = nullptr; + bool ManyPred = false; for (BasicBlock *Pred : PredCache.get(BB)) { - // Verify that we haven't looked at this block yet. - std::pair::iterator, bool> InsertRes = - Visited.insert(std::make_pair(Pred, Pointer.getAddr())); - if (InsertRes.second) { - // First time we've looked at *PI. - NewBlocks.push_back(Pred); + if (SeenBlocks.count(Pred)) continue; + if (SinglePred) { + ManyPred = true; + break; } - - // If we have seen this block before, but it was with a different - // pointer then we have a phi translation failure and we have to treat - // this as a clobber. - if (InsertRes.first->second != Pointer.getAddr()) { - // Make sure to clean up the Visited map before continuing on to - // PredTranslationFailure. - for (unsigned i = 0; i < NewBlocks.size(); i++) - Visited.erase(NewBlocks[i]); - goto PredTranslationFailure; - } + SinglePred = Pred; } - if (NewBlocks.size() > WorklistEntries) { - // Make sure to clean up the Visited map before continuing on to - // PredTranslationFailure. - for (unsigned i = 0; i < NewBlocks.size(); i++) - Visited.erase(NewBlocks[i]); - GotWorklistLimit = true; - goto PredTranslationFailure; + // If there are no predecessors that we haven't already seen then we don't + // need to do anything. + if (!SinglePred && !ManyPred) + continue; + // If we found a single predecessor then we can continue without + // recursing. + if (SinglePred && !ManyPred) { + SeenBlocks.insert(SinglePred); + // Check if we've visited this block using a different pointer. + auto It = Visited.find(SinglePred); + if (It != Visited.end() && It->second != Pointer.getAddr()) + SeenVisited = true; + NextBB = SinglePred; + continue; } - WorklistEntries -= NewBlocks.size(); - Worklist.append(NewBlocks.begin(), NewBlocks.end()); - continue; + // If we found more than one predecessor then different values may be + // found in those predecessors, so we need to recurse in order for them to + // be marked as available in those predecessors and not here. } - // We do need to do phi translation, if we know ahead of time we can't phi + // If we've seen a block that's already been visited then we can't continue + // as we would have to mark it as visited by two different pointers. + if (SeenVisited) + goto PredTranslationFailure; + + // Blocks that we've seen now have to be marked as visited by this pointer. + for (BasicBlock *LookedBB : SeenBlocks) + Visited[LookedBB] = Pointer.getAddr(); + + // If we need to do phi translation, and we know ahead of time we can't phi // translate this value, don't even try. - if (!Pointer.IsPotentiallyPHITranslatable()) + if (Pointer.NeedsPHITranslationFromBlock(BB) && + !Pointer.IsPotentiallyPHITranslatable()) goto PredTranslationFailure; // We may have added values to the cache list before this PHI translation. @@ -1341,7 +1361,8 @@ // Get the PHI translated pointer in this predecessor. This can fail if // not translatable, in which case the getAddr() returns null. PHITransAddr &PredPointer = PredList.back().second; - PredPointer.PHITranslateValue(BB, Pred, &DT, /*MustDominate=*/false); + if (PredPointer.NeedsPHITranslationFromBlock(BB)) + PredPointer.PHITranslateValue(BB, Pred, &DT, /*MustDominate=*/false); Value *PredPtrVal = PredPointer.getAddr(); // Check to see if we have already visited this pred block with another @@ -1460,11 +1481,11 @@ bool foundBlock = false; for (NonLocalDepEntry &I : llvm::reverse(*Cache)) { - if (I.getBB() != BB) + if (I.getBB() != StartBB) continue; - assert((GotWorklistLimit || I.getResult().isNonLocal() || - !DT.isReachableFromEntry(BB)) && + assert((I.getResult().isNonLocal() || + !DT.isReachableFromEntry(StartBB)) && "Should only be here with transparent block"); foundBlock = true; I.setResult(MemDepResult::getUnknown()); @@ -1472,8 +1493,8 @@ NonLocalDepResult(I.getBB(), I.getResult(), Pointer.getAddr())); break; } - (void)foundBlock; (void)GotWorklistLimit; - assert((foundBlock || GotWorklistLimit) && "Current block not in cache?"); + (void)foundBlock; + assert(foundBlock && "Current block not in cache?"); } // Okay, we're done now. If we added new values to the cache, re-sort it. Index: test/Transforms/GVN/PRE/multiple-paths.ll =================================================================== --- /dev/null +++ test/Transforms/GVN/PRE/multiple-paths.ll @@ -0,0 +1,393 @@ +; RUN: opt -gvn -S < %s | FileCheck %s + +; Available values come from the same block, and are available in the direct +; predecessors. +define i32 @direct_predecessors_same_block(i32* %ptr1, i32* %ptr2) { +; CHECK-LABEL: @direct_predecessors_same_block +entry: + %val1 = load i32, i32* %ptr1 + %val2 = load i32, i32* %ptr2 + %cmp = icmp slt i32 %val1, %val2 + br i1 %cmp, label %if, label %then + +if: + br label %then + +then: +; CHECK-LABEL: then: +; CHECK: %ret = phi i32 [ %val1, %if ], [ %val2, %entry ] + %phi = phi i32* [ %ptr1, %if ], [ %ptr2, %entry ] + %ret = load i32, i32* %phi + ret i32 %ret +} + +; Available values come from the same block, but are available in indirect +; predecessors. +define i32 @indirect_predecessors_same_block(i32* %ptr1, i32* %ptr2) { +; CHECK-LABEL: @indirect_predecessors_same_block +entry: + br label %loop + +loop: + %ptr1.new = phi i32* [ %ptr1, %entry], [ %ptr1.next, %then ] + %ptr2.new = phi i32* [ %ptr2, %entry], [ %ptr2.next, %then ] + %ptr1.next = getelementptr i32, i32* %ptr1, i32 1 + %ptr2.next = getelementptr i32, i32* %ptr2, i32 1 + %val1 = load i32, i32* %ptr1.new + %val2 = load i32, i32* %ptr2.new + %cmp = icmp slt i32 %val1, %val2 + br i1 %cmp, label %if, label %then + +if: + br label %then + +then: +; CHECK-LABEL: then: +; CHECK: %ret = phi i32 [ %val1, %if ], [ %val2, %loop ] + %phi = phi i32* [ %ptr1.new, %if ], [ %ptr2.new, %loop ] + br i1 undef, label %loop, label %end + +end: + %ret = load i32, i32* %phi + ret i32 %ret +} + +; Available values come from different blocks, and are available in the direct +; predecessors. +define i32 @direct_predecessors_different_blocks(i32* %ptr1, i32* %ptr2) { +; CHECK-LABEL: @direct_predecessors_different_blocks +entry: + %val1 = load i32, i32* %ptr1 + %cmp1 = icmp slt i32 %val1, 0 + br i1 %cmp1, label %if, label %then + +if: + %val2 = load i32, i32* %ptr2 + %cmp2 = icmp slt i32 %val2, 0 + br i1 %cmp2, label %else, label %then + +then: + br label %end + +else: + br label %end + +end: +; CHECK-LABEL: end: +; CHECK: %ret = phi i32 [ %val1, %then ], [ %val2, %else ] + %phi = phi i32* [ %ptr1, %then ], [ %ptr2, %else ] + %ret = load i32, i32* %phi + ret i32 %ret +} + +; Available values come from different blocks, and those blocks are indirect +; predecessors. +define i32 @indirect_predecessors_different_blocks(i32* %ptr1, i32* %ptr2) { +; CHECK-LABEL: @indirect_predecessors_different_blocks +entry: + br label %loop + +loop: + br i1 undef, label %if, label %else + +if: + %val1 = load i32, i32* %ptr1 + %add1 = add i32 %val1, 1 + br i1 undef, label %if.next, label %loop + +if.next: + br label %then + +else: + %val2 = load i32, i32* %ptr2 + %add2 = add i32 %val2, 2 + br i1 undef, label %else.next, label %loop + +else.next: + br label %then + +then: +; CHECK-LABEL: then: +; CHECK: %val = phi i32 [ %val1, %if.next ], [ %val2, %else.next ] + %add = phi i32 [ %add1, %if.next ], [ %add2, %else.next ] + %phi = phi i32* [ %ptr1, %if.next ], [ %ptr2, %else.next ] + br i1 undef, label %loop, label %end + +end: + %val = load i32, i32* %phi + %ret = add i32 %val, %add + ret i32 %ret +} + +; Both val1 and val2 are available from %if, depending on the path taken (to +; %then or to %end). +define i32 @multiple_values_same_block_1(i32* %ptr1, i32* %ptr2, i32* %ptr3) { +; CHECK-LABEL: @multiple_values_same_block_1 +entry: + %val1 = load i32, i32* %ptr1, align 4 + %val2 = load i32, i32* %ptr2, align 4 + %val3 = load i32, i32* %ptr3, align 4 + %cmp1 = icmp slt i32 %val1, %val2 + br i1 %cmp1, label %if, label %then + +if: + %cmp2 = icmp slt i32 %val1, %val3 + br i1 %cmp2, label %end, label %then + +then: +; CHECK-LABEL: then: +; CHECK: %[[PHI:.*]] = phi i32 [ %val2, %if ], [ %val3, %entry ] + %phi = phi i32* [ %ptr2, %if ], [ %ptr3, %entry ] + br label %end + +end: +; CHECK-LABEL: end: +; CHECK: %ret = phi i32 [ %val1, %if ], [ %[[PHI]], %then ] + %retptr = phi i32* [ %ptr1, %if ], [ %phi, %then ] + %ret = load i32, i32* %retptr, align 4 + ret i32 %ret +} + +define i32 @multiple_values_same_block_2(i32* %ptr1, i32* %ptr2, i32* %ptr3) { +; CHECK-LABEL: @multiple_values_same_block_2 +entry: + %val1 = load i32, i32* %ptr1, align 4 + %val2 = load i32, i32* %ptr2, align 4 + %val3 = load i32, i32* %ptr3, align 4 + %cmp1 = icmp slt i32 %val1, %val2 + br i1 %cmp1, label %if, label %then + +if: + %cmp2 = icmp slt i32 %val1, %val3 + br i1 %cmp2, label %end, label %then + +then: +; CHECK-LABEL: then: +; CHECK: %[[PHI:.*]] = phi i32 [ %val2, %if ], [ %val3, %entry ] + %phi = phi i32* [ %ptr2, %if ], [ %ptr3, %entry ] + br label %end + +end: +; CHECK-LABEL: end: +; CHECK: %ret = phi i32 [ %[[PHI]], %then ], [ %val1, %if ] + %retptr = phi i32* [ %phi, %then ], [ %ptr1, %if ] + %ret = load i32, i32* %retptr, align 4 + ret i32 %ret +} + + +; *%ptr1 available in %then1, *%ptr2 available in a predecessor of %then1. +define i32 @available_before_available(i32* %ptr1, i32* %ptr2) { +; CHECK-LABEL: @available_before_available +entry: + %val2 = load i32, i32* %ptr2 + %cmp2 = icmp slt i32 %val2, 0 + br i1 %cmp2, label %if1, label %else1 + +if1: + br label %then1 + +else1: + br label %then1 + +then1: + %val1 = load i32, i32* %ptr1 + %cmp1 = icmp slt i32 %val1, 0 + br i1 %cmp1, label %if2, label %then2 + +if2: + br label %then2 + +then2: +; CHECK-LABEL: then2: +; CHECK: %ret = phi i32 [ %val2, %if2 ], [ %val1, %then1 ] + %phi = phi i32* [ %ptr2, %if2 ], [ %ptr1, %then1 ] + %ret = load i32, i32* %phi + ret i32 %ret; +} + +; *%ptr1 available in %then1, *%ptr2 we find non-function-local in a predecessor +; of %then1. +define i32 @unavailable_before_available(i32* %ptr1, i32* %ptr2) { +; CHECK-LABEL: @unavailable_before_available +entry: + br i1 undef, label %if1, label %else1 + +if1: + br label %then1 + +else1: + br label %then1 + +then1: + %val1 = load i32, i32* %ptr1 + %cmp = icmp slt i32 %val1, 0 + br i1 %cmp, label %if2, label %then2 + +if2: +; CHECK-LABEL: if2: +; CHECK: %[[PRE:.*]] = load i32, i32* %ptr2 + br label %then2 + +then2: +; CHECK-LABEL: then2: +; CHECK: %ret = phi i32 [ %[[PRE]], %if2 ], [ %val1, %then1 ] + %phi = phi i32* [ %ptr2, %if2 ], [ %ptr1, %then1 ] + %ret = load i32, i32* %phi + ret i32 %ret; +} + + +; Reduced test taken from inlined std::min_element. We should end up with a load +; of %p only on the loop backedge. +define float @min_element(float* %first, float* %last) { +; CHECK-LABEL: @min_element +entry: +; CHECK-LABEL: entry: +; CHECK: %[[FIRSTVAL:.*]] = load float, float* %first, align 4 + br label %for.body + +for.body: +; CHECK-LABEL: for.body: +; CHECK-NOT: load + %p = phi float* [ %first, %entry ], [ %incdec.ptr, %for.inc ] + %first.addr.1 = phi float* [ %first, %entry ], [ %first.addr.2, %for.inc ] + %pval = load float, float* %p, align 4 + %firstval = load float, float* %first.addr.1, align 4 + %cmp1 = fcmp olt float %pval, %firstval + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + br label %for.inc + +for.inc: +; CHECK-LABEL: for.inc: +; TODO check phi exactly +; CHECK: %ret = phi float +; CHECK: br i1 %cmp, label %exit, label %[[CRIT_EDGE:.*]] + %first.addr.2 = phi float* [ %p, %if.then ], [ %first.addr.1, %for.body ] + %incdec.ptr = getelementptr inbounds float, float* %p, i64 1 + %cmp = icmp eq float* %incdec.ptr, %last + br i1 %cmp, label %exit, label %for.body + +; CHECK: [[CRIT_EDGE]]: +; CHECK: load float, float* %incdec.ptr, align 4 + +exit: +; CHECK-LABEL: exit: +; CHECK-NOT: load + %ret = load float, float* %first.addr.2, align 4 + ret float %ret +} + +; An inner loop with multiple exits. We should find a value through the loop on +; both exits. +define void @inner_loop_multiple_exit(i64** %ptr) { +; CHECK-LABEL: @inner_loop_multiple_exit +entry: +; CHECK-LABEL: entry: +; CHECK: br i1 undef, label %[[CRIT_EDGE:.*]], label %for.body.preheader + br i1 undef, label %end, label %for.body.preheader + +; CHECK: [[CRIT_EDGE]]: +; CHECK: %[[P2_PRE:.*]] = load i64*, i64** %ptr, align 8 +; CHECK: br label %end + +for.body.preheader: +; CHECK-LABEL: for.body.preheader: +; CHECK: %[[P1_PRE:.*]] = load i64*, i64** %ptr, align 8 +; CHECK: %[[P1_VAL_PRE:.*]] = load i64, i64* %[[P1_PRE]], align 8 + br label %for.body + +for.body: +; CHECK-LABEL: for.body: +; CHECK-NOT: load +; CHECK: %cmp = icmp sgt i64 %[[P1_VAL_PRE]], 0 + %p1 = load i64*, i64** %ptr, align 8 + %p1_val = load i64, i64* %p1, align 8 + %cmp = icmp sgt i64 %p1_val, 0 + br i1 %cmp, label %exit1, label %loop.preheader + +loop.preheader: + br label %loop.top + +loop.top: + br i1 undef, label %exit1, label %loop.bottom + +loop.bottom: + br i1 undef, label %exit2, label %loop.top + +exit1: + br label %for.end + +exit2: + br label %for.end + +for.end: + br i1 undef, label %for.body, label %end + +end: +; CHECK-LABEL: end: +; CHECK: %[[PHI:.*]] = phi i64* [ %[[P2_PRE]], %[[CRIT_EDGE]] ], [ %[[P1_PRE]], %for.end ] +; CHECK-NOT: load +; CHECK: store i64 0, i64* %[[PHI]], align 8 + %p2 = load i64*, i64** %ptr, align 8 + store i64 0, i64* %p2, align 8 + ret void +} + +; As above plus there's a branch that's known to be not taken which doesn't need +; a value. +define void @inner_loop_multiple_exit_not_taken_branch(i64** %ptr) { +; CHECK-LABEL: @inner_loop_multiple_exit_not_taken_branch +entry: +; CHECK-LABEL: entry: +; CHECK: br i1 false, label %[[CRIT_EDGE:.*]], label %for.body.preheader + br i1 false, label %end, label %for.body.preheader + +; CHECK: [[CRIT_EDGE]]: +; CHECK-NOT: load +; CHECK: br label %end + +for.body.preheader: +; CHECK-LABEL: for.body.preheader: +; CHECK: %[[P1_PRE:.*]] = load i64*, i64** %ptr, align 8 +; CHECK: %[[P1_VAL_PRE:.*]] = load i64, i64* %[[P1_PRE]], align 8 + br label %for.body + +for.body: +; CHECK-LABEL: for.body: +; CHECK-NOT: load +; CHECK: %cmp = icmp sgt i64 %[[P1_VAL_PRE]], 0 + %p1 = load i64*, i64** %ptr, align 8 + %p1_val = load i64, i64* %p1, align 8 + %cmp = icmp sgt i64 %p1_val, 0 + br i1 %cmp, label %exit1, label %loop.preheader + +loop.preheader: + br label %loop.top + +loop.top: + br i1 undef, label %exit1, label %loop.bottom + +loop.bottom: + br i1 undef, label %exit2, label %loop.top + +exit1: + br label %for.end + +exit2: + br label %for.end + +for.end: + br i1 undef, label %for.body, label %end + +end: +; CHECK-LABEL: end: +; CHECK: %[[PHI:.*]] = phi i64* [ undef, %[[CRIT_EDGE]] ], [ %[[P1_PRE]], %for.end ] +; CHECK-NOT: load +; CHECK: store i64 0, i64* %[[PHI]], align 8 + %p2 = load i64*, i64** %ptr, align 8 + store i64 0, i64* %p2, align 8 + ret void +} Index: test/Transforms/GVN/rle-no-phi-translate.ll =================================================================== --- test/Transforms/GVN/rle-no-phi-translate.ll +++ test/Transforms/GVN/rle-no-phi-translate.ll @@ -1,7 +1,4 @@ ; RUN: opt < %s -gvn -S | FileCheck %s -; XFAIL: * -; FIXME: This should be promotable, but memdep/gvn don't track values -; path/edge sensitively enough. target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i386-apple-darwin7"