Index: lib/Transforms/Scalar/DeadStoreElimination.cpp =================================================================== --- lib/Transforms/Scalar/DeadStoreElimination.cpp +++ lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -1070,20 +1070,6 @@ if (!hasMemoryWrite(Inst, *TLI)) continue; - // eliminateNoopStore will update in iterator, if necessary. - if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL, &InstrOrdering)) { - MadeChange = true; - continue; - } - - // If we find something that writes memory, get its memory dependence. - MemDepResult InstDep = MD->getDependency(Inst); - - // Ignore any store where we can't find a local dependence. - // FIXME: cross-block DSE would be fun. :) - if (!InstDep.isDef() && !InstDep.isClobber()) - continue; - // Figure out what location is being stored to. MemoryLocation Loc = getLocForWrite(Inst, *AA); @@ -1091,6 +1077,9 @@ if (!Loc.Ptr) continue; + // If we find something that writes memory, get its memory dependence. + MemDepResult InstDep = MD->getDependency(Inst); + // Loop until we find a store we can eliminate or a load that // invalidates the analysis. Without an upper bound on the number of // instructions examined, this analysis can become very time-consuming. @@ -1230,6 +1219,7 @@ // Delete the old stores and now-dead instructions that feed them. 
deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, &InstrOrdering); + Inst = nullptr; deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, &InstrOrdering); MadeChange = true; @@ -1258,6 +1248,11 @@ DepWrite->getIterator(), &BB, /*QueryInst=*/ nullptr, &Limit); } + + // Eliminate noop stores last, after any dead intervening stores are gone. + if (Inst /* may be nulled if merged with partial store */ && + eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL, &InstrOrdering)) + MadeChange = true; } if (EnablePartialOverwriteTracking) Index: test/Transforms/DeadStoreElimination/late-noop.ll =================================================================== --- test/Transforms/DeadStoreElimination/late-noop.ll +++ test/Transforms/DeadStoreElimination/late-noop.ll @@ -0,0 +1,48 @@ +; Test to make sure noop stores exposed by removal of intervening stores can be removed, +; even when the original load is in a different block. +; RUN: opt < %s -basicaa -dse -S | FileCheck %s + +define i32 @foo2(i32* %i, i1 %cond) { +; Both stores removed, first dead, second as late noop +B1: +; CHECK-LABEL: B1 +; CHECK-NEXT: %val.i = load i32, i32* %i, align 4 +; CHECK-NEXT: br i1 %cond + %val.i = load i32, i32* %i, align 4 + %newval.i = add i32 %val.i, 1 + store i32 %newval.i, i32* %i, align 4 + store i32 %val.i, i32* %i, align 4 + br i1 %cond, label %B2, label %B3 + +; Both stores removed, first dead, second as late noop +B2: +; CHECK: B2: +; CHECK-NEXT: br label %B4 + %val.i2 = load i32, i32* %i, align 4 + %newval.i2 = add i32 %val.i2, 1 + store i32 %newval.i2, i32* %i, align 4 + store i32 %val.i, i32* %i, align 4 + br 
label %B4 + +; Store not removed as noop due to presence of store in B3 +B4: +; CHECK: B4 +; CHECK-NEXT: store i32 %val.i, i32* %i, align 4 +; CHECK-NEXT: %val.i4 = load i32, i32* %i, align 4 +; CHECK-NEXT: ret i32 %val.i4 + store i32 %val.i, i32* %i, align 4 + %val.i4 = load i32, i32* %i, align 4 + ret i32 %val.i4 +} Index: test/Transforms/DeadStoreElimination/simple.ll =================================================================== --- test/Transforms/DeadStoreElimination/simple.ll +++ test/Transforms/DeadStoreElimination/simple.ll @@ -521,3 +521,19 @@ store i32 0, i32* %p ret void } + +; Basic late noop removal +; CHECK-LABEL: @test36( +; CHECK-NEXT: %val.i2 = load i32, i32* %i, align 4 +; CHECK-NEXT: ret +define i32 @test36(i32* %i) { + %val.i = load i32, i32* %i, align 4 + %newval.i1 = add i32 %val.i, 1 + store i32 %newval.i1, i32* %i, align 4 + %newval.i2 = add i32 %newval.i1, 1 + store i32 %newval.i2, i32* %i, align 4 + store i32 %val.i, i32* %i, align 4 + %val.i2 = load i32, i32* %i, align 4 + ret i32 %val.i2 +} +