
Allow DeadStoreElimination to track combinations of partial later writes
ClosedPublic

Authored by hfinkel on Mar 29 2016, 5:41 PM.

Details

Summary

DeadStoreElimination can currently remove a small store rendered unnecessary by a later larger one, but cannot remove a larger store rendered unnecessary by a series of later smaller ones. This patch aims to rectify that.

It works by keeping an IntervalMap for each store later overwritten only partially, and filling in that interval map as more such stores are discovered. No additional walking or aliasing queries are added. If the IntervalMap forms an interval covering the entire earlier store, then it is dead and can be removed.
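As a rough model of the idea (a sketch only, with illustrative names; the patch's actual code in DeadStoreElimination.cpp differs in detail), later partial writes are merged into non-overlapping half-open byte intervals, and the earlier store is dead once a single merged interval spans it:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>

// Sketch: track which bytes of an earlier store have been overwritten by
// later, smaller stores. Intervals are half-open [start, end) and kept
// non-overlapping by merging on insert.
struct OverwriteTracker {
  std::map<int64_t, int64_t> Intervals; // start -> end

  void addLaterWrite(int64_t Start, int64_t End) {
    // Pull in any existing interval that overlaps or abuts [Start, End).
    auto It = Intervals.lower_bound(Start);
    if (It != Intervals.begin() && std::prev(It)->second >= Start)
      --It;
    while (It != Intervals.end() && It->first <= End) {
      Start = std::min(Start, It->first);
      End = std::max(End, It->second);
      It = Intervals.erase(It);
    }
    Intervals[Start] = End;
  }

  // The earlier store [EarlierStart, EarlierEnd) is dead iff one merged
  // interval covers it entirely.
  bool coversEarlier(int64_t EarlierStart, int64_t EarlierEnd) const {
    auto It = Intervals.upper_bound(EarlierStart);
    if (It == Intervals.begin())
      return false;
    --It;
    return It->first <= EarlierStart && It->second >= EarlierEnd;
  }
};
```

For example, an 8-byte store at offset 0 is covered once later 4-byte stores have filled [0, 4) and [4, 8).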

I discovered this problem when investigating a performance issue with code like this on PowerPC:

#include <complex>
using namespace std;

complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
  return bar(C)*C;
}

which produces this:

define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
  %ref.tmp = alloca i64, align 8
  %tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
  %c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
  %c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
  %0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
  %c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
  %1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
  call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
  %2 = bitcast %"struct.std::complex"* %agg.result to i64*
  %3 = load i64, i64* %ref.tmp, align 8
  store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE *****
  %_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
  %4 = lshr i64 %3, 32
  %5 = trunc i64 %4 to i32
  %6 = bitcast i32 %5 to float
  %_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
  %7 = trunc i64 %3 to i32
  %8 = bitcast i32 %7 to float
  %mul_ad.i.i = fmul fast float %6, %1
  %mul_bc.i.i = fmul fast float %8, %0
  %mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
  %mul_ac.i.i = fmul fast float %6, %0
  %mul_bd.i.i = fmul fast float %8, %1
  %mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
  store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
  store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
  ret void
}

The problem here is not just that the i64 store is unnecessary, but also that it blocks further backend optimization of the other uses of that i64 value.

For the interval map's value type, I'm currently using std::tuple<> (because it is an empty type).

Diff Detail

Repository
rL LLVM

Event Timeline

hfinkel updated this revision to Diff 52008.Mar 29 2016, 5:41 PM
hfinkel retitled this revision from to Allow DeadStoreElimination to track combinations of partial later writes.
hfinkel updated this object.
hfinkel added a subscriber: llvm-commits.

This is great! This actually fixes a problem case I ran into some months ago in the wild, where a memset wasn't being wiped away despite being fully overwritten by the stores following it. Example shown below:

%struct.foostruct = type {
  i32 (i8*, i8**, i32, i8, i8*)*,
  i32 (i8*, i8**, i32, i8, i8*)*,
  i32 (i8*, i8**, i32, i8, i8*)*,
  i32 (i8*, i8**, i32, i8, i8*)*,
  void (i8*, i32, i32)*
}
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
declare void @goFunc(%struct.foostruct*)

define void @func()  {
  %bang = alloca %struct.foostruct, align 8
  %1 = bitcast %struct.foostruct* %bang to i8*
  call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 40, i32 8, i1 false)  <------    This is now removed
  %2 = getelementptr inbounds %struct.foostruct, %struct.foostruct* %bang, i64 0, i32 0
  store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** %2, align 8
  %3 = getelementptr inbounds %struct.foostruct, %struct.foostruct* %bang, i64 0, i32 1
  store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** %3, align 8
  %4 = getelementptr inbounds %struct.foostruct, %struct.foostruct* %bang, i64 0, i32 2
  store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** %4, align 8
  %5 = getelementptr inbounds %struct.foostruct, %struct.foostruct* %bang, i64 0, i32 3
  store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** %5, align 8
  %6 = getelementptr inbounds %struct.foostruct, %struct.foostruct* %bang, i64 0, i32 4
  store void (i8*, i32, i32)* null, void (i8*, i32, i32)** %6, align 8
  call void @goFunc(%struct.foostruct* %bang)
  ret void
}
mcrosier edited edge metadata.Mar 31 2016, 6:13 AM

Looks reasonable to me, but you may want to wait for additional reviews. Comments/questions inline.

lib/Transforms/Scalar/DeadStoreElimination.cpp
440 ↗(On Diff #52008)

not even -> never

442 ↗(On Diff #52008)

Should this be <= rather than <?

hfinkel added inline comments.Mar 31 2016, 9:43 AM
lib/Transforms/Scalar/DeadStoreElimination.cpp
442 ↗(On Diff #52008)

I don't think so. If LaterOff == EarlierOff + Earlier.Size, then the later store starts on the byte just after the earlier one ends.
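Put differently, the earlier store covers the half-open byte range [EarlierOff, EarlierOff + Size), so a later write whose offset equals that end touches none of the earlier store's bytes. A minimal illustration (function name made up for the example):

```cpp
#include <cassert>
#include <cstdint>

// The earlier store occupies the half-open range
// [EarlierOff, EarlierOff + EarlierSize); a later write starting at
// LaterOff lands inside it only when LaterOff < EarlierOff + EarlierSize,
// which is why < rather than <= is the right comparison.
bool laterStartsInsideEarlier(int64_t EarlierOff, int64_t EarlierSize,
                              int64_t LaterOff) {
  return LaterOff >= EarlierOff && LaterOff < EarlierOff + EarlierSize;
}
```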

dberlin edited edge metadata.Mar 31 2016, 9:55 AM
dberlin added a subscriber: dberlin.

This looks reasonable, assuming creating a bunch of IntervalMaps is not that expensive.

eeckstein added inline comments.Mar 31 2016, 11:13 AM
lib/Transforms/Scalar/DeadStoreElimination.cpp
457 ↗(On Diff #52008)

Shouldn't this be
if (ILI != IM.end() && ILI.stop() < LaterIntEnd) {

If yes, then what if the existing interval completely covers the Later-interval? I.e. if ILI.stop() >= LaterIntEnd

hfinkel added inline comments.Mar 31 2016, 12:47 PM
lib/Transforms/Scalar/DeadStoreElimination.cpp
457 ↗(On Diff #52008)

We want to know if we have an interval that overlaps with the current one. IM.find(LaterIntStart) will return the first interval that ends at or after the current one's start. Next we need to check if the interval starts before the current one's end (to see whether it overlaps the current one, or is purely after it). Checking whether it stops before the current one ends is insufficient because it will miss the case where the interval crosses the current one's end point. I'll improve the comments.
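A rough model of that check (a sorted vector of non-overlapping half-open intervals stands in for IntervalMap here; this is a sketch, not the patch's code):

```cpp
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using Interval = std::pair<int64_t, int64_t>; // half-open [start, stop)

// Model of IntervalMap::find(X) as described above: among sorted,
// non-overlapping intervals, return the first whose stop is at or after X,
// i.e. the first candidate to overlap a range starting at X.
const Interval *findFirstEndingAtOrAfter(const std::vector<Interval> &Sorted,
                                         int64_t X) {
  for (const Interval &I : Sorted)
    if (I.second >= X)
      return &I;
  return nullptr;
}

// The candidate reaches into [Start, End) iff it begins before End. Note
// that comparing the candidate's stop against End instead would miss an
// interval that completely contains [Start, End). Abutting intervals also
// count here, which is fine for DSE since adjacent writes get merged anyway.
bool overlapsOrAbuts(const std::vector<Interval> &Sorted, int64_t Start,
                     int64_t End) {
  const Interval *I = findFirstEndingAtOrAfter(Sorted, Start);
  return I && I->first < End;
}
```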

eeckstein accepted this revision.Mar 31 2016, 1:04 PM
eeckstein edited edge metadata.

LGTM (but please wait for other reviewers)

lib/Transforms/Scalar/DeadStoreElimination.cpp
457 ↗(On Diff #52008)

Oh, I see. I misunderstood the find() function.

This revision is now accepted and ready to land.Mar 31 2016, 1:04 PM
chandlerc edited edge metadata.Mar 31 2016, 6:56 PM

Minor comments below about the data structure, but I really wonder whether IntervalMap is the right tool for the job here. It seems awfully heavyweight, although it is clearly very efficient... And it seems to be fighting you on several fronts:

  1. You want a set, not a map.
  2. You can't insert overlapping ranges.
  3. The sparseness isn't likely to be useful as you're mostly interested in fairly small memory regions.

What about using a DenseMap<Instruction *, BitVector> and setting one bit per byte? My thoughts:

  • No complexity around overlaps.
  • No extra allocation for 64-byte and smaller regions. Pretty minimal allocation even for larger regions, up to 512 bytes no problem.
  • Super simple implementation, probably much faster for small regions to just set bits and test them.

The major downside I see is that it doesn't scale gracefully to *large* memory regions like a memset might hit...

What do you think? Maybe set an upper bound on size (512 bytes seems a good bound, keeps the BitVector on a cache line) and use BitVector? Is it worth the complexity of using a BitVector when small and an IntervalMap when large?
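The BitVector alternative being proposed might look roughly like this (a sketch with std::vector<bool> standing in for LLVM's BitVector, per-store plumbing omitted, and illustrative names throughout):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// One bit per byte of the earlier store; each later write sets the bits
// for the bytes it overwrites. The earlier store is dead once every bit
// is set. In-tree this would hang off a DenseMap<Instruction *, BitVector>.
struct ByteCoverage {
  std::vector<bool> Covered;

  explicit ByteCoverage(size_t EarlierSize) : Covered(EarlierSize, false) {}

  // Mark bytes [Off, Off + Len) as overwritten, clamped to the store size.
  void markOverwritten(size_t Off, size_t Len) {
    for (size_t I = Off; I < Off + Len && I < Covered.size(); ++I)
      Covered[I] = true;
  }

  bool fullyOverwritten() const {
    for (bool B : Covered)
      if (!B)
        return false;
    return true;
  }
};
```

Setting and testing bits is linear in the region size, which is where the concern about large memset regions comes from.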

lib/Transforms/Scalar/DeadStoreElimination.cpp
348–349 ↗(On Diff #52008)

Ugh, so you really want an interval *set*. Is it too hard to build one?

444–448 ↗(On Diff #52008)

Find first, and assign the iterator on insert? It'd be particularly nice to add a try_emplace to DenseMap so that you can do this with a single lookup. =/

----- Original Message -----

From: "Chandler Carruth" <chandlerc@gmail.com>
To: hfinkel@anl.gov, echristo@gmail.com, igor@azulsystems.com, mcrosier@codeaurora.org, dberlin@dberlin.org,
eeckstein@apple.com
Cc: jvanadrighem@gmail.com, junbuml@codeaurora.org, llvm-commits@lists.llvm.org
Sent: Thursday, March 31, 2016 8:56:37 PM
Subject: Re: [PATCH] D18586: Allow DeadStoreElimination to track combinations of partial later writes

chandlerc added a comment.

Minor comments below about the data structure, but I really wonder
whether IntervalMap is the right tool for the job here. It seems
awfully heavyweight, although it is clearly very efficient... And it
seems to be fighting you on several fronts:

  1. You want a set, not a map.
  2. You can't insert overlapping ranges.
  3. The sparseness isn't likely to be useful as you're mostly interested in fairly small memory regions.

What about using a DenseMap<Instruction *, BitVector> and setting one bit per byte? My thoughts:

  • No complexity around overlaps.
  • No extra allocation for 64-byte and smaller regions. Pretty minimal allocation even for larger regions, up to 512 bytes no problem.
  • Super simple implementation, probably much faster for small regions to just set bits and test them.

The major downside I see is that it doesn't scale gracefully to *large* memory regions like a memset might hit...

What if I used a SparseBitVector? Perhaps that's the best of both worlds.

-Hal

What do you think? Maybe set an upper bound on size (512 bytes seems a good bound, keeps the BitVector on a cache line) and use BitVector? Is it worth the complexity of using a BitVector when small and an IntervalMap when large?

================
Comment at: lib/Transforms/Scalar/DeadStoreElimination.cpp:348-349
@@ -337,1 +347,4 @@

+typedef IntervalMap<int64_t, std::tuple<>, 4,
+                    IntervalMapHalfOpenInfo<int64_t>> OverlapIntervalsTy;
+typedef DenseMap<Instruction *, OverlapIntervalsTy> InstOverlapIntervalsTy;


Ugh, so you really want an interval *set*. Is it too hard to build one?

================
Comment at: lib/Transforms/Scalar/DeadStoreElimination.cpp:444-448
@@ +443,7 @@
+               int64_t(LaterOff + Later.Size) >= EarlierOff) {
+    if (!IOL.count(DepWrite))
+      IOL.insert(std::make_pair(DepWrite, OverlapIntervalsTy(OLAlloc)));
+
+    // Insert our part of the overlap into the map.
+    auto &IM = IOL.find(DepWrite)->second;
+    DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff << ", " <<


Find first, and assign the iterator on insert? It'd be particularly nice to add a try_emplace to DenseMap so that you can do this with a single lookup. =/

http://reviews.llvm.org/D18586

----- Original Message -----

From: "Chandler Carruth" <chandlerc@gmail.com>
The major downside I see is that it doesn't scale gracefully to *large* memory regions like a memset might hit...

What if I used a SparseBitVector? Perhaps that's the best of both worlds.

I think not sadly, at least if I'm thinking about this right...

SparseBitVector makes setting a large range, or testing whether all bits are set, significantly more expensive than a normal BitVector when there are many bits. With either bit vector the cost is linear in the number of bits set, whereas with IntervalMap it is constant in the size of the interval, with a very nice logarithmic factor in the number of disjoint intervals.

In the worst case for the IntervalMap, we would do N inserts for N stores, and if all were disjoint, each insert would cost log(N), so the total would be N*log(N).

In the worst case for either BitVector, we would do N inserts of the same M - 1 bits for an M-byte object, which would be N * M. So when M becomes large, this will become quite slow. Being sparse doesn't really help here either, as SparseBitVector is sparse in the zero bits, not the set bits.

mcrosier resigned from this revision.May 26 2016, 6:49 AM
mcrosier removed a reviewer: mcrosier.
mcrosier removed a subscriber: mcrosier.
hfinkel updated this revision to Diff 59990.Jun 7 2016, 8:53 PM
hfinkel edited edge metadata.

Rebased, and replaced the use of IntervalMap with std::map. The reviewers are certainly right: we gain little by using an IntervalMap here (because IntervalMap does not handle overlapping inserts, so we end up doing the merging ourselves anyway).

In addition to the reviews here, I also discussed this with folks offline, and the conclusion was to go with a map or some general data structure at first. If we see a compile-time hit from this related to small stores, we can always add code to use a bitmap for small stores.

hfinkel updated this revision to Diff 59992.Jun 7 2016, 9:05 PM

Add a test case involving memset, provided in the review by JakeVanAdrighem.

Ping (Chandler, are you okay with this now?)

junbuml added inline comments.Jun 16 2016, 11:01 AM
lib/Transforms/Scalar/DeadStoreElimination.cpp
406 ↗(On Diff #59992)

Don't you need to do:

LaterIntEnd = std::max(LaterIntEnd, ILI->first);

before erasing ILI, in case ILI->first is larger than LaterIntEnd?

413 ↗(On Diff #59992)

I think there is no need to assign to ILI here.

hfinkel added inline comments.Jun 16 2016, 11:29 AM
lib/Transforms/Scalar/DeadStoreElimination.cpp
406 ↗(On Diff #59992)

Thanks for catching that.

413 ↗(On Diff #59992)

Indeed.

chandlerc accepted this revision.Jun 16 2016, 2:13 PM
chandlerc edited edge metadata.

Rebased, and replaced the use of IntervalMap with std::map. The reviewers are certainly right: we gain little by using an IntervalMap here (because IntervalMap does not handle overlapping inserts, so we end up doing the merging ourselves anyway).

In addition to the reviews here, I also discussed this with folks offline, and the conclusion was to go with a map or some general data structure at first. If we see a compile-time hit from this related to small stores, we can always add code to use a bitmap for small stores.

I think I'm OK with this for now. I like the idea of using a simpler data structure at first. But please add a comment to the data structure that explains the semantics (especially the surprising {end, start} representation; I had to read a fair amount of code to even see this) and captures a lot of the discussion we've had about the tradeoffs between different data structures, in case you or I aren't the ones to try to improve this later.
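For reference, the {end, start} scheme can be sketched as follows (a simplified reconstruction from this discussion, not the committed code): keying the map by interval end means lower_bound(Start) lands on the first existing interval whose end reaches the new interval's start, i.e. the first merge candidate.

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>

// Intervals stored as end -> start (half-open [start, end)), per the
// {end, start} representation discussed in the review.
using OverlapIntervals = std::map<int64_t, int64_t>;

void insertInterval(OverlapIntervals &IM, int64_t Start, int64_t End) {
  auto ILI = IM.lower_bound(Start);
  // Absorb every existing interval that overlaps or abuts [Start, End),
  // widening both ends as needed before re-inserting the union.
  while (ILI != IM.end() && ILI->second <= End) {
    Start = std::min(Start, ILI->second);
    End = std::max(End, ILI->first);
    ILI = IM.erase(ILI);
  }
  IM[End] = Start;
}

// The earlier store [EStart, EEnd) is dead iff a single interval covers it.
bool covers(const OverlapIntervals &IM, int64_t EStart, int64_t EEnd) {
  auto ILI = IM.lower_bound(EEnd);
  return ILI != IM.end() && ILI->second <= EStart && ILI->first >= EEnd;
}
```

Widening End via std::max(End, ILI->first) is exactly the case junbuml's inline comment caught: an absorbed interval may extend past the new write's end.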

Everything below is further thoughts that might be distilled into comments, not anything I'm asking for right now:

My suspicion is that a sorted vector would work better than a map here. Among other things, it is very easy to have the intervals be represented in a natural way within the vector, and merely use a custom comparison to achieve the sorting behavior you want. Additionally, the overhead for the std::map will be roughly the same as the value size which is always an unfortunate tradeoff.

But as I said I'm happy for you to look at this as a follow-up patch. It shouldn't block the review as the code will be largely the same at this point, and the current version is almost certainly the simplest and most basic approach so it seems a good starting position.

From the discussion, I suspect the eventual end state will be to use a BitVector for "small" regions and a sorted vector for "large" regions. But that will definitely be quite complex, so I wouldn't want to go there until we have some useful test cases that show serious compile-time hits here.

Thanks again,
-Chandler

This revision was automatically updated to reflect the committed changes.