This is an archive of the discontinued LLVM Phabricator instance.

[LSV] Insert stores at the right point.
ClosedPublic

Authored by jlebar on Jul 19 2016, 2:18 PM.

Download Raw Diff

Details

Reviewers

asbirlea
• tstellarAMD

Commits

rG8778c626297c: [LSV] Insert stores at the right point.
rL276056: [LSV] Insert stores at the right point.

Summary

Previously, the insertion point for stores was the last instruction in
Chain *before calling getVectorizablePrefixEndIdx*. Thus if
getVectorizablePrefixEndIdx didn't return Chain.size(), we still would
insert at the last instruction in Chain.

This patch changes our internal API a bit in an attempt to make it less
prone to this sort of error. As a result, we end up recalculating the
Chain's boundary instructions, but I think worrying about the speed hit
of this is a premature optimization right now.

Diff Detail

Event Timeline

jlebar updated this revision to Diff 64563.Jul 19 2016, 2:18 PM

jlebar retitled this revision to "[LSV] Insert stores at the right point.".

jlebar updated this object.

jlebar added a reviewer: asbirlea.

jlebar added subscribers: llvm-commits, arsenm.

Herald added a reviewer: • tstellarAMD. · View Herald TranscriptJul 19 2016, 2:18 PM

Herald added a subscriber: mzolotukhin. · View Herald Transcript

asbirlea added inline comments.Jul 19 2016, 3:23 PM

lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
772	Can you add a FIXME to mark the possible performance penalty for calling this again? Not a concern right now, but it would be good to keep a record of it.

I like how this refactoring makes the bug resolved by the next patch more obvious.

This revision is now accepted and ready to land.Jul 19 2016, 3:27 PM

Update comment.

Closed by commit rL276056: [LSV] Insert stores at the right point. (authored by jlebar). · Explain WhyJul 19 2016, 4:26 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Transforms/

Vectorize/

LoadStoreVectorizer.cpp

55 lines

test/

Transforms/

LoadStoreVectorizer/

AMDGPU/

insertion-point.ll

31 lines

Diff 64563

lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Show First 20 Lines • Show All 98 Lines • ▼ Show 20 Lines	private:

/// "Legalize" the vector type that would be produced by combining \p		/// "Legalize" the vector type that would be produced by combining \p
/// ElementSizeBits elements in \p Chain. Break into two pieces such that the		/// ElementSizeBits elements in \p Chain. Break into two pieces such that the
/// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is		/// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
/// expected to have more than 4 elements.		/// expected to have more than 4 elements.
std::pair<ArrayRef<Value *>, ArrayRef<Value *>>		std::pair<ArrayRef<Value *>, ArrayRef<Value *>>
splitOddVectorElts(ArrayRef<Value *> Chain, unsigned ElementSizeBits);		splitOddVectorElts(ArrayRef<Value *> Chain, unsigned ElementSizeBits);

/// Checks for instructions which may affect the memory accessed		/// Finds the largest prefix of Chain that's vectorizable, checking for
/// in the chain between \p From and \p To. Returns Index, where		/// intervening instructions which may affect the memory accessed by the
/// \p Chain[0, Index) is the largest vectorizable chain prefix.		/// instructions within Chain.
/// The elements of \p Chain should be all loads or all stores.		///
unsigned getVectorizablePrefixEndIdx(ArrayRef<Value *> Chain,		/// The elements of \p Chain must be all loads or all stores.
BasicBlock::iterator From,		ArrayRef<Value *> getVectorizablePrefix(ArrayRef<Value *> Chain);
BasicBlock::iterator To);

/// Collects load and store instructions to vectorize.		/// Collects load and store instructions to vectorize.
std::pair<ValueListMap, ValueListMap> collectInstructions(BasicBlock *BB);		std::pair<ValueListMap, ValueListMap> collectInstructions(BasicBlock *BB);

/// Processes the collected instructions, the \p Map. The elements of \p Map		/// Processes the collected instructions, the \p Map. The elements of \p Map
/// should be all loads or all stores.		/// should be all loads or all stores.
bool vectorizeChains(ValueListMap &Map);		bool vectorizeChains(ValueListMap &Map);

▲ Show 20 Lines • Show All 297 Lines • ▼ Show 20 Lines	Vectorizer::splitOddVectorElts(ArrayRef<Value *> Chain,
unsigned ElementSizeBits) {		unsigned ElementSizeBits) {
unsigned ElemSizeInBytes = ElementSizeBits / 8;		unsigned ElemSizeInBytes = ElementSizeBits / 8;
unsigned SizeInBytes = ElemSizeInBytes * Chain.size();		unsigned SizeInBytes = ElemSizeInBytes * Chain.size();
unsigned NumRight = (SizeInBytes % 4) / ElemSizeInBytes;		unsigned NumRight = (SizeInBytes % 4) / ElemSizeInBytes;
unsigned NumLeft = Chain.size() - NumRight;		unsigned NumLeft = Chain.size() - NumRight;
return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));		return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}		}

unsigned Vectorizer::getVectorizablePrefixEndIdx(ArrayRef<Value *> Chain,		ArrayRef<Value *> Vectorizer::getVectorizablePrefix(ArrayRef<Value *> Chain) {
BasicBlock::iterator From,
BasicBlock::iterator To) {
SmallVector<std::pair<Value *, unsigned>, 16> MemoryInstrs;		SmallVector<std::pair<Value *, unsigned>, 16> MemoryInstrs;
SmallVector<std::pair<Value *, unsigned>, 16> ChainInstrs;		SmallVector<std::pair<Value *, unsigned>, 16> ChainInstrs;

unsigned InstrIdx = 0;		unsigned InstrIdx = 0;
for (Instruction &I : make_range(From, To)) {		for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
++InstrIdx;		++InstrIdx;
if (isa<LoadInst>(I) || isa<StoreInst>(I)) {		if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
if (!is_contained(Chain, &I))		if (!is_contained(Chain, &I))
MemoryInstrs.push_back({&I, InstrIdx});		MemoryInstrs.push_back({&I, InstrIdx});
else		else
ChainInstrs.push_back({&I, InstrIdx});		ChainInstrs.push_back({&I, InstrIdx});
} else if (I.mayHaveSideEffects()) {		} else if (I.mayHaveSideEffects()) {
DEBUG(dbgs() << "LSV: Found side-effecting operation: " << I << '\n');		DEBUG(dbgs() << "LSV: Found side-effecting operation: " << I << '\n');
return 0;		return 0;
}		}
}		}

assert(Chain.size() == ChainInstrs.size() &&		assert(Chain.size() == ChainInstrs.size() &&
"All instructions in the Chain must exist in [From, To).");		"All instrs in Chain must be within range getBoundaryInstrs(Chain).");

unsigned ChainIdx = 0;		unsigned ChainIdx = 0;
for (auto EntryChain : ChainInstrs) {		for (auto EntryChain : ChainInstrs) {
Value *ChainInstrValue = EntryChain.first;		Value *ChainInstrValue = EntryChain.first;
unsigned ChainInstrIdx = EntryChain.second;		unsigned ChainInstrIdx = EntryChain.second;
for (auto EntryMem : MemoryInstrs) {		for (auto EntryMem : MemoryInstrs) {
Value *MemInstrValue = EntryMem.first;		Value *MemInstrValue = EntryMem.first;
unsigned MemInstrIdx = EntryMem.second;		unsigned MemInstrIdx = EntryMem.second;
Show All 25 Lines	for (auto EntryMem : MemoryInstrs) {
"Aliasing instruction and pointer:\n"		"Aliasing instruction and pointer:\n"
<< *MemInstrValue << '\n'		<< *MemInstrValue << '\n'
<< *Ptr0 << '\n'		<< *Ptr0 << '\n'
<< "Aliased instruction and pointer:\n"		<< "Aliased instruction and pointer:\n"
<< *ChainInstrValue << '\n'		<< *ChainInstrValue << '\n'
<< *Ptr1 << '\n';		<< *Ptr1 << '\n';
});		});

return ChainIdx;		return Chain.slice(0, ChainIdx);
}		}
}		}
ChainIdx++;		ChainIdx++;
}		}
return Chain.size();		return Chain;
}		}

std::pair<ValueListMap, ValueListMap>		std::pair<ValueListMap, ValueListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {		Vectorizer::collectInstructions(BasicBlock *BB) {
ValueListMap LoadRefs;		ValueListMap LoadRefs;
ValueListMap StoreRefs;		ValueListMap StoreRefs;

for (Instruction &I : *BB) {		for (Instruction &I : *BB) {
▲ Show 20 Lines • Show All 185 Lines • ▼ Show 20 Lines	bool Vectorizer::vectorizeStoreChain(
unsigned VF = VecRegSize / Sz;		unsigned VF = VecRegSize / Sz;
unsigned ChainSize = Chain.size();		unsigned ChainSize = Chain.size();

if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {		if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
InstructionsProcessed->insert(Chain.begin(), Chain.end());		InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;		return false;
}		}

BasicBlock::iterator First, Last;		ArrayRef<Value *> NewChain = getVectorizablePrefix(Chain);
std::tie(First, Last) = getBoundaryInstrs(Chain);		if (NewChain.empty()) {
unsigned StopChain = getVectorizablePrefixEndIdx(Chain, First, Last);
if (StopChain == 0) {
// There exists a side effect instruction, no vectorization possible.		// There exists a side effect instruction, no vectorization possible.
InstructionsProcessed->insert(Chain.begin(), Chain.end());		InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;		return false;
}		}
if (StopChain == 1) {		if (NewChain.size() == 1) {
// Failed after the first instruction. Discard it and try the smaller chain.		// Failed after the first instruction. Discard it and try the smaller chain.
InstructionsProcessed->insert(Chain.front());		InstructionsProcessed->insert(NewChain.front());
return false;		return false;
}		}

// Update Chain to the valid vectorizable subchain.		// Update Chain to the valid vectorizable subchain.
Chain = Chain.slice(0, StopChain);		Chain = NewChain;
ChainSize = Chain.size();		ChainSize = Chain.size();

// Store size should be 1B, 2B or multiple of 4B.		// Store size should be 1B, 2B or multiple of 4B.
// TODO: Target hook for size constraint?		// TODO: Target hook for size constraint?
unsigned SzInBytes = (Sz / 8) * ChainSize;		unsigned SzInBytes = (Sz / 8) * ChainSize;
if (SzInBytes > 2 && SzInBytes % 4 != 0) {		if (SzInBytes > 2 && SzInBytes % 4 != 0) {
DEBUG(dbgs() << "LSV: Size should be 1B, 2B "		DEBUG(dbgs() << "LSV: Size should be 1B, 2B "
"or multiple of 4B. Splitting.\n");		"or multiple of 4B. Splitting.\n");
▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) {		if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) {
AI->setAlignment(TargetBaseAlign);		AI->setAlignment(TargetBaseAlign);
Alignment = TargetBaseAlign;		Alignment = TargetBaseAlign;
} else {		} else {
return false;		return false;
}		}
}		}

// Set insert point.		BasicBlock::iterator First, Last;
		std::tie(First, Last) = getBoundaryInstrs(Chain);
		asbirlea (inline comment, marked Done): Can you add a FIXME to mark the possible performance penalty for calling this again? Not a concern right now, but it would be good to keep a record of it.
Builder.SetInsertPoint(&*Last);		Builder.SetInsertPoint(&*Last);

Value *Vec = UndefValue::get(VecTy);		Value *Vec = UndefValue::get(VecTy);

if (VecStoreTy) {		if (VecStoreTy) {
unsigned VecWidth = VecStoreTy->getNumElements();		unsigned VecWidth = VecStoreTy->getNumElements();
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {		for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
StoreInst *Store = cast<StoreInst>(Chain[I]);		StoreInst *Store = cast<StoreInst>(Chain[I]);
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	bool Vectorizer::vectorizeLoadChain(
unsigned VF = VecRegSize / Sz;		unsigned VF = VecRegSize / Sz;
unsigned ChainSize = Chain.size();		unsigned ChainSize = Chain.size();

if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {		if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
InstructionsProcessed->insert(Chain.begin(), Chain.end());		InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;		return false;
}		}

BasicBlock::iterator First, Last;		ArrayRef<Value *> NewChain = getVectorizablePrefix(Chain);
std::tie(First, Last) = getBoundaryInstrs(Chain);		if (NewChain.empty()) {
unsigned StopChain = getVectorizablePrefixEndIdx(Chain, First, Last);
if (StopChain == 0) {
// There exists a side effect instruction, no vectorization possible.		// There exists a side effect instruction, no vectorization possible.
InstructionsProcessed->insert(Chain.begin(), Chain.end());		InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;		return false;
}		}
if (StopChain == 1) {		if (NewChain.size() == 1) {
// Failed after the first instruction. Discard it and try the smaller chain.		// Failed after the first instruction. Discard it and try the smaller chain.
InstructionsProcessed->insert(Chain.front());		InstructionsProcessed->insert(NewChain.front());
return false;		return false;
}		}

// Update Chain to the valid vectorizable subchain.		// Update Chain to the valid vectorizable subchain.
Chain = Chain.slice(0, StopChain);		Chain = NewChain;
ChainSize = Chain.size();		ChainSize = Chain.size();

// Load size should be 1B, 2B or multiple of 4B.		// Load size should be 1B, 2B or multiple of 4B.
// TODO: Should size constraint be a target hook?		// TODO: Should size constraint be a target hook?
unsigned SzInBytes = (Sz / 8) * ChainSize;		unsigned SzInBytes = (Sz / 8) * ChainSize;
if (SzInBytes > 2 && SzInBytes % 4 != 0) {		if (SzInBytes > 2 && SzInBytes % 4 != 0) {
DEBUG(dbgs() << "LSV: Size should be 1B, 2B "		DEBUG(dbgs() << "LSV: Size should be 1B, 2B "
"or multiple of 4B. Splitting.\n");		"or multiple of 4B. Splitting.\n");
▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Lines	bool Vectorizer::vectorizeLoadChain(
}		}

DEBUG({		DEBUG({
dbgs() << "LSV: Loads to vectorize:\n";		dbgs() << "LSV: Loads to vectorize:\n";
for (Value *V : Chain)		for (Value *V : Chain)
V->dump();		V->dump();
});		});

// Set insert point.		BasicBlock::iterator First, Last;
		std::tie(First, Last) = getBoundaryInstrs(Chain);
Builder.SetInsertPoint(&*First);		Builder.SetInsertPoint(&*First);

Value *Bitcast =		Value *Bitcast =
Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));		Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));

LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));		LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
propagateMetadata(LI, Chain);		propagateMetadata(LI, Chain);
LI->setAlignment(Alignment);		LI->setAlignment(Alignment);
▲ Show 20 Lines • Show All 66 Lines • Show Last 20 Lines

test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll

; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s		; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"		target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

; Check relative position of the inserted vector load relative to the existing		; Check position of the inserted vector load/store. Vectorized loads should be
; adds. Vectorized loads should be inserted at the position of the first load.		; inserted at the position of the first load in the chain, and stores should be
		; inserted at the position of the last store.

; CHECK-LABEL: @insert_load_point(		; CHECK-LABEL: @insert_load_point(
; CHECK: %z = add i32 %x, 4		; CHECK: %z = add i32 %x, 4
; CHECK: load <2 x float>		; CHECK: load <2 x float>
; CHECK: %w = add i32 %y, 9		; CHECK: %w = add i32 %y, 9
; CHECK: %foo = add i32 %z, %w		; CHECK: %foo = add i32 %z, %w
define void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {		define void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
entry:		entry:
Show All 39 Lines	entry:
%foo = add i32 %z, %w		%foo = add i32 %z, %w

%add = fadd float %ld.c, %ld.c.idx.1		%add = fadd float %ld.c, %ld.c.idx.1
store float %add, float addrspace(1)* %b, align 4		store float %add, float addrspace(1)* %b, align 4
store i32 %foo, i32 addrspace(3)* null, align 4		store i32 %foo, i32 addrspace(3)* null, align 4
ret void		ret void
}		}

		; Here we have four stores, with an aliasing load before the last one. We can
		; vectorize the first two stores as <2 x float>, but this vectorized store must
		; be inserted at the location of the second scalar store, not the fourth one.
		;
		; CHECK-LABEL: @insert_store_point_alias
		; CHECK: store <2 x float>
		; CHECK: store float
		; CHECK-SAME: %a.idx.2
		; CHECK: load float, float addrspace(1)* %a.idx.2
		; CHECK: store float
		; CHECK-SAME: %a.idx.3
		define float @insert_store_point_alias(float addrspace(1)* nocapture %a, i64 %idx) {
		%a.idx = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
		%a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a.idx, i64 1
		%a.idx.2 = getelementptr inbounds float, float addrspace(1)* %a.idx.1, i64 1
		%a.idx.3 = getelementptr inbounds float, float addrspace(1)* %a.idx.2, i64 1

		store float 0.0, float addrspace(1)* %a.idx, align 4
		store float 0.0, float addrspace(1)* %a.idx.1, align 4
		store float 0.0, float addrspace(1)* %a.idx.2, align 4
		%x = load float, float addrspace(1)* %a.idx.2, align 4
		store float 0.0, float addrspace(1)* %a.idx.3, align 4

		ret float %x
		}

attributes #0 = { nounwind }		attributes #0 = { nounwind }