This is an archive of the discontinued LLVM Phabricator instance.

[TTI] Fix default costs for interleaved accesses
ClosedPublic

Authored by sbaranga on Aug 3 2015, 5:43 AM.

Download Raw Diff

Details

Reviewers

Commits

rG1becccb10f8a: [TTI] Fix default costs for interleaved accesses
rL243875: [TTI] Fix default costs for interleaved accesses

Summary

Modify the cost calculation function for interleaved accesses
to use the target-specific costs for insert/extract element and
memory operations.

This better models the case where the backend can't match
the interleaved group, and we are forced to use a wide load
and shuffle vectors.

Interleaved accesses are not enabled by default, so this shouldn't
cause a performance change.

Diff Detail

Event Timeline

sbaranga updated this revision to Diff 31226. Aug 3 2015, 5:43 AM

sbaranga retitled this revision to "[TTI] Fix default costs for interleaved accesses" (it previously had no title).

sbaranga updated this object.

sbaranga added a subscriber: llvm-commits.

No tests were added. It seems any tests for this would be very easy to break.

This looks like a mechanical change to me. I don't like the static_casts all over the place, but they're used extensively elsewhere in the file, so I can't see an issue with them.

James

This revision is now accepted and ready to land. Aug 3 2015, 6:42 AM

sbaranga closed this revision. Aug 3 2015, 7:01 AM

Thanks! Committed in 243875.

rengolin added a subscriber: rengolin. Aug 3 2015, 7:02 AM

rengolin added inline comments.

include/llvm/CodeGen/BasicTTIImpl.h
537	nitpick: can't you cache the T * to avoid re-using the ugly static_cast?

sbaranga added inline comments. Aug 3 2015, 7:05 AM

include/llvm/CodeGen/BasicTTIImpl.h
537	Perhaps, but using the static_casts seems to be the code style here.

Revision Contents

Path

Size

include/

llvm/

CodeGen/

BasicTTIImpl.h

20 lines

Diff 31226

include/llvm/CodeGen/BasicTTIImpl.h

Show First 20 Lines • Show All 528 Lines • ▼ Show 20 Lines	unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,

unsigned NumElts = VT->getNumElements();		unsigned NumElts = VT->getNumElements();
assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");		assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

unsigned NumSubElts = NumElts / Factor;		unsigned NumSubElts = NumElts / Factor;
VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);		VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);

// Firstly, the cost of load/store operation.		// Firstly, the cost of load/store operation.
unsigned Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);		unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
		[Inline comment, rengolin] nitpick: can't you cache the T * to avoid re-using the ugly static_cast?
		[Inline reply, sbaranga (author)] Perhaps, but using the static_casts seems to be the code style here.
		Opcode, VecTy, Alignment, AddressSpace);

// Then plus the cost of interleave operation.		// Then plus the cost of interleave operation.
if (Opcode == Instruction::Load) {		if (Opcode == Instruction::Load) {
// The interleave cost is similar to extract sub vectors' elements		// The interleave cost is similar to extract sub vectors' elements
// from the wide vector, and insert them into sub vectors.		// from the wide vector, and insert them into sub vectors.
//		//
// E.g. An interleaved load of factor 2 (with one member of index 0):		// E.g. An interleaved load of factor 2 (with one member of index 0):
// %vec = load <8 x i32>, <8 x i32>* %ptr		// %vec = load <8 x i32>, <8 x i32>* %ptr
// %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0		// %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
// The cost is estimated as extract elements at 0, 2, 4, 6 from the		// The cost is estimated as extract elements at 0, 2, 4, 6 from the
// <8 x i32> vector and insert them into a <4 x i32> vector.		// <8 x i32> vector and insert them into a <4 x i32> vector.

assert(Indices.size() <= Factor &&		assert(Indices.size() <= Factor &&
"Interleaved memory op has too many members");		"Interleaved memory op has too many members");

for (unsigned Index : Indices) {		for (unsigned Index : Indices) {
assert(Index < Factor && "Invalid index for interleaved memory op");		assert(Index < Factor && "Invalid index for interleaved memory op");

// Extract elements from loaded vector for each sub vector.		// Extract elements from loaded vector for each sub vector.
for (unsigned i = 0; i < NumSubElts; i++)		for (unsigned i = 0; i < NumSubElts; i++)
Cost += getVectorInstrCost(Instruction::ExtractElement, VT,		Cost += static_cast<T *>(this)->getVectorInstrCost(
Index + i * Factor);		Instruction::ExtractElement, VT, Index + i * Factor);
}		}

unsigned InsSubCost = 0;		unsigned InsSubCost = 0;
for (unsigned i = 0; i < NumSubElts; i++)		for (unsigned i = 0; i < NumSubElts; i++)
InsSubCost += getVectorInstrCost(Instruction::InsertElement, SubVT, i);		InsSubCost += static_cast<T *>(this)->getVectorInstrCost(
		Instruction::InsertElement, SubVT, i);

Cost += Indices.size() * InsSubCost;		Cost += Indices.size() * InsSubCost;
} else {		} else {
// The interleave cost is extract all elements from sub vectors, and		// The interleave cost is extract all elements from sub vectors, and
// insert them into the wide vector.		// insert them into the wide vector.
//		//
// E.g. An interleaved store of factor 2:		// E.g. An interleaved store of factor 2:
// %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>		// %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
// store <8 x i32> %interleaved.vec, <8 x i32>* %ptr		// store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
// The cost is estimated as extract all elements from both <4 x i32>		// The cost is estimated as extract all elements from both <4 x i32>
// vectors and insert into the <8 x i32> vector.		// vectors and insert into the <8 x i32> vector.

unsigned ExtSubCost = 0;		unsigned ExtSubCost = 0;
for (unsigned i = 0; i < NumSubElts; i++)		for (unsigned i = 0; i < NumSubElts; i++)
ExtSubCost += getVectorInstrCost(Instruction::ExtractElement, SubVT, i);		ExtSubCost += static_cast<T *>(this)->getVectorInstrCost(
		Instruction::ExtractElement, SubVT, i);
Cost += Factor * ExtSubCost;		Cost += ExtSubCost * Factor;

for (unsigned i = 0; i < NumElts; i++)		for (unsigned i = 0; i < NumElts; i++)
Cost += getVectorInstrCost(Instruction::InsertElement, VT, i);		Cost += static_cast<T *>(this)
		->getVectorInstrCost(Instruction::InsertElement, VT, i);
}		}

return Cost;		return Cost;
}		}

unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,		unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> Tys) {		ArrayRef<Type *> Tys) {
unsigned ISD = 0;		unsigned ISD = 0;
▲ Show 20 Lines • Show All 228 Lines • Show Last 20 Lines