Diff 119460

llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp

Show First 20 Lines • Show All 2,611 Lines • ▼ Show 20 Lines	int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
// Factor (stride) and VectorType=VFxElemType.		// Factor (stride) and VectorType=VFxElemType.
// The Cost accounts only for the shuffle sequence;		// The Cost accounts only for the shuffle sequence;
// The cost of the loads/stores is accounted for separately.		// The cost of the loads/stores is accounted for separately.
//		//
static const CostTblEntry AVX2InterleavedLoadTbl[] = {		static const CostTblEntry AVX2InterleavedLoadTbl[] = {
{ 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8		{ 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
{ 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8		{ 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
{ 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8		{ 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
{ 3, MVT::v16i8, 18}, //(load 48i8 and) deinterleave into 3 x 16i8		{ 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
{ 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8		{ 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8

{ 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8		{ 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
{ 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8		{ 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
{ 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8		{ 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
{ 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8		{ 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
{ 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8		{ 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8
};		};

static const CostTblEntry AVX2InterleavedStoreTbl[] = {		static const CostTblEntry AVX2InterleavedStoreTbl[] = {
{ 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)		{ 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
{ 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)		{ 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
{ 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)		{ 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
{ 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store)		{ 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
{ 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store)		{ 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)

{ 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)		{ 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
{ 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)		{ 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
{ 4, MVT::v8i8, 16 }, //interleave 4 x 8i8 into 32i8 (and store)		{ 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
{ 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store)		{ 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
{ 4, MVT::v32i8, 40 } //interleave 4 x 32i8 into 128i8 (and store)		{ 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
};		};

if (Opcode == Instruction::Load) {		if (Opcode == Instruction::Load) {
if (const auto *Entry =		if (const auto *Entry =
CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))		CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
return NumOfMemOps * MemOpCost + Entry->Cost;		return NumOfMemOps * MemOpCost + Entry->Cost;
} else {		} else {
assert(Opcode == Instruction::Store &&		assert(Opcode == Instruction::Store &&
Show All 29 Lines	int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;		unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

// Get the cost of one memory operation.		// Get the cost of one memory operation.
Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),		Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
LegalVT.getVectorNumElements());		LegalVT.getVectorNumElements());
unsigned MemOpCost =		unsigned MemOpCost =
getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);		getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);

		unsigned VF = VecTy->getVectorNumElements() / Factor;
		MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);

if (Opcode == Instruction::Load) {		if (Opcode == Instruction::Load) {
		// The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
		// contain the cost of the optimized shuffle sequence that the
		// X86InterleavedAccess pass will generate.
		// The costs of the loads and stores are computed separately from the table.

		// X86InterleavedAccess supports only the following interleaved-access groups.
		static const CostTblEntry AVX512InterleavedLoadTbl[] = {
		{3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
		{3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
		{3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
		};

		if (const auto *Entry =
		CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
		return NumOfMemOps * MemOpCost + Entry->Cost;
		// If an entry does not exist, fall back to the default implementation.

// Kind of shuffle depends on number of loaded values.		// Kind of shuffle depends on number of loaded values.
// If we load the entire data in one register, we can use a 1-src shuffle.		// If we load the entire data in one register, we can use a 1-src shuffle.
// Otherwise, we'll merge 2 sources in each operation.		// Otherwise, we'll merge 2 sources in each operation.
TTI::ShuffleKind ShuffleKind =		TTI::ShuffleKind ShuffleKind =
(NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;		(NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

unsigned ShuffleCost =		unsigned ShuffleCost =
getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);		getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
Show All 26 Lines	int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
NumOfUnfoldedLoads * MemOpCost + NumOfMoves;		NumOfUnfoldedLoads * MemOpCost + NumOfMoves;

return Cost;		return Cost;
}		}

// Store.		// Store.
assert(Opcode == Instruction::Store &&		assert(Opcode == Instruction::Store &&
"Expected Store Instruction at this point");		"Expected Store Instruction at this point");
		// X86InterleavedAccess supports only the following interleaved-access groups.
		static const CostTblEntry AVX512InterleavedStoreTbl[] = {
		{3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
		{3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
		{3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)

		{4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
		{4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
		{4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
		{4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
		};

		if (const auto *Entry =
		CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
		return NumOfMemOps * MemOpCost + Entry->Cost;
		// If an entry does not exist, fall back to the default implementation.

// There are no strided stores at the moment, and a store can't be folded		// There are no strided stores at the moment, and a store can't be folded
// into a shuffle.		// into a shuffle.
unsigned NumOfSources = Factor; // The number of values to be merged.		unsigned NumOfSources = Factor; // The number of values to be merged.
unsigned ShuffleCost =		unsigned ShuffleCost =
getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);		getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
unsigned NumOfShufflesPerStore = NumOfSources - 1;		unsigned NumOfShufflesPerStore = NumOfSources - 1;

Show All 37 Lines

llvm/trunk/test/Analysis/CostModel/X86/interleaved-load-i8.ll

	; REQUIRES: asserts			; REQUIRES: asserts
	; RUN: opt -loop-vectorize -S -mcpu=core-avx2 --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 \| FileCheck %s			; RUN: opt -loop-vectorize -S -mcpu=core-avx2 --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 \| FileCheck %s

	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "x86_64-unknown-linux-gnu"			target triple = "x86_64-unknown-linux-gnu"

	; Function Attrs: norecurse nounwind readonly uwtable			; Function Attrs: norecurse nounwind readonly uwtable
	define i32 @doit_stride3(i8* nocapture readonly %Ptr, i32 %Nels) {			define i32 @doit_stride3(i8* nocapture readonly %Ptr, i32 %Nels) {
	;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8			;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8
	;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction: %0 = load i8			;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction: %0 = load i8
	;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8			;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8
	;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction: %0 = load i8			;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction: %0 = load i8
	;CHECK: LV: Found an estimated cost of 20 for VF 16 For instruction: %0 = load i8			;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction: %0 = load i8
	;CHECK: LV: Found an estimated cost of 45 for VF 32 For instruction: %0 = load i8			;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction: %0 = load i8
	entry:			entry:
	%cmp13 = icmp sgt i32 %Nels, 0			%cmp13 = icmp sgt i32 %Nels, 0
	br i1 %cmp13, label %for.body.preheader, label %for.end			br i1 %cmp13, label %for.body.preheader, label %for.end

	for.body.preheader:			for.body.preheader:
	br label %for.body			br label %for.body

	for.body:			for.body:
	▲ Show 20 Lines • Show All 76 Lines • Show Last 20 Lines

llvm/trunk/test/Analysis/CostModel/X86/interleaved-store-i8.ll

; REQUIRES: asserts		; REQUIRES: asserts
; RUN: opt -loop-vectorize -S -mcpu=core-avx2 --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 \| FileCheck %s		; RUN: opt -loop-vectorize -S -mcpu=core-avx2 --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 \| FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"		target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"		target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: norecurse nounwind uwtable		; Function Attrs: norecurse nounwind uwtable
define void @doit_stride3(i8* nocapture %Ptr, i32 %Nels) local_unnamed_addr {		define void @doit_stride3(i8* nocapture %Ptr, i32 %Nels) local_unnamed_addr {
;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv4		;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv4
;CHECK: LV: Found an estimated cost of 8 for VF 2 For instruction: store i8 %conv4		;CHECK: LV: Found an estimated cost of 8 for VF 2 For instruction: store i8 %conv4
;CHECK: LV: Found an estimated cost of 9 for VF 4 For instruction: store i8 %conv4		;CHECK: LV: Found an estimated cost of 9 for VF 4 For instruction: store i8 %conv4
;CHECK: LV: Found an estimated cost of 12 for VF 8 For instruction: store i8 %conv4		;CHECK: LV: Found an estimated cost of 12 for VF 8 For instruction: store i8 %conv4
;CHECK: LV: Found an estimated cost of 19 for VF 16 For instruction: store i8 %conv4		;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction: store i8 %conv4
;CHECK: LV: Found an estimated cost of 35 for VF 32 For instruction: store i8 %conv4		;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %conv4
entry:		entry:
%cmp14 = icmp sgt i32 %Nels, 0		%cmp14 = icmp sgt i32 %Nels, 0
br i1 %cmp14, label %for.body.lr.ph, label %for.end		br i1 %cmp14, label %for.body.lr.ph, label %for.end

for.body.lr.ph:		for.body.lr.ph:
%conv = trunc i32 %Nels to i8		%conv = trunc i32 %Nels to i8
%conv1 = shl i8 %conv, 1		%conv1 = shl i8 %conv, 1
%conv4 = shl i8 %conv, 2		%conv4 = shl i8 %conv, 2
Show All 19 Lines	for.end:
ret void		ret void
}		}

; Function Attrs: norecurse nounwind uwtable		; Function Attrs: norecurse nounwind uwtable
define void @doit_stride4(i8* nocapture %Ptr, i32 %Nels) local_unnamed_addr {		define void @doit_stride4(i8* nocapture %Ptr, i32 %Nels) local_unnamed_addr {
;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv7		;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv7
;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction: store i8 %conv7		;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction: store i8 %conv7
;CHECK: LV: Found an estimated cost of 10 for VF 4 For instruction: store i8 %conv7		;CHECK: LV: Found an estimated cost of 10 for VF 4 For instruction: store i8 %conv7
;CHECK: LV: Found an estimated cost of 17 for VF 8 For instruction: store i8 %conv7		;CHECK: LV: Found an estimated cost of 11 for VF 8 For instruction: store i8 %conv7
;CHECK: LV: Found an estimated cost of 22 for VF 16 For instruction: store i8 %conv7		;CHECK: LV: Found an estimated cost of 12 for VF 16 For instruction: store i8 %conv7
;CHECK: LV: Found an estimated cost of 44 for VF 32 For instruction: store i8 %conv7		;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %conv7
entry:		entry:
%cmp19 = icmp sgt i32 %Nels, 0		%cmp19 = icmp sgt i32 %Nels, 0
br i1 %cmp19, label %for.body.lr.ph, label %for.end		br i1 %cmp19, label %for.body.lr.ph, label %for.end

for.body.lr.ph:		for.body.lr.ph:
%conv = trunc i32 %Nels to i8		%conv = trunc i32 %Nels to i8
%conv1 = shl i8 %conv, 1		%conv1 = shl i8 %conv, 1
%conv4 = shl i8 %conv, 2		%conv4 = shl i8 %conv, 2
Show All 25 Lines

llvm/trunk/test/Analysis/CostModel/X86/strided-load-i8.ll

	Show All 35 Lines
	}			}

	define void @load_i8_stride3() {			define void @load_i8_stride3() {
	;CHECK-LABEL: load_i8_stride3			;CHECK-LABEL: load_i8_stride3
	;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load			;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
	;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load			;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
	;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load			;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
	;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load			;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
	;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load			;CHECK: Found an estimated cost of 13 for VF 16 For instruction: %1 = load
	;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load			;CHECK: Found an estimated cost of 16 for VF 32 For instruction: %1 = load
	;CHECK: Found an estimated cost of 39 for VF 64 For instruction: %1 = load			;CHECK: Found an estimated cost of 25 for VF 64 For instruction: %1 = load
	entry:			entry:
	br label %for.body			br label %for.body

	for.body: ; preds = %for.body, %entry			for.body: ; preds = %for.body, %entry
	%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]			%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
	%0 = mul nsw i64 %indvars.iv, 3			%0 = mul nsw i64 %indvars.iv, 3
	%arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0			%arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
	%1 = load i8, i8* %arrayidx, align 2			%1 = load i8, i8* %arrayidx, align 2
	▲ Show 20 Lines • Show All 63 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

Cost calculation for interleave load/store patterns {v8i8,v16i8,v32i8,v64i8}
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 119460

llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp

llvm/trunk/test/Analysis/CostModel/X86/interleaved-load-i8.ll

llvm/trunk/test/Analysis/CostModel/X86/interleaved-store-i8.ll

llvm/trunk/test/Analysis/CostModel/X86/strided-load-i8.ll

This is an archive of the discontinued LLVM Phabricator instance.

Cost calculation for interleave load/store patterns {v8i8,v16i8,v32i8,v64i8}
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 119460

llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp

llvm/trunk/test/Analysis/CostModel/X86/interleaved-load-i8.ll

llvm/trunk/test/Analysis/CostModel/X86/interleaved-store-i8.ll

llvm/trunk/test/Analysis/CostModel/X86/strided-load-i8.ll

Cost calculation for interleave load/store patterns {v8i8,v16i8,v32i8,v64i8}
ClosedPublic