Diff 348329

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show All 16 Lines
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/SLPVectorizer.h"		#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"		#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"		#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Optional.h"		#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"		#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"		#include "llvm/ADT/STLExtras.h"
		#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"		#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"		#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"		#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"		#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"		#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"		#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"		#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"		#include "llvm/ADT/iterator_range.h"
▲ Show 20 Lines • Show All 3,651 Lines • ▼ Show 20 Lines	for (const auto &Data : ExtractVectorsTys) {
Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,		Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
VecTy, None, 0, EEVTy);		VecTy, None, 0, EEVTy);
}		}
}		}
};		};
if (E->State == TreeEntry::NeedToGather) {		if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))		if (allConstant(VL))
return 0;		return 0;
if (isSplat(VL)) {
return ReuseShuffleCost +
TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
0);
}
if (isa<InsertElementInst>(VL[0]))		if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();		return InstructionCost::getInvalid();
if (E->getOpcode() == Instruction::ExtractElement &&
allSameType(VL) && allSameBlock(VL)) {
SmallVector<int> Mask;
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
isShuffle(VL, Mask);
if (ShuffleKind.hasValue()) {
InstructionCost Cost =
computeExtractCost(VL, VecTy, ShuffleKind, Mask, TTI);
AdjustExtractsCost(Cost, /IsGather=/true);
return ReuseShuffleCost + Cost;
}
}
InstructionCost GatherCost = 0;
SmallVector<int> Mask;		SmallVector<int> Mask;
SmallVector<const TreeEntry *> Entries;		SmallVector<const TreeEntry *> Entries;
Optional<TargetTransformInfo::ShuffleKind> Shuffle =		Optional<TargetTransformInfo::ShuffleKind> Shuffle =
isGatherShuffledEntry(E, Mask, Entries);		isGatherShuffledEntry(E, Mask, Entries);
if (Shuffle.hasValue()) {		if (Shuffle.hasValue()) {
		InstructionCost GatherCost = 0;
if (ShuffleVectorInst::isIdentityMask(Mask)) {		if (ShuffleVectorInst::isIdentityMask(Mask)) {
		// Perfect match in the graph, will reuse the previously vectorized
		// node. Cost is 0.
LLVM_DEBUG(		LLVM_DEBUG(
dbgs()		dbgs()
<< "SLP: perfect diamond match for gather bundle that starts with "		<< "SLP: perfect diamond match for gather bundle that starts with "
<< *VL.front() << ".\n");		<< *VL.front() << ".\n");
} else {		} else {
LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()		LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
<< " entries for bundle that starts with "		<< " entries for bundle that starts with "
<< *VL.front() << ".\n");		<< *VL.front() << ".\n");
		// Detected that instead of gather we can emit a shuffle of single/two
		// previously vectorized nodes. Add the cost of the permutation rather
		// than gather.
GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask);		GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask);
}		}
} else {
GatherCost = getGatherCost(VL);
}
return ReuseShuffleCost + GatherCost;		return ReuseShuffleCost + GatherCost;
}		}
		if (isSplat(VL)) {
		echristo [Not Done]: This needs to be documented as to what you're doing and what it means in the code.
		// Found a broadcast of a single scalar, calculate the cost as a
		// broadcast.
		return ReuseShuffleCost +
		TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
		0);
		}
		if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) &&
		allSameBlock(VL)) {
		// Check that gather of extractelements can be represented as just a
		// shuffle of a single/two vectors the scalars are extracted from.
		SmallVector<int> Mask;
		Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
		isShuffle(VL, Mask);
		if (ShuffleKind.hasValue()) {
		// Found a bunch of extractelement instructions that must be gathered
		// into a vector and can be represented as a permutation of elements in
		// a single input vector or in 2 input vectors.
		InstructionCost Cost =
		computeExtractCost(VL, VecTy, ShuffleKind, Mask, TTI);
		AdjustExtractsCost(Cost, /IsGather=/true);
		return ReuseShuffleCost + Cost;
		}
		}
		return ReuseShuffleCost + getGatherCost(VL);
		echristo [Not Done]: Same :)
		}
assert((E->State == TreeEntry::Vectorize \|\|		assert((E->State == TreeEntry::Vectorize \|\|
E->State == TreeEntry::ScatterVectorize) &&		E->State == TreeEntry::ScatterVectorize) &&
"Unhandled state");		"Unhandled state");
assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");		assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = E->getMainOp();		Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =		unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();		E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
switch (ShuffleOrOp) {		switch (ShuffleOrOp) {
▲ Show 20 Lines • Show All 671 Lines • ▼ Show 20 Lines	if (EU.User && isa<InsertElementInst>(EU.User)) {
continue;		continue;
Value *VU = EU.User;		Value *VU = EU.User;
auto It = find_if(FirstUsers, [VU](Value V) {		auto It = find_if(FirstUsers, [VU](Value V) {
// Checks if 2 insertelements are from the same buildvector.		// Checks if 2 insertelements are from the same buildvector.
if (VU->getType() != V->getType())		if (VU->getType() != V->getType())
return false;		return false;
auto *IE1 = cast<InsertElementInst>(VU);		auto *IE1 = cast<InsertElementInst>(VU);
auto *IE2 = cast<InsertElementInst>(V);		auto *IE2 = cast<InsertElementInst>(V);
		// Go through the insertelement instructions trying to find either VU as
		// the original vector for IE2 or V as the original vector for IE1.
do {		do {
if (IE1 == VU \|\| IE2 == V)		if (IE1 == VU \|\| IE2 == V)
return true;		return true;
if (IE1)		if (IE1)
IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));		IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
if (IE2)		if (IE2)
IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));		IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
} while (IE1 \|\| IE2);		} while (IE1 \|\| IE2);
▲ Show 20 Lines • Show All 86 Lines • ▼ Show 20 Lines
#endif		#endif

return Cost;		return Cost;
}		}

Optional<TargetTransformInfo::ShuffleKind>		Optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,		BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
SmallVectorImpl<const TreeEntry *> &Entries) {		SmallVectorImpl<const TreeEntry *> &Entries) {
		// TODO: currently checking only for Scalars in the tree entry, need to count
		// reused elements too for better cost estimation.
Mask.assign(TE->Scalars.size(), UndefMaskElem);		Mask.assign(TE->Scalars.size(), UndefMaskElem);
Entries.clear();		Entries.clear();
DenseMap<Value , const TreeEntry > UsedValuesEntry;		// Build a list of tree entries for each value.
		DenseMap<Value , SmallPtrSet<const TreeEntry , 4>> ValueToTEs;
		for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
		if (EntryPtr.get() == TE)
		break;
		if (EntryPtr->State != TreeEntry::NeedToGather)
		continue;
		for (Value *V : EntryPtr->Scalars)
		ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
		}
		// Find all tree entries used by the gathered values. If no common entries
		// found - not a shuffle.
		// Here we build a set of tree nodes for each gathered value and try to
		echristo [Not Done]: Please document this :)
		// find the intersection between these sets. If we have at least one common
		// tree node for each gathered value - we have just a permutation of a
		// single vector. If we have 2 different sets, we are in a situation where
		// we have a permutation of 2 input vectors.
		SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
		DenseMap<Value *, int> UsedValuesEntry;
		for (Value *V : TE->Scalars) {
		if (isa<UndefValue>(V))
		continue;
		// Build a list of tree entries where V is used.
		SmallPtrSet<const TreeEntry *, 4> VToTEs;
		auto It = ValueToTEs.find(V);
		if (It != ValueToTEs.end())
		VToTEs = It->second;
		if (const TreeEntry *VTE = getTreeEntry(V))
		VToTEs.insert(VTE);
		if (VToTEs.empty())
		return None;
		if (UsedTEs.empty()) {
		// The first iteration, just insert the list of nodes to vector.
		UsedTEs.push_back(VToTEs);
		} else {
		// Need to check if there are any previously used tree nodes which use V.
		// If there are no such nodes, assume that we have one more input
		// vector.
		SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
		unsigned Idx = 0;
		for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
		// Do we have a non-empty intersection of previously listed tree entries
		// and tree entries using current V?
		set_intersect(VToTEs, Set);
		if (!VToTEs.empty()) {
		// Yes, write the new subset and continue analysis for the next
		// scalar.
		Set.swap(VToTEs);
		break;
		}
		VToTEs = SavedVToTEs;
		++Idx;
		}
		// No non-empty intersection found - need to add a second set of possible
		// source vectors.
		if (Idx == UsedTEs.size()) {
		// If the number of input vectors is greater than 2 - not a permutation,
		// fall back to the regular gather.
		if (UsedTEs.size() == 2)
		return None;
		UsedTEs.push_back(SavedVToTEs);
		Idx = UsedTEs.size() - 1;
		}
		UsedValuesEntry.try_emplace(V, Idx);
		}
		}

unsigned VF = 0;		unsigned VF = 0;
		if (UsedTEs.size() == 1) {
		// Try to find the perfect match in another gather node at first.
		auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) {
		return EntryPtr->isSame(TE->Scalars);
		});
		if (It != UsedTEs.front().end()) {
		Entries.push_back(*It);
		std::iota(Mask.begin(), Mask.end(), 0);
		return TargetTransformInfo::SK_PermuteSingleSrc;
		}
		// No perfect match, just shuffle, so choose the first tree node.
		Entries.push_back(*UsedTEs.front().begin());
		} else {
		// Try to find nodes with the same vector factor.
		assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
// FIXME: Shall be replaced by GetVF function once non-power-2 patch is		// FIXME: Shall be replaced by GetVF function once non-power-2 patch is
// landed.		// landed.
auto &&GetVF = [](const TreeEntry *TE) {		auto &&GetVF = [](const TreeEntry *TE) {
if (!TE->ReuseShuffleIndices.empty())		if (!TE->ReuseShuffleIndices.empty())
return TE->ReuseShuffleIndices.size();		return TE->ReuseShuffleIndices.size();
return TE->Scalars.size();		return TE->Scalars.size();
};		};
		DenseMap<int, const TreeEntry *> VFToTE;
		for (const TreeEntry *TE : UsedTEs.front())
		VFToTE.try_emplace(GetVF(TE), TE);
		for (const TreeEntry *TE : UsedTEs.back()) {
		auto It = VFToTE.find(GetVF(TE));
		if (It != VFToTE.end()) {
		VF = It->first;
		Entries.push_back(It->second);
		Entries.push_back(TE);
		break;
		}
		}
		// No 2 source vectors with the same vector factor - give up and do regular
		// gather.
		if (Entries.empty())
		return None;
		}

		// Build a shuffle mask for better cost estimation and vector emission.
for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {		for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
Value *V = TE->Scalars[I];		Value *V = TE->Scalars[I];
if (isa<UndefValue>(V))		if (isa<UndefValue>(V))
continue;		continue;
const TreeEntry *VTE = UsedValuesEntry.lookup(V);		unsigned Idx = UsedValuesEntry.lookup(V);
if (!VTE) {		const TreeEntry *VTE = Entries[Idx];
if (Entries.size() == 2)
return None;
VTE = getTreeEntry(V);
if (!VTE \|\| find_if(
VectorizableTree,
[VTE, TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
return EntryPtr.get() == VTE \|\| EntryPtr.get() == TE;
})->get() == TE) {
// Check if it is used in one of the gathered entries.
const auto *It =
find_if(VectorizableTree,
[V, TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
return EntryPtr.get() == TE \|\|
(EntryPtr->State == TreeEntry::NeedToGather &&
is_contained(EntryPtr->Scalars, V));
});
// The vector factor of shuffled entries must be the same.
if (It->get() == TE)
return None;
VTE = It->get();
}
Entries.push_back(VTE);
if (Entries.size() == 1) {
VF = GetVF(VTE);
} else if (VF != GetVF(VTE)) {
assert(Entries.size() == 2 && "Expected shuffle of 1 or 2 entries.");
assert(VF > 0 && "Expected non-zero vector factor.");
return None;
}
for (Value *SV : VTE->Scalars)
UsedValuesEntry.try_emplace(SV, VTE);
}
int FoundLane = findLaneForValue(VTE->Scalars, VTE->ReuseShuffleIndices, V);		int FoundLane = findLaneForValue(VTE->Scalars, VTE->ReuseShuffleIndices, V);
Mask[I] = (Entries.front() == VTE ? 0 : VF) + FoundLane;		Mask[I] = Idx * VF + FoundLane;
// Extra check required by isSingleSourceMaskImpl function (called by		// Extra check required by isSingleSourceMaskImpl function (called by
// ShuffleVectorInst::isSingleSourceMask).		// ShuffleVectorInst::isSingleSourceMask).
if (Mask[I] >= 2 * E)		if (Mask[I] >= 2 * E)
return None;		return None;
}		}
switch (Entries.size()) {		switch (Entries.size()) {
case 1:		case 1:
return TargetTransformInfo::SK_PermuteSingleSrc;		return TargetTransformInfo::SK_PermuteSingleSrc;
▲ Show 20 Lines • Show All 3,723 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll

Show All 14 Lines	;
%ins2 = insertelement <2 x i8> %ins1, i8 %y1y1, i32 1		%ins2 = insertelement <2 x i8> %ins1, i8 %y1y1, i32 1
ret <2 x i8> %ins2		ret <2 x i8> %ins2
}		}

define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {		define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @h(		; CHECK-LABEL: @h(
; CHECK-NEXT: [[TMP1:%.]] = shufflevector <4 x i8> [[X:%.]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>		; CHECK-NEXT: [[TMP1:%.]] = shufflevector <4 x i8> [[X:%.]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>
; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]		; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
; CHECK-NEXT: ret <4 x i8> [[TMP2]]		; CHECK-NEXT: ret <4 x i8> [[TMP2]]
;		;
%x0 = extractelement <4 x i8> %x, i32 0		%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3		%x3 = extractelement <4 x i8> %x, i32 3
%y1 = extractelement <4 x i8> %y, i32 1		%y1 = extractelement <4 x i8> %y, i32 1
%y2 = extractelement <4 x i8> %y, i32 2		%y2 = extractelement <4 x i8> %y, i32 2
%x0x0 = mul i8 %x0, %x0		%x0x0 = mul i8 %x0, %x0
%x3x3 = mul i8 %x3, %x3		%x3x3 = mul i8 %x3, %x3
%y1y1 = mul i8 %y1, %y1		%y1y1 = mul i8 %y1, %y1
%y2y2 = mul i8 %y2, %y2		%y2y2 = mul i8 %y2, %y2
%ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0		%ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
%ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1		%ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
%ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2		%ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
%ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3		%ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
ret <4 x i8> %ins4		ret <4 x i8> %ins4
}		}

define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {		define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @h_undef(		; CHECK-LABEL: @h_undef(
; CHECK-NEXT: [[TMP1:%.]] = shufflevector <4 x i8> [[X:%.]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 undef, i32 3, i32 5, i32 6>		; CHECK-NEXT: [[TMP1:%.]] = shufflevector <4 x i8> [[X:%.]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 undef, i32 3, i32 5, i32 6>
; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]		; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
; CHECK-NEXT: ret <4 x i8> [[TMP2]]		; CHECK-NEXT: ret <4 x i8> [[TMP2]]
;		;
		ABataev (author) [Done]: The regression is caused by an incorrect cost model. It returns a cost of 12 for mul <4 x i8>, which was compensated by the fact that we calculated the cost of the gather of extractelement instructions twice (its cost is -3). After this patch we correctly calculate the cost of the gather node only once (-3 for the first gather and 0 for the second one, a perfect diamond match). llvm-mca reports the cost of the code as 2 or 4 (4 for the normalized mul <16 x i8>, 2 for the original code). The cost model needs to be tweaked.
		RKSimon [Not Done]: I'll handle this regression - I'm reviewing the cost tables against llvm-mca reports at the moment.
%x0 = extractelement <4 x i8> undef, i32 0		%x0 = extractelement <4 x i8> undef, i32 0
%x3 = extractelement <4 x i8> %x, i32 3		%x3 = extractelement <4 x i8> %x, i32 3
%y1 = extractelement <4 x i8> %y, i32 1		%y1 = extractelement <4 x i8> %y, i32 1
%y2 = extractelement <4 x i8> %y, i32 2		%y2 = extractelement <4 x i8> %y, i32 2
%x0x0 = mul i8 %x0, %x0		%x0x0 = mul i8 %x0, %x0
%x3x3 = mul i8 %x3, %x3		%x3x3 = mul i8 %x3, %x3
%y1y1 = mul i8 %y1, %y1		%y1y1 = mul i8 %y1, %y1
%y2y2 = mul i8 %y2, %y2		%y2y2 = mul i8 %y2, %y2
Show All 22 Lines	;
%1 = add i8 %x0x0, %x3x3		%1 = add i8 %x0x0, %x3x3
%2 = add i8 %y1y1, %y2y2		%2 = add i8 %y1y1, %y2y2
%3 = add i8 %1, %2		%3 = add i8 %1, %2
ret i8 %3		ret i8 %3
}		}

define i8 @j(<4 x i8> %x, <4 x i8> %y) {		define i8 @j(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @j(		; CHECK-LABEL: @j(
; CHECK-NEXT: [[TMP1:%.]] = shufflevector <4 x i8> [[X:%.]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 5>		; CHECK-NEXT: [[X0:%.]] = extractelement <4 x i8> [[X:%.]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]]		; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>		; CHECK-NEXT: [[Y1:%.]] = extractelement <4 x i8> [[Y:%.]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]		; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]		; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0		; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1		; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]		; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
; CHECK-NEXT: ret i8 [[TMP8]]		; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
		; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
		; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
		; CHECK-NEXT: ret i8 [[TMP3]]
;		;
%x0 = extractelement <4 x i8> %x, i32 0		%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3		%x3 = extractelement <4 x i8> %x, i32 3
%y1 = extractelement <4 x i8> %y, i32 1		%y1 = extractelement <4 x i8> %y, i32 1
%y2 = extractelement <4 x i8> %y, i32 2		%y2 = extractelement <4 x i8> %y, i32 2
%x0x0 = mul i8 %x0, %x0		%x0x0 = mul i8 %x0, %x0
%x3x3 = mul i8 %x3, %x3		%x3x3 = mul i8 %x3, %x3
%y1y1 = mul i8 %y1, %y1		%y1y1 = mul i8 %y1, %y1
%y2y2 = mul i8 %y2, %y2		%y2y2 = mul i8 %y2, %y2
%1 = add i8 %x0x0, %x3x3		%1 = add i8 %x0x0, %x3x3
%2 = add i8 %y1y1, %y2y2		%2 = add i8 %y1y1, %y2y2
%3 = sdiv i8 %1, %2		%3 = sdiv i8 %1, %2
ret i8 %3		ret i8 %3
}		}

define i8 @k(<4 x i8> %x) {		define i8 @k(<4 x i8> %x) {
; CHECK-LABEL: @k(		; CHECK-LABEL: @k(
; CHECK-NEXT: [[TMP1:%.]] = mul <4 x i8> [[X:%.]], [[X]]		; CHECK-NEXT: [[X0:%.]] = extractelement <4 x i8> [[X:%.]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>		; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]		; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>		; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]		; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0		; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1		; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]		; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
; CHECK-NEXT: ret i8 [[TMP8]]		; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
		; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
		; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
		; CHECK-NEXT: ret i8 [[TMP3]]
;		;
%x0 = extractelement <4 x i8> %x, i32 0		%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3		%x3 = extractelement <4 x i8> %x, i32 3
%x1 = extractelement <4 x i8> %x, i32 1		%x1 = extractelement <4 x i8> %x, i32 1
%x2 = extractelement <4 x i8> %x, i32 2		%x2 = extractelement <4 x i8> %x, i32 2
%x0x0 = mul i8 %x0, %x0		%x0x0 = mul i8 %x0, %x0
%x3x3 = mul i8 %x3, %x3		%x3x3 = mul i8 %x3, %x3
%x1x1 = mul i8 %x1, %x1		%x1x1 = mul i8 %x1, %x1
Show All 39 Lines

llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll

Show First 20 Lines • Show All 75 Lines • ▼ Show 20 Lines	;
%1 = add i8 %x0x0, %x3x3		%1 = add i8 %x0x0, %x3x3
%2 = add i8 %y1y1, %y2y2		%2 = add i8 %y1y1, %y2y2
%3 = add i8 %1, %2		%3 = add i8 %1, %2
ret i8 %3		ret i8 %3
}		}

define i8 @j(<4 x i8> %x, <4 x i8> %y) {		define i8 @j(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @j(		; CHECK-LABEL: @j(
; CHECK-NEXT: [[TMP1:%.]] = shufflevector <4 x i8> [[X:%.]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 5>		; CHECK-NEXT: [[X0:%.]] = extractelement <4 x i8> [[X:%.]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]]		; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>		; CHECK-NEXT: [[Y1:%.]] = extractelement <4 x i8> [[Y:%.]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]		; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]		; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0		; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1		; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]		; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
; CHECK-NEXT: ret i8 [[TMP8]]		; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
		; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
		; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
		; CHECK-NEXT: ret i8 [[TMP3]]
;		;
%x0 = extractelement <4 x i8> %x, i32 0		%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3		%x3 = extractelement <4 x i8> %x, i32 3
%y1 = extractelement <4 x i8> %y, i32 1		%y1 = extractelement <4 x i8> %y, i32 1
%y2 = extractelement <4 x i8> %y, i32 2		%y2 = extractelement <4 x i8> %y, i32 2
%x0x0 = mul i8 %x0, %x0		%x0x0 = mul i8 %x0, %x0
%x3x3 = mul i8 %x3, %x3		%x3x3 = mul i8 %x3, %x3
%y1y1 = mul i8 %y1, %y1		%y1y1 = mul i8 %y1, %y1
%y2y2 = mul i8 %y2, %y2		%y2y2 = mul i8 %y2, %y2
%1 = add i8 %x0x0, %x3x3		%1 = add i8 %x0x0, %x3x3
%2 = add i8 %y1y1, %y2y2		%2 = add i8 %y1y1, %y2y2
%3 = sdiv i8 %1, %2		%3 = sdiv i8 %1, %2
ret i8 %3		ret i8 %3
}		}

define i8 @k(<4 x i8> %x) {		define i8 @k(<4 x i8> %x) {
; CHECK-LABEL: @k(		; CHECK-LABEL: @k(
; CHECK-NEXT: [[TMP1:%.]] = mul <4 x i8> [[X:%.]], [[X]]		; CHECK-NEXT: [[X0:%.]] = extractelement <4 x i8> [[X:%.]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>		; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]		; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>		; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]		; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0		; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1		; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]		; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
; CHECK-NEXT: ret i8 [[TMP8]]		; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
		; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
		; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
		; CHECK-NEXT: ret i8 [[TMP3]]
;		;
		ABataev (author) [Done]: The regressions are caused by the incorrect cost of `mul <2 x i8>`. Per the mca tool the cost is `2`, but the cost model reports `3`.
		RKSimon [Not Done]: Are we counting the costs of the v4i8 mul twice here?
		ABataev (author) [Done]: Yes, but it is the cost of the v2i8 mul. It is extended to a v4i8 mul by instcombine.
		RKSimon [Not Done]: The cost is trickier than that, as the cost tables aren't usually CPU-specific - the worst case for a v2i8 multiply is at least 4, so that's what the cost table reports. Incidentally, this test is for bdver2 (cost = 3.5). But tbh, I wouldn't worry too much about this scalarization.
%x0 = extractelement <4 x i8> %x, i32 0		%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3		%x3 = extractelement <4 x i8> %x, i32 3
%x1 = extractelement <4 x i8> %x, i32 1		%x1 = extractelement <4 x i8> %x, i32 1
%x2 = extractelement <4 x i8> %x, i32 2		%x2 = extractelement <4 x i8> %x, i32 2
%x0x0 = mul i8 %x0, %x0		%x0x0 = mul i8 %x0, %x0
%x3x3 = mul i8 %x3, %x3		%x3x3 = mul i8 %x3, %x3
%x1x1 = mul i8 %x1, %x1		%x1x1 = mul i8 %x1, %x1
%x2x2 = mul i8 %x2, %x2		%x2x2 = mul i8 %x2, %x2
Show All 38 Lines

llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-1 \| FileCheck %s			; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-1 \| FileCheck %s

	define i32 @diamond_broadcast(i32* noalias nocapture %B, i32* noalias nocapture %A) {			define i32 @diamond_broadcast(i32* noalias nocapture %B, i32* noalias nocapture %A) {
	; CHECK-LABEL: @diamond_broadcast(			; CHECK-LABEL: @diamond_broadcast(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[LD:%.]] = load i32, i32 [[A:%.*]], align 4			; CHECK-NEXT: [[LD:%.]] = load i32, i32 [[A:%.*]], align 4
	; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[LD]], [[LD]]			; CHECK-NEXT: [[ARRAYIDX9:%.]] = getelementptr inbounds i32, i32 [[B:%.*]], i64 1
	; CHECK-NEXT: store i32 [[MUL]], i32* [[B:%.*]], align 4
	; CHECK-NEXT: [[MUL8:%.*]] = mul i32 [[LD]], [[LD]]
	; CHECK-NEXT: [[ARRAYIDX9:%.]] = getelementptr inbounds i32, i32 [[B]], i64 1
	; CHECK-NEXT: store i32 [[MUL8]], i32* [[ARRAYIDX9]], align 4
	; CHECK-NEXT: [[MUL14:%.*]] = mul i32 [[LD]], [[LD]]
	; CHECK-NEXT: [[ARRAYIDX15:%.]] = getelementptr inbounds i32, i32 [[B]], i64 2			; CHECK-NEXT: [[ARRAYIDX15:%.]] = getelementptr inbounds i32, i32 [[B]], i64 2
	; CHECK-NEXT: store i32 [[MUL14]], i32* [[ARRAYIDX15]], align 4			; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0
	; CHECK-NEXT: [[MUL20:%.*]] = mul i32 [[LD]], [[LD]]			; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[LD]], i32 1
				; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[LD]], i32 2
				; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[LD]], i32 3
				; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[TMP3]], [[TMP3]]
	; CHECK-NEXT: [[ARRAYIDX21:%.]] = getelementptr inbounds i32, i32 [[B]], i64 3			; CHECK-NEXT: [[ARRAYIDX21:%.]] = getelementptr inbounds i32, i32 [[B]], i64 3
	; CHECK-NEXT: store i32 [[MUL20]], i32* [[ARRAYIDX21]], align 4			; CHECK-NEXT: [[TMP5:%.]] = bitcast i32 [[B]] to <4 x i32>*
				; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
	; CHECK-NEXT: ret i32 0			; CHECK-NEXT: ret i32 0
	;			;
	entry:			entry:
	%ld = load i32, i32* %A, align 4			%ld = load i32, i32* %A, align 4
	%mul = mul i32 %ld, %ld			%mul = mul i32 %ld, %ld
	store i32 %mul, i32* %B, align 4			store i32 %mul, i32* %B, align 4
	%mul8 = mul i32 %ld, %ld			%mul8 = mul i32 %ld, %ld
	%arrayidx9 = getelementptr inbounds i32, i32* %B, i64 1			%arrayidx9 = getelementptr inbounds i32, i32* %B, i64 1
	Show All 11 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Better detection of perfect/shuffles matches for gather nodes.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 348329

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll

llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll

llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Better detection of perfect/shuffles matches for gather nodes.ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 348329

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll

llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll

llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll

[SLP]Better detection of perfect/shuffles matches for gather nodes.
ClosedPublic