Diff 346236

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,134 Lines • ▼ Show 20 Lines	if (VectorizableTree.size() == 1 &&
VectorizableTree[0]->State == TreeEntry::Vectorize)		VectorizableTree[0]->State == TreeEntry::Vectorize)
return true;		return true;

if (VectorizableTree.size() != 2)		if (VectorizableTree.size() != 2)
return false;		return false;

// Handle splat and all-constants stores. Also try to vectorize tiny trees		// Handle splat and all-constants stores. Also try to vectorize tiny trees
// with the second gather nodes if they have less scalar operands rather than		// with the second gather nodes if they have less scalar operands rather than
// the initial tree element (may be profitable to shuffle the second gather).		// the initial tree element (may be profitable to shuffle the second gather)
		// or they are extractelements, which form shuffle.
		SmallVector<int> Mask;
		david-arm (inline comment, unsubmitted, not done): Does the comment need updating here to reflect the change?
if (VectorizableTree[0]->State == TreeEntry::Vectorize &&		if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
(allConstant(VectorizableTree[1]->Scalars) ||		(allConstant(VectorizableTree[1]->Scalars) ||
isSplat(VectorizableTree[1]->Scalars) ||		isSplat(VectorizableTree[1]->Scalars) ||
(VectorizableTree[1]->State == TreeEntry::NeedToGather &&		(VectorizableTree[1]->State == TreeEntry::NeedToGather &&
VectorizableTree[1]->Scalars.size() <		VectorizableTree[1]->Scalars.size() <
VectorizableTree[0]->Scalars.size())))		VectorizableTree[0]->Scalars.size()) ||
		(VectorizableTree[1]->State == TreeEntry::NeedToGather &&
		VectorizableTree[1]->getOpcode() == Instruction::ExtractElement &&
		isShuffle(VectorizableTree[1]->Scalars, Mask))))
return true;		return true;

// Gathering cost would be too much for tiny trees.		// Gathering cost would be too much for tiny trees.
if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||		if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
VectorizableTree[1]->State == TreeEntry::NeedToGather)		VectorizableTree[1]->State == TreeEntry::NeedToGather)
return false;		return false;

return true;		return true;
▲ Show 20 Lines • Show All 4,009 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll

	Show All 11 Lines
	; CHECK-NEXT: [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>			; CHECK-NEXT: [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
	; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>			; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
	; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]			; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
	; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0			; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
	; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64			; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64
	; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[S0]]			; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[S0]]
	; CHECK-NEXT: [[LOAD0:%.*]] = load i64, i64* [[GEP0]], align 4			; CHECK-NEXT: [[LOAD0:%.*]] = load i64, i64* [[GEP0]], align 4
	; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1			; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
	; CHECK-NEXT: [[S1:%.*]] = sext i32 [[E1]] to i64
	; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S1]]
	; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4
	; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2			; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
	; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64			; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[E1]], i32 0
	; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S2]]			; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[E2]], i32 1
				; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
				; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
				; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
				; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64
				; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP5]]
				; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4
				; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
				; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
				; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]]
				david-arm (inline comment, unsubmitted, not done): At first glance this looks worse, but I've tried out your patch and can see the generated code is the same, because the entire first sequence of inserts, sext and trunc gets folded away, since the sext + trunc is basically a no-op.
				ABataev (author; inline comment, unsubmitted, done): Yeah, llvm-mca gives a throughput of 13.5 without vectorization and 15.5 with the vectorized call (the difference is smaller on newer processors). This looks like another example of a known problem with overly optimistic user cost compensation. It should go away once we land the proper implementation of insertelement instruction vectorization, but I'll try to prepare a temporary patch to improve the situation in the meantime.
	; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]], align 4			; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]], align 4
	; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3			; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
	; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64			; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64
	; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S3]]			; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S3]]
	; CHECK-NEXT: [[LOAD3:%.*]] = load i64, i64* [[GEP3]], align 4			; CHECK-NEXT: [[LOAD3:%.*]] = load i64, i64* [[GEP3]], align 4
	; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])			; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	▲ Show 20 Lines • Show All 78 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Try to vectorize tiny trees with shuffled gathers of extractelements.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 346236

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Try to vectorize tiny trees with shuffled gathers of extractelements.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 346236

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll

[SLP]Try to vectorize tiny trees with shuffled gathers of extractelements.
ClosedPublic