Diff 346754

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,134 Lines • ▼ Show 20 Lines	if (VectorizableTree.size() == 1 &&
VectorizableTree[0]->State == TreeEntry::Vectorize)		VectorizableTree[0]->State == TreeEntry::Vectorize)
return true;		return true;

if (VectorizableTree.size() != 2)		if (VectorizableTree.size() != 2)
return false;		return false;

// Handle splat and all-constants stores. Also try to vectorize tiny trees		// Handle splat and all-constants stores. Also try to vectorize tiny trees
// with the second gather nodes if they have less scalar operands rather than		// with the second gather nodes if they have less scalar operands rather than
// the initial tree element (may be profitable to shuffle the second gather).		// the initial tree element (may be profitable to shuffle the second gather)
		// or they are extractelements, which form shuffle.
		SmallVector<int> Mask;
		david-arm (inline comment; unsubmitted, not done): Does the comment need updating here to reflect the change?
if (VectorizableTree[0]->State == TreeEntry::Vectorize &&		if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
(allConstant(VectorizableTree[1]->Scalars) \|\|		(allConstant(VectorizableTree[1]->Scalars) \|\|
isSplat(VectorizableTree[1]->Scalars) \|\|		isSplat(VectorizableTree[1]->Scalars) \|\|
(VectorizableTree[1]->State == TreeEntry::NeedToGather &&		(VectorizableTree[1]->State == TreeEntry::NeedToGather &&
VectorizableTree[1]->Scalars.size() <		VectorizableTree[1]->Scalars.size() <
VectorizableTree[0]->Scalars.size())))		VectorizableTree[0]->Scalars.size()) \|\|
		(VectorizableTree[1]->State == TreeEntry::NeedToGather &&
		VectorizableTree[1]->getOpcode() == Instruction::ExtractElement &&
		isShuffle(VectorizableTree[1]->Scalars, Mask))))
return true;		return true;

// Gathering cost would be too much for tiny trees.		// Gathering cost would be too much for tiny trees.
if (VectorizableTree[0]->State == TreeEntry::NeedToGather \|\|		if (VectorizableTree[0]->State == TreeEntry::NeedToGather \|\|
VectorizableTree[1]->State == TreeEntry::NeedToGather)		VectorizableTree[1]->State == TreeEntry::NeedToGather)
return false;		return false;

return true;		return true;
▲ Show 20 Lines • Show All 1,925 Lines • ▼ Show 20 Lines	static bool collectValuesToDemote(Value V, SmallPtrSetImpl<Value > &Expr,

// We can always demote truncations and extensions. Since truncations can		// We can always demote truncations and extensions. Since truncations can
// seed additional demotion, we save the truncated value.		// seed additional demotion, we save the truncated value.
case Instruction::Trunc:		case Instruction::Trunc:
Roots.push_back(I->getOperand(0));		Roots.push_back(I->getOperand(0));
break;		break;
case Instruction::ZExt:		case Instruction::ZExt:
case Instruction::SExt:		case Instruction::SExt:
		if (isa<ExtractElementInst>(I->getOperand(0)) \|\|
		isa<InsertElementInst>(I->getOperand(0)))
		return false;
break;		break;

// We can demote certain binary operations if we can demote both of their		// We can demote certain binary operations if we can demote both of their
// operands.		// operands.
case Instruction::Add:		case Instruction::Add:
case Instruction::Sub:		case Instruction::Sub:
case Instruction::Mul:		case Instruction::Mul:
case Instruction::And:		case Instruction::And:
▲ Show 20 Lines • Show All 2,066 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu < %s \| FileCheck %s			; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu < %s \| FileCheck %s

	target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"			target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"

	declare void @foo(i64, i64, i64, i64)			declare void @foo(i64, i64, i64, i64)

	define void @test1(<4 x i16> %a, <4 x i16> %b, i64* %p) {			define void @test1(<4 x i16> %a, <4 x i16> %b, i64* %p) {
	; Make sure types of sub and its sources are not extended.			; Make sure types of sub and its sources are not extended.
	; CHECK-LABEL: @test1(			; CHECK-LABEL: @test1(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[Z0:%.]] = zext <4 x i16> [[A:%.]] to <4 x i32>			; CHECK-NEXT: [[Z0:%.]] = zext <4 x i16> [[A:%.]] to <4 x i32>
	; CHECK-NEXT: [[Z1:%.]] = zext <4 x i16> [[B:%.]] to <4 x i32>			; CHECK-NEXT: [[Z1:%.]] = zext <4 x i16> [[B:%.]] to <4 x i32>
	; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]			; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
	; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0			; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
	; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64			; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
	; CHECK-NEXT: [[GEP0:%.]] = getelementptr inbounds i64, i64 [[P:%.*]], i64 [[S0]]			; CHECK-NEXT: [[GEP0:%.]] = getelementptr inbounds i64, i64 [[P:%.*]], i64 [[TMP1]]
	; CHECK-NEXT: [[LOAD0:%.]] = load i64, i64 [[GEP0]], align 4			; CHECK-NEXT: [[LOAD0:%.]] = load i64, i64 [[GEP0]], align 4
	; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1			; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
	; CHECK-NEXT: [[S1:%.*]] = sext i32 [[E1]] to i64			; CHECK-NEXT: [[GEP1:%.]] = getelementptr inbounds i64, i64 [[P]], i64 [[TMP2]]
	; CHECK-NEXT: [[GEP1:%.]] = getelementptr inbounds i64, i64 [[P]], i64 [[S1]]
	; CHECK-NEXT: [[LOAD1:%.]] = load i64, i64 [[GEP1]], align 4			; CHECK-NEXT: [[LOAD1:%.]] = load i64, i64 [[GEP1]], align 4
	; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2			; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2
	; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64			; CHECK-NEXT: [[GEP2:%.]] = getelementptr inbounds i64, i64 [[P]], i64 [[TMP3]]
				david-arm (inline comment; unsubmitted, not done): At first glance this looks worse, but I've tried out your patch and can see that the generated code is the same, because the entire first sequence of inserts, sext and trunc gets folded away, since the sext + trunc is basically a no-op.
				ABataev (author; inline comment; unsubmitted, done): Yeah, llvm-mca gives a throughput of 13.5 without vectorization and 15.5 with the vectorized call (the difference is smaller for newer processors). This looks like another example of a known problem with too-optimistic user cost compensation. It should go away once we land the proper implementation of insertelement instruction vectorization, but in the meantime I'll try to prepare a temporary patch to improve the situation.
	; CHECK-NEXT: [[GEP2:%.]] = getelementptr inbounds i64, i64 [[P]], i64 [[S2]]
	; CHECK-NEXT: [[LOAD2:%.]] = load i64, i64 [[GEP2]], align 4			; CHECK-NEXT: [[LOAD2:%.]] = load i64, i64 [[GEP2]], align 4
	; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3			; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3
	; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64			; CHECK-NEXT: [[GEP3:%.]] = getelementptr inbounds i64, i64 [[P]], i64 [[TMP4]]
	; CHECK-NEXT: [[GEP3:%.]] = getelementptr inbounds i64, i64 [[P]], i64 [[S3]]
	; CHECK-NEXT: [[LOAD3:%.]] = load i64, i64 [[GEP3]], align 4			; CHECK-NEXT: [[LOAD3:%.]] = load i64, i64 [[GEP3]], align 4
	; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])			; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	entry:			entry:
	%z0 = zext <4 x i16> %a to <4 x i32>			%z0 = zext <4 x i16> %a to <4 x i32>
	%z1 = zext <4 x i16> %b to <4 x i32>			%z1 = zext <4 x i16> %b to <4 x i32>
	%sub0 = sub <4 x i32> %z0, %z1			%sub0 = sub <4 x i32> %z0, %z1
	▲ Show 20 Lines • Show All 74 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Try to vectorize tiny trees with shuffled gathers of extractelements.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 346754

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Try to vectorize tiny trees with shuffled gathers of extractelements. (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 346754

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll

[SLP]Try to vectorize tiny trees with shuffled gathers of extractelements.
ClosedPublic