This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Improve reordering for the nodes being used in alternate vectorization.
ClosedPublic

Authored by ABataev on Jan 6 2022, 6:43 AM.

Download Raw Diff

Details

Reviewers

vporpo
RKSimon
anton-afanasyev
dtemirbulatov

Commits

rGd130df544d6c: [SLP]Improve reordering for the nodes beeing used in alternate vectorization.

Summary

There is no need to take into account the order of the scalars being used
as part of the alternate vectorization when trying to reorder the whole
graph. It is better to reorder such elements in the following phase, because
the subtree still ends up in a shuffle.

Part of D116688, fixes the regression in D116690.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

ABataev created this revision.Jan 6 2022, 6:43 AM

Herald added a subscriber: hiraditya. · View Herald TranscriptJan 6 2022, 6:43 AM

ABataev requested review of this revision.Jan 6 2022, 6:43 AM

Herald added a project: Restricted Project. · View Herald TranscriptJan 6 2022, 6:43 AM

Harbormaster completed remote builds in B141894: Diff 397872.Jan 6 2022, 7:14 AM

ABataev mentioned this in D116688: [SLP]Excluded external uses from the reordering estimation..Jan 6 2022, 7:18 AM

LGTM

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
3057	Please explain in the comment why this is better.

This revision is now accepted and ready to land.Jan 6 2022, 10:23 AM

Closed by commit rGd130df544d6c: [SLP]Improve reordering for the nodes beeing used in alternate vectorization. (authored by ABataev). · Explain WhyJan 6 2022, 11:20 AM

This revision was automatically updated to reflect the committed changes.

ABataev added a commit: rGd130df544d6c: [SLP]Improve reordering for the nodes beeing used in alternate vectorization..

I have a question wrt this patch. Consider this test case:
define dso_local void @test(i32* noalias nocapture readonly %0, i32* noalias nocapture readonly %1, i32* noalias nocapture %2) {

%4 = getelementptr inbounds i32, i32* %1, i64 0
%5 = load i32, i32* %4, align 4
%6 = getelementptr inbounds i32, i32* %0, i64 0
%7 = load i32, i32* %6, align 4
%8 = getelementptr inbounds i32, i32* %1, i64 4
%9 = load i32, i32* %8, align 4
%10 = getelementptr inbounds i32, i32* %0, i64 4
%11 = load i32, i32* %10, align 4
%12 = getelementptr inbounds i32, i32* %1, i64 1
%13 = load i32, i32* %12, align 4
%14 = getelementptr inbounds i32, i32* %0, i64 1
%15 = load i32, i32* %14, align 4
%16 = getelementptr inbounds i32, i32* %1, i64 5
%17 = load i32, i32* %16, align 4
%18 = getelementptr inbounds i32, i32* %0, i64 5
%19 = load i32, i32* %18, align 4
%20 = getelementptr inbounds i32, i32* %1, i64 2
%21 = load i32, i32* %20, align 4
%22 = getelementptr inbounds i32, i32* %0, i64 2
%23 = load i32, i32* %22, align 4
%24 = getelementptr inbounds i32, i32* %1, i64 6
%25 = load i32, i32* %24, align 4
%26 = getelementptr inbounds i32, i32* %0, i64 6
%27 = load i32, i32* %26, align 4
%28 = getelementptr inbounds i32, i32* %1, i64 3
%29 = load i32, i32* %28, align 4
%30 = getelementptr inbounds i32, i32* %0, i64 3
%31 = load i32, i32* %30, align 4
%32 = getelementptr inbounds i32, i32* %1, i64 7
%33 = load i32, i32* %32, align 4
%34 = getelementptr inbounds i32, i32* %0, i64 7
%35 = load i32, i32* %34, align 4
%36 = sub i32 %33, %31
%37 = sub i32 %36, %35
%38 = add i32 %37, %29
%39 = sub i32 %25, %23
%40 = sub i32 %39, %27
%41 = add i32 %40, %21
%42 = sub i32 %17, %15
%43 = sub i32 %42, %19
%44 = add i32 %43, %13
%45 = sub i32 %9, %7
%46 = sub i32 %45, %11
%47 = add i32 %46, %5
%48 = getelementptr inbounds i32, i32* %2, i64 0
%49 = add i32 %41, %38
%50 = add i32 %49, %47
%51 = add i32 %50, %44
store i32 %51, i32* %48, align 4
%52 = getelementptr inbounds i32, i32* %2, i64 2
%53 = add i32 %47, %44
%54 = sub i32 %53, %38
%55 = sub i32 %54, %41
store i32 %55, i32* %52, align 4
%56 = getelementptr inbounds i32, i32* %2, i64 1
%57 = add i32 %47, %41
%58 = sub i32 %57, %44
%59 = sub i32 %58, %38
store i32 %59, i32* %56, align 4
%60 = getelementptr inbounds i32, i32* %2, i64 3
%61 = sub i32 %38, %44
%62 = sub i32 %61, %41
%63 = add i32 %62, %47
store i32 %63, i32* %60, align 4
ret void

}

opt -slp-vectorizer -dce -mtriple=x86_64-unknown-linux-gnu -mattr=+avx -S

After the patch, SLP produces more shufflevector instructions than before:

%9 = load <4 x i32>, <4 x i32>* %8, align 4
%shuffle2 = shufflevector <4 x i32> %9, <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
%10 = bitcast i32* %5 to <4 x i32>*
%11 = load <4 x i32>, <4 x i32>* %10, align 4
%12 = bitcast i32* %6 to <4 x i32>*
%13 = load <4 x i32>, <4 x i32>* %12, align 4
%14 = bitcast i32* %7 to <4 x i32>*
%15 = load <4 x i32>, <4 x i32>* %14, align 4
%shuffle1 = shufflevector <4 x i32> %15, <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
%16 = sub <4 x i32> %13, %11
%shuffle = shufflevector <4 x i32> %16, <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
%17 = sub <4 x i32> %shuffle, %shuffle1
%18 = add <4 x i32> %17, %shuffle2

The instcombine pass then optimizes these shuffles away, but the question is whether SLP should rely on that. Is this expected, or is it considered a regression in the SLP vectorizer?

In D116740#3308394, @vdmitrie wrote:

I have a question wrt this patch. Consider this test case:
define dso_local void @test(i32* noalias nocapture readonly %0, i32* noalias nocapture readonly %1, i32* noalias nocapture %2) {

%4 = getelementptr inbounds i32, i32* %1, i64 0
%5 = load i32, i32* %4, align 4
%6 = getelementptr inbounds i32, i32* %0, i64 0
%7 = load i32, i32* %6, align 4
%8 = getelementptr inbounds i32, i32* %1, i64 4
%9 = load i32, i32* %8, align 4
%10 = getelementptr inbounds i32, i32* %0, i64 4
%11 = load i32, i32* %10, align 4
%12 = getelementptr inbounds i32, i32* %1, i64 1
%13 = load i32, i32* %12, align 4
%14 = getelementptr inbounds i32, i32* %0, i64 1
%15 = load i32, i32* %14, align 4
%16 = getelementptr inbounds i32, i32* %1, i64 5
%17 = load i32, i32* %16, align 4
%18 = getelementptr inbounds i32, i32* %0, i64 5
%19 = load i32, i32* %18, align 4
%20 = getelementptr inbounds i32, i32* %1, i64 2
%21 = load i32, i32* %20, align 4
%22 = getelementptr inbounds i32, i32* %0, i64 2
%23 = load i32, i32* %22, align 4
%24 = getelementptr inbounds i32, i32* %1, i64 6
%25 = load i32, i32* %24, align 4
%26 = getelementptr inbounds i32, i32* %0, i64 6
%27 = load i32, i32* %26, align 4
%28 = getelementptr inbounds i32, i32* %1, i64 3
%29 = load i32, i32* %28, align 4
%30 = getelementptr inbounds i32, i32* %0, i64 3
%31 = load i32, i32* %30, align 4
%32 = getelementptr inbounds i32, i32* %1, i64 7
%33 = load i32, i32* %32, align 4
%34 = getelementptr inbounds i32, i32* %0, i64 7
%35 = load i32, i32* %34, align 4
%36 = sub i32 %33, %31
%37 = sub i32 %36, %35
%38 = add i32 %37, %29
%39 = sub i32 %25, %23
%40 = sub i32 %39, %27
%41 = add i32 %40, %21
%42 = sub i32 %17, %15
%43 = sub i32 %42, %19
%44 = add i32 %43, %13
%45 = sub i32 %9, %7
%46 = sub i32 %45, %11
%47 = add i32 %46, %5
%48 = getelementptr inbounds i32, i32* %2, i64 0
%49 = add i32 %41, %38
%50 = add i32 %49, %47
%51 = add i32 %50, %44
store i32 %51, i32* %48, align 4
%52 = getelementptr inbounds i32, i32* %2, i64 2
%53 = add i32 %47, %44
%54 = sub i32 %53, %38
%55 = sub i32 %54, %41
store i32 %55, i32* %52, align 4
%56 = getelementptr inbounds i32, i32* %2, i64 1
%57 = add i32 %47, %41
%58 = sub i32 %57, %44
%59 = sub i32 %58, %38
store i32 %59, i32* %56, align 4
%60 = getelementptr inbounds i32, i32* %2, i64 3
%61 = sub i32 %38, %44
%62 = sub i32 %61, %41
%63 = add i32 %62, %47
store i32 %63, i32* %60, align 4
ret void

}

opt -slp-vectorizer -dce -mtriple=x86_64-unknown-linux-gnu -mattr=+avx -S

After the patch, SLP produces more shufflevector instructions than before:

%9 = load <4 x i32>, <4 x i32>* %8, align 4
%shuffle2 = shufflevector <4 x i32> %9, <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
%10 = bitcast i32* %5 to <4 x i32>*
%11 = load <4 x i32>, <4 x i32>* %10, align 4
%12 = bitcast i32* %6 to <4 x i32>*
%13 = load <4 x i32>, <4 x i32>* %12, align 4
%14 = bitcast i32* %7 to <4 x i32>*
%15 = load <4 x i32>, <4 x i32>* %14, align 4
%shuffle1 = shufflevector <4 x i32> %15, <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
%16 = sub <4 x i32> %13, %11
%shuffle = shufflevector <4 x i32> %16, <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
%17 = sub <4 x i32> %shuffle, %shuffle1
%18 = add <4 x i32> %17, %shuffle2

The instcombine pass then optimizes these shuffles away, but the question is whether SLP should rely on that. Is this expected, or is it considered a regression in the SLP vectorizer?

No, it is not expected; it is a regression, and I need to investigate it.

I believe https://reviews.llvm.org/D120492 was supposed to fix the issue I reported earlier, but that did not happen. The test I sent earlier is a simplified one. I have just slightly modified it to show the misbehavior of the reordering.

This is okay:
define void @test(i32* %arg, i32* %arg1, i32* %arg2) {
bb:

%i3 = load i32, i32* %arg, align 4
%s3 = add i32 %i3, %i3
%i4 = getelementptr inbounds i32, i32* %arg, i64 4
%i5 = load i32, i32* %i4, align 4
%s5 = add i32 %i5, %i5
%i8 = getelementptr inbounds i32, i32* %arg, i64 1
%i9 = load i32, i32* %i8, align 4
%s9 = add i32 %i9, %i9
%i10 = getelementptr inbounds i32, i32* %arg, i64 5
%i11 = load i32, i32* %i10, align 4
%s11 = add i32 %i11, %i11
%i14 = getelementptr inbounds i32, i32* %arg, i64 2
%i15 = load i32, i32* %i14, align 4
%s15 = add i32 %i15, %i15
%i16 = getelementptr inbounds i32, i32* %arg, i64 6
%i17 = load i32, i32* %i16, align 4
%s17 = add i32 %i17, %i17
%i20 = getelementptr inbounds i32, i32* %arg, i64 3
%i21 = load i32, i32* %i20, align 4
%s21 = add i32 %i21, %i21
%i22 = getelementptr inbounds i32, i32* %arg, i64 7
%i23 = load i32, i32* %i22, align 4
%s23 = add i32 %i23, %i23

%i1 = load i32, i32* %arg1, align 4
%i6 = getelementptr inbounds i32, i32* %arg1, i64 1
%i7 = load i32, i32* %i6, align 4
%i12 = getelementptr inbounds i32, i32* %arg1, i64 2
%i13 = load i32, i32* %i12, align 4
%i18 = getelementptr inbounds i32, i32* %arg1, i64 3
%i19 = load i32, i32* %i18, align 4

%i24 = sub i32 0, %s21
%i25 = sub i32 %i24, %s23
%i26 = add i32 %i25, %i19
%i27 = sub i32 undef, %s15
%i28 = sub i32 %i27, %s17
%i29 = add i32 %i28, %i13
%i30 = sub i32 0, %s9
%i31 = sub i32 %i30, %s11
%i32 = add i32 %i31, %i7
%i33 = sub i32 0, %s3
%i34 = sub i32 %i33, %s5
%i35 = add i32 %i34, %i1
%i36 = add i32 %i29, 1
%i37 = add i32 %i36, 0
%i38 = add i32 %i37, 0
store i32 %i38, i32* %arg2, align 4
%i39 = getelementptr inbounds i32, i32* %arg2, i64 2
%i40 = add i32 0, %i32
%i41 = sub i32 %i40, 0
%i42 = sub i32 %i41, 0
store i32 %i42, i32* %i39, align 4
%i43 = getelementptr inbounds i32, i32* %arg2, i64 1
%i44 = add i32 %i35, 0
%i45 = sub i32 %i44, 0
%i46 = sub i32 %i45, 0
store i32 %i46, i32* %i43, align 4
%i47 = getelementptr inbounds i32, i32* %arg2, i64 3
%i48 = sub i32 %i26, 0
%i49 = sub i32 %i48, 0
%i50 = add i32 %i49, 0
store i32 %i50, i32* %i47, align 4
ret void

}

merely because of this if statement:

if (UserTE->UserTreeIndices.size() != 1)
  break;

effectively returning to behavior prior to the patch.

But this test still produces all these extra shuffles:
define void @test(i32* %arg, i32* %arg1, i32* %arg2) {
bb:

%i3 = load i32, i32* %arg, align 4
%s3 = add i32 %i3, 6
%i4 = getelementptr inbounds i32, i32* %arg, i64 4
%i5 = load i32, i32* %i4, align 4
%s5 = add i32 %i5, 6
%i8 = getelementptr inbounds i32, i32* %arg, i64 1
%i9 = load i32, i32* %i8, align 4
%s9 = add i32 %i9, 6
%i10 = getelementptr inbounds i32, i32* %arg, i64 5
%i11 = load i32, i32* %i10, align 4
%s11 = add i32 %i11, 6
%i14 = getelementptr inbounds i32, i32* %arg, i64 2
%i15 = load i32, i32* %i14, align 4
%s15 = add i32 %i15, 6
%i16 = getelementptr inbounds i32, i32* %arg, i64 6
%i17 = load i32, i32* %i16, align 4
%s17 = add i32 %i17, 6
%i20 = getelementptr inbounds i32, i32* %arg, i64 3
%i21 = load i32, i32* %i20, align 4
%s21 = add i32 %i21, 6
%i22 = getelementptr inbounds i32, i32* %arg, i64 7
%i23 = load i32, i32* %i22, align 4
%s23 = add i32 %i23, 6

%i1 = load i32, i32* %arg1, align 4
%i6 = getelementptr inbounds i32, i32* %arg1, i64 1
%i7 = load i32, i32* %i6, align 4
%i12 = getelementptr inbounds i32, i32* %arg1, i64 2
%i13 = load i32, i32* %i12, align 4
%i18 = getelementptr inbounds i32, i32* %arg1, i64 3
%i19 = load i32, i32* %i18, align 4

%i24 = sub i32 0, %s21
%i25 = sub i32 %i24, %s23
%i26 = add i32 %i25, %i19
%i27 = sub i32 undef, %s15
%i28 = sub i32 %i27, %s17
%i29 = add i32 %i28, %i13
%i30 = sub i32 0, %s9
%i31 = sub i32 %i30, %s11
%i32 = add i32 %i31, %i7
%i33 = sub i32 0, %s3
%i34 = sub i32 %i33, %s5
%i35 = add i32 %i34, %i1
%i36 = add i32 %i29, 1
%i37 = add i32 %i36, 0
%i38 = add i32 %i37, 0
store i32 %i38, i32* %arg2, align 4
%i39 = getelementptr inbounds i32, i32* %arg2, i64 2
%i40 = add i32 0, %i32
%i41 = sub i32 %i40, 0
%i42 = sub i32 %i41, 0
store i32 %i42, i32* %i39, align 4
%i43 = getelementptr inbounds i32, i32* %arg2, i64 1
%i44 = add i32 %i35, 0
%i45 = sub i32 %i44, 0
%i46 = sub i32 %i45, 0
store i32 %i46, i32* %i43, align 4
%i47 = getelementptr inbounds i32, i32* %arg2, i64 3
%i48 = sub i32 %i26, 0
%i49 = sub i32 %i48, 0
%i50 = add i32 %i49, 0
store i32 %i50, i32* %i47, align 4
ret void

}

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
3073	This condition is always false, because if UserTE->UserTreeIndices.size() != 1 we exit the loop at line 3067.

Herald added a project: Restricted Project. · View Herald TranscriptApr 1 2022, 3:18 PM

vdmitrie added inline comments.Apr 1 2022, 4:53 PM

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
3245–3250	The problem seems to be here when VL is all constants. The operands can still be reordered with no extra cost.

Revision Contents

Path

Size

llvm/

lib/

Transforms/

Vectorize/

SLPVectorizer.cpp

23 lines

test/

Transforms/

SLPVectorizer/

AArch64/

transpose-inseltpoison.ll

11 lines

transpose.ll

11 lines

X86/

vectorize-reordered-list.ll

5 lines

Diff 397945

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 3,047 Lines • ▼ Show 20 Lines	void BoUpSLP::reorderTopToBottom() {
DenseMap<const TreeEntry *, OrdersType> GathersToOrders;		DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
// Find all reorderable nodes with the given VF.		// Find all reorderable nodes with the given VF.
// Currently the are vectorized stores,loads,extracts + some gathering of		// Currently the are vectorized stores,loads,extracts + some gathering of
// extracts.		// extracts.
for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders](		for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders](
const std::unique_ptr<TreeEntry> &TE) {		const std::unique_ptr<TreeEntry> &TE) {
if (Optional<OrdersType> CurrentOrder =		if (Optional<OrdersType> CurrentOrder =
getReorderingData(TE.get(), /*TopToBottom=*/true)) {		getReorderingData(TE.get(), /*TopToBottom=*/true)) {
		// Do not include ordering for nodes used in the alt opcode vectorization,
		// better to reorder them during bottom-to-top stage. If follow the order
		vporpoUnsubmitted Not Done Reply Inline Actions Please explain in the comment why this is better. vporpo: Please explain in the comment why this is better.
		// here, it causes reordering of the whole graph though actually it is
		// profitable just to reorder the subgraph that starts from the alternate
		// opcode vectorization node. Such nodes already end-up with the shuffle
		// instruction and it is just enough to change this shuffle rather than
		// rotate the scalars for the whole graph.
		unsigned Cnt = 0;
		const TreeEntry *UserTE = TE.get();
		while (UserTE && Cnt < RecursionMaxDepth) {
		if (UserTE->UserTreeIndices.size() != 1)
		break;
		if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
		return EI.UserTE->State == TreeEntry::Vectorize &&
		EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
		}))
		return;
		if (UserTE->UserTreeIndices.empty())
		vdmitrieUnsubmitted Not Done Reply Inline Actions this condition is always false because if UserTE->UserTreeIndices.size() != 1 we exit loop at 3067 vdmitrie: this condition is always false because if UserTE->UserTreeIndices.size() != 1 we exit loop at…
		UserTE = nullptr;
		else
		UserTE = UserTE->UserTreeIndices.back().UserTE;
		++Cnt;
		}
VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());		VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
if (TE->State != TreeEntry::Vectorize)		if (TE->State != TreeEntry::Vectorize)
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);		GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
}		}
});		});

// Reorder the graph nodes according to their vectorization factor.		// Reorder the graph nodes according to their vectorization factor.
for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1;		for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1;
▲ Show 20 Lines • Show All 150 Lines • ▼ Show 20 Lines	auto &&CheckOperands =
return TE;		return TE;
});		});
if (It != VL.end() && TE->isSame(VL))		if (It != VL.end() && TE->isSame(VL))
return false;		return false;
TreeEntry *Gather = nullptr;		TreeEntry *Gather = nullptr;
if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) {		if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) {
assert(TE->State != TreeEntry::Vectorize &&		assert(TE->State != TreeEntry::Vectorize &&
"Only non-vectorized nodes are expected.");		"Only non-vectorized nodes are expected.");
if (TE->isSame(VL)) {		if (TE->isSame(VL)) {
Gather = TE;		Gather = TE;
return true;		return true;
}		}
return false;		return false;
}) > 1)		}) > 1)
		vdmitrieUnsubmitted Not Done Reply Inline Actions Problem is seems to be here when VL is all constants. Operands are still can be reordered with no extra cost. vdmitrie: Problem is seems to be here when VL is all constants. Operands are still can be reordered with…
return false;		return false;
if (Gather)		if (Gather)
GatherOps.push_back(Gather);		GatherOps.push_back(Gather);
}		}
return true;		return true;
};		};
// 1. Propagate order to the graph nodes, which use only reordered nodes.		// 1. Propagate order to the graph nodes, which use only reordered nodes.
// I.e., if the node has operands, that are reordered, try to make at least		// I.e., if the node has operands, that are reordered, try to make at least
▲ Show 20 Lines • Show All 7,000 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll

Show First 20 Lines • Show All 62 Lines • ▼ Show 20 Lines	;
store i64 %tmp2.1, i64* %c.1, align 8		store i64 %tmp2.1, i64* %c.1, align 8
ret void		ret void
}		}

define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {		define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32(		; CHECK-LABEL: @build_vec_v4i32(
; CHECK-NEXT: [[TMP1:%.]] = add <4 x i32> [[V0:%.]], [[V1:%.*]]		; CHECK-NEXT: [[TMP1:%.]] = add <4 x i32> [[V0:%.]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]		; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 5, i32 0, i32 3, i32 6>		; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>		; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]		; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 3>		; CHECK-NEXT: ret <4 x i32> [[TMP5]]
; CHECK-NEXT: ret <4 x i32> [[TMP6]]
;		;
%v0.0 = extractelement <4 x i32> %v0, i32 0		%v0.0 = extractelement <4 x i32> %v0, i32 0
%v0.1 = extractelement <4 x i32> %v0, i32 1		%v0.1 = extractelement <4 x i32> %v0, i32 1
%v0.2 = extractelement <4 x i32> %v0, i32 2		%v0.2 = extractelement <4 x i32> %v0, i32 2
%v0.3 = extractelement <4 x i32> %v0, i32 3		%v0.3 = extractelement <4 x i32> %v0, i32 3
%v1.0 = extractelement <4 x i32> %v1, i32 0		%v1.0 = extractelement <4 x i32> %v1, i32 0
%v1.1 = extractelement <4 x i32> %v1, i32 1		%v1.1 = extractelement <4 x i32> %v1, i32 1
%v1.2 = extractelement <4 x i32> %v1, i32 2		%v1.2 = extractelement <4 x i32> %v1, i32 2
▲ Show 20 Lines • Show All 119 Lines • ▼ Show 20 Lines	;
%tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3		%tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
ret <4 x i32> %tmp3.3		ret <4 x i32> %tmp3.3
}		}

define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {		define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @reduction_v4i32(		; CHECK-LABEL: @reduction_v4i32(
; CHECK-NEXT: [[TMP1:%.]] = sub <4 x i32> [[V0:%.]], [[V1:%.*]]		; CHECK-NEXT: [[TMP1:%.]] = sub <4 x i32> [[V0:%.]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]		; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 5, i32 0, i32 7, i32 2>		; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 7, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>		; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 6, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]		; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>		; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>		; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>
; CHECK-NEXT: [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]], <i32 65535, i32 65535, i32 65535, i32 65535>		; CHECK-NEXT: [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]], <i32 65535, i32 65535, i32 65535, i32 65535>
; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]		; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]		; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])		; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
; CHECK-NEXT: ret i32 [[TMP11]]		; CHECK-NEXT: ret i32 [[TMP11]]
▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll

Show First 20 Lines • Show All 62 Lines • ▼ Show 20 Lines	;
store i64 %tmp2.1, i64* %c.1, align 8		store i64 %tmp2.1, i64* %c.1, align 8
ret void		ret void
}		}

define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {		define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32(		; CHECK-LABEL: @build_vec_v4i32(
; CHECK-NEXT: [[TMP1:%.]] = add <4 x i32> [[V0:%.]], [[V1:%.*]]		; CHECK-NEXT: [[TMP1:%.]] = add <4 x i32> [[V0:%.]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]		; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 5, i32 0, i32 3, i32 6>		; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>		; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]		; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 3>		; CHECK-NEXT: ret <4 x i32> [[TMP5]]
; CHECK-NEXT: ret <4 x i32> [[TMP6]]
;		;
%v0.0 = extractelement <4 x i32> %v0, i32 0		%v0.0 = extractelement <4 x i32> %v0, i32 0
%v0.1 = extractelement <4 x i32> %v0, i32 1		%v0.1 = extractelement <4 x i32> %v0, i32 1
%v0.2 = extractelement <4 x i32> %v0, i32 2		%v0.2 = extractelement <4 x i32> %v0, i32 2
%v0.3 = extractelement <4 x i32> %v0, i32 3		%v0.3 = extractelement <4 x i32> %v0, i32 3
%v1.0 = extractelement <4 x i32> %v1, i32 0		%v1.0 = extractelement <4 x i32> %v1, i32 0
%v1.1 = extractelement <4 x i32> %v1, i32 1		%v1.1 = extractelement <4 x i32> %v1, i32 1
%v1.2 = extractelement <4 x i32> %v1, i32 2		%v1.2 = extractelement <4 x i32> %v1, i32 2
▲ Show 20 Lines • Show All 119 Lines • ▼ Show 20 Lines	;
%tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3		%tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
ret <4 x i32> %tmp3.3		ret <4 x i32> %tmp3.3
}		}

define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {		define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @reduction_v4i32(		; CHECK-LABEL: @reduction_v4i32(
; CHECK-NEXT: [[TMP1:%.]] = sub <4 x i32> [[V0:%.]], [[V1:%.*]]		; CHECK-NEXT: [[TMP1:%.]] = sub <4 x i32> [[V0:%.]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]		; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 5, i32 0, i32 7, i32 2>		; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 7, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>		; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 6, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]		; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>		; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>		; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>
; CHECK-NEXT: [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]], <i32 65535, i32 65535, i32 65535, i32 65535>		; CHECK-NEXT: [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]], <i32 65535, i32 65535, i32 65535, i32 65535>
; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]		; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]		; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])		; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
; CHECK-NEXT: ret i32 [[TMP11]]		; CHECK-NEXT: ret i32 [[TMP11]]
▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -S --slp-vectorizer -mtriple=x86_64-unknown %s \| FileCheck %s			; RUN: opt -S --slp-vectorizer -mtriple=x86_64-unknown %s \| FileCheck %s

	define void @test(double* %isec) {			define void @test(double* %isec) {
	; CHECK-LABEL: @test(			; CHECK-LABEL: @test(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[ARRAYIDX5:%.]] = getelementptr inbounds double, double [[ISEC:%.*]], i64 1			; CHECK-NEXT: [[ARRAYIDX5:%.]] = getelementptr inbounds double, double [[ISEC:%.*]], i64 1
	; CHECK-NEXT: [[ARRAYIDX10:%.]] = getelementptr inbounds double, double [[ISEC]], i64 0			; CHECK-NEXT: [[ARRAYIDX10:%.]] = getelementptr inbounds double, double [[ISEC]], i64 0
	; CHECK-NEXT: [[TMP0:%.]] = bitcast double [[ARRAYIDX10]] to <2 x double>*			; CHECK-NEXT: [[TMP0:%.]] = bitcast double [[ARRAYIDX10]] to <2 x double>*
	; CHECK-NEXT: [[TMP1:%.]] = load <2 x double>, <2 x double> [[TMP0]], align 8			; CHECK-NEXT: [[TMP1:%.]] = load <2 x double>, <2 x double> [[TMP0]], align 8
	; CHECK-NEXT: [[ARRAYIDX3:%.]] = getelementptr inbounds double, double [[ISEC]], i64 3			; CHECK-NEXT: [[ARRAYIDX3:%.]] = getelementptr inbounds double, double [[ISEC]], i64 3
	; CHECK-NEXT: [[ARRAYIDX2:%.]] = getelementptr inbounds double, double [[ISEC]], i64 2			; CHECK-NEXT: [[ARRAYIDX2:%.]] = getelementptr inbounds double, double [[ISEC]], i64 2
	; CHECK-NEXT: [[TMP2:%.]] = bitcast double [[ARRAYIDX2]] to <2 x double>*			; CHECK-NEXT: [[TMP2:%.]] = bitcast double [[ARRAYIDX2]] to <2 x double>*
	; CHECK-NEXT: [[TMP3:%.]] = load <2 x double>, <2 x double> [[TMP2]], align 8			; CHECK-NEXT: [[TMP3:%.]] = load <2 x double>, <2 x double> [[TMP2]], align 8
	; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]			; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
	; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]			; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
	; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> <i32 2, i32 1>			; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 2>
	; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
	; CHECK-NEXT: [[TMP7:%.]] = bitcast double [[ARRAYIDX10]] to <2 x double>*			; CHECK-NEXT: [[TMP7:%.]] = bitcast double [[ARRAYIDX10]] to <2 x double>*
	; CHECK-NEXT: store <2 x double> [[SHUFFLE]], <2 x double>* [[TMP7]], align 8			; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	entry:			entry:
	%arrayidx5 = getelementptr inbounds double, double* %isec, i64 1			%arrayidx5 = getelementptr inbounds double, double* %isec, i64 1
	%0 = load double, double* %arrayidx5, align 8			%0 = load double, double* %arrayidx5, align 8
	%arrayidx10 = getelementptr inbounds double, double* %isec, i64 0			%arrayidx10 = getelementptr inbounds double, double* %isec, i64 0
	%1 = load double, double* %arrayidx10, align 8			%1 = load double, double* %arrayidx10, align 8
	%arrayidx3 = getelementptr inbounds double, double* %isec, i64 3			%arrayidx3 = getelementptr inbounds double, double* %isec, i64 3
	Show All 9 Lines