This is an archive of the discontinued LLVM Phabricator instance.

[LV] Improve register pressure estimation if MaxLocalUsers is zero
Needs ReviewPublic

Authored by yrouban on Apr 3 2023, 3:48 AM.

Download Raw Diff

Details

Reviewers

• wuzish
hfinkel
reames
craig.topper
fhahn

Summary

Do not limit the LoopVectorize interleave count by MaxLocalUsers when MaxLocalUsers is zero.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

yrouban created this revision.Apr 3 2023, 3:48 AM

Herald added a project: Restricted Project. · View Herald TranscriptApr 3 2023, 3:48 AM

Herald added subscribers: StephenFan, hiraditya. · View Herald Transcript

yrouban requested review of this revision.Apr 3 2023, 3:48 AM

Herald added a project: Restricted Project. · View Herald TranscriptApr 3 2023, 3:48 AM

Herald added a subscriber: • pcwang-thead. · View Herald Transcript

Harbormaster completed remote builds in B223318: Diff 510447.Apr 3 2023, 4:42 AM

yrouban retitled this revision from "[LV] Improve register pressure estimation for MaxLocalUsers is zero" to "[LV] Improve register pressure estimation if MaxLocalUsers is zero". Apr 4 2023, 7:59 AM

Do you have any performance data motivating the change and ruling out any regressions?

llvm/test/Transforms/LoopVectorize/X86/interleave-count.ll
38	Please update the test to use opaque pointers. Also, it would be good to put up a separate patch that just adds the test, and then include only the changes caused by this patch in the diff.

yrouban mentioned this in D147588: [LV] Add a test for register pressure estimation. NFC.Apr 4 2023, 8:49 PM

In D147434#4243671, @fhahn wrote:

Do you have any performance data motivating the change and ruling out any regressions?

No. I investigated AVX512 memset code generated by a non-LLVM compiler: it uses a vectorized move instruction on zmm registers and is unrolled as if it had an interleave count of 16. While trying to achieve the same result with LoopVectorize, I found this nit. It is not a problem as long as the number of vector registers is big enough, that is, as long as the register count, even after being decremented and bit-floored, is still bigger than the other interleave count limits (e.g. X86TTIImpl::getMaxInterleaveFactor() returns 4 for AVX).

llvm/test/Transforms/LoopVectorize/X86/interleave-count.ll
38	Done. See D147588.

extracted the test to a separate patch.

Harbormaster completed remote builds in B223723: Diff 510993.Apr 4 2023, 9:18 PM

just rebased over the updated D147588

Harbormaster completed remote builds in B231052: Diff 520930.May 10 2023, 1:16 AM

Revision Contents

Path

Size

llvm/

lib/

Transforms/

Vectorize/

LoopVectorize.cpp

16 lines

test/

Transforms/

LoopVectorize/

X86/

interleave-count.ll

14 lines

Diff 520930

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,808 Lines • ▼ Show 20 Lines	if (VF.isScalar()) {
if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)		if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
TargetNumRegisters = ForceTargetNumVectorRegs;		TargetNumRegisters = ForceTargetNumVectorRegs;
}		}
unsigned MaxLocalUsers = pair.second;		unsigned MaxLocalUsers = pair.second;
unsigned LoopInvariantRegs = 0;		unsigned LoopInvariantRegs = 0;
if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())		if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
LoopInvariantRegs = R.LoopInvariantRegs[pair.first];		LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
MaxLocalUsers);
// Don't count the induction variable as interleaved.
if (EnableIndVarRegisterHeur) {		if (EnableIndVarRegisterHeur) {
TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /		// Treat the induction variable as a LoopInvariantReg.
std::max(1U, (MaxLocalUsers - 1)));		assert(MaxLocalUsers);
}		--MaxLocalUsers;
		++LoopInvariantRegs;
		}
		if (MaxLocalUsers > 0) {
		unsigned TmpIC = llvm::bit_floor(
		(TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
IC = std::min(IC, TmpIC);		IC = std::min(IC, TmpIC);
}		}
		}

// Clamp the interleave ranges to reasonable counts.		// Clamp the interleave ranges to reasonable counts.
unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);		unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

// Check if the user has overridden the max.		// Check if the user has overridden the max.
if (VF.isScalar()) {		if (VF.isScalar()) {
if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)		if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;		MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
▲ Show 20 Lines • Show All 4,765 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/X86/interleave-count.ll

	; REQUIRES: asserts			; REQUIRES: asserts
	;			;
	; RUN: opt -S -passes=loop-vectorize -debug-only=loop-vectorize \			; RUN: opt -S -passes=loop-vectorize -debug-only=loop-vectorize \
	; RUN: -mtriple=x86_64-pc_linux -mcpu=cascadelake \			; RUN: -mtriple=x86_64-pc_linux -mcpu=cascadelake \
	; RUN: -force-target-max-vector-interleave=16 -force-target-num-vector-regs=16 \			; RUN: -force-target-max-vector-interleave=16 -force-target-num-vector-regs=16 \
	; RUN: %s 2>&1 \| FileCheck %s			; RUN: %s 2>&1 \| FileCheck %s

	; Check that the interleave count is limited by 8 even if there is no			; Check that the interleave count is not limited by 8 if there is no
	; register use except one induction variable.			; register use except one induction variable.
	define void @test(ptr %dst, i64 %size) {			define void @test(ptr %dst, i64 %size) {
	; CHECK-LABEL: LV: Checking a loop in 'test'			; CHECK-LABEL: LV: Checking a loop in 'test'
	; CHECK: LV: IC is 8			; CHECK: LV: IC is 16
	;			;
	; CHECK-LABEL: define void @test			; CHECK-LABEL: define void @test
	;			;
	; Number of @llvm.masked.scatter() calls is 8.			; Number of @llvm.masked.scatter() calls is 16.
	; CHECK: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>			; CHECK: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
	; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>			; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
	; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>			; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
	; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>			; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
	; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>			; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
	; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>			; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
	; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>			; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
	; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>			; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
				; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
				; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
				; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
				; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
				; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
				; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
				; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
				; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
	; CHECK-NOT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>			; CHECK-NOT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double>
	;			;
	entry:			entry:
	%p.end = getelementptr inbounds double, ptr %dst, i64 %size			%p.end = getelementptr inbounds double, ptr %dst, i64 %size
	br label %loop			br label %loop

				fhahn (inline comment; Unsubmitted, Done): Please update the test to use opaque pointers. Also, it would be good to put up a patch to just add the test separately and then only include the changes caused by the patch in the diff.
				yrouban (Author; inline comment; Unsubmitted, Done): Done. See D147588.
	loop:			loop:
	%p = phi ptr [%dst, %entry], [%p.next, %loop]			%p = phi ptr [%dst, %entry], [%p.next, %loop]
	store double -1.000000e+00, ptr %p, align 8			store double -1.000000e+00, ptr %p, align 8
	%p.next = getelementptr inbounds double, ptr %p, i64 8			%p.next = getelementptr inbounds double, ptr %p, i64 8
	%done = icmp eq ptr %p.next, %p.end			%done = icmp eq ptr %p.next, %p.end
	br i1 %done, label %exit, label %loop			br i1 %done, label %exit, label %loop

	exit:			exit:
	ret void			ret void
	}			}