This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Transforms/Vectorize/
-
Transforms/
-
Vectorize/
-
LoopVectorize.cpp
-
test/Transforms/LoopVectorize/AArch64/
-
Transforms/
-
LoopVectorize/
-
AArch64/
-
pr64986.ll

Differential D158988

[LV] Choose the wider VF where they have same cost
AbandonedPublic

Authored by Allen on Aug 28 2023, 7:17 AM.

Download Raw Diff

Details

Reviewers

sdesmalen
dmgreen
bmahjour
david-arm
ctetreau
fhahn

Summary

Sometimes different VFs end up with the same cost; in that case, prefer
the wider VF to improve the degree of parallelism.

Fixes https://github.com/llvm/llvm-project/issues/64986

Diff Detail

Unit TestsFailed

	Time	Test
	60,060 ms	x64 debian > MLIR.Examples/standalone::test.toy

Event Timeline

Allen created this revision.Aug 28 2023, 7:17 AM

Herald added a project: Restricted Project. · View Herald TranscriptAug 28 2023, 7:17 AM

Herald added subscribers: artagnon, hiraditya. · View Herald Transcript

Allen requested review of this revision.Aug 28 2023, 7:17 AM

Herald added a project: Restricted Project. · View Herald TranscriptAug 28 2023, 7:17 AM

Herald added subscribers: llvm-commits, wangpc. · View Herald Transcript

Harbormaster completed remote builds in B255217: Diff 553923.Aug 28 2023, 8:14 AM

ping ?

Hi @Allen,
I recently submitted D157628, which lowers the cost of extends when they can be folded into a urhadd or srhadd instruction.
The tests I added are similar to the one in this patch, so I was wondering if D157628 may have fixed the same issue as your changes?

In D158988#4625738, @kmclaughlin wrote:

Hi @Allen,
I recently submitted D157628, which lowers the cost of extends when they can be folded into a urhadd or srhadd instruction.
The tests I added are similar to the one in this patch, so I was wondering if D157628 may have fixed the same issue as your changes?

Thanks for the information. I tried your patch and found that it only affects the fixed-length VFs, so vscale x 8 is still preferred with your patch.

In D158988#4630317, @Allen wrote:

In D158988#4625738, @kmclaughlin wrote:

Hi @Allen,
I recently submitted D157628, which lowers the cost of extends when they can be folded into a urhadd or srhadd instruction.
The tests I added are similar to the one in this patch, so I was wondering if D157628 may have fixed the same issue as your changes?

Thanks for the information. I tried your patch and found that it only affects the fixed-length VFs, so vscale x 8 is still preferred with your patch.

Adding @fhahn as a reviewer.

Which CPU are you targeting and how did you build your example? I believe that @kmclaughlin used D157628 to show that for certain loops in x264 when tail-folding we choose a higher VF for some SVE2-enabled CPUs due to the lower cost of the zext and sext instructions. Regardless of that, I'm still a bit worried by this patch because I believe it is a very significant change that will affect all targets across a wide range of CPUs. I'm not saying this change is wrong, but can you describe in the commit message what benchmarks you have run and for what targets?

Herald added a subscriber: StephenFan. · View Herald TranscriptAug 31 2023, 1:01 AM

Also, it's quite possible that the cost model for the loop you're interested in needs improving further. For example, what if you changed the cost of the trunc in the loop to zero - would that also solve your problem? I think @kmclaughlin's patch only changed the costs of the zext and sext instructions.

Oh, sorry. I mistakenly used -mattr=+sve to test this case; it should have been -mattr=+sve2, which is why your patch only changed the fixed-length vector types.

opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve -passes=loop-vectorize -debug-only=loop-vectorize  pr64986.ll

Revision Contents

Path

Size

llvm/

lib/

Transforms/

Vectorize/

LoopVectorize.cpp

7 lines

test/

Transforms/

LoopVectorize/

AArch64/

pr64986.ll

57 lines

Diff 553923

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

	Show First 20 Lines • Show All 91 Lines • ▼ Show 20 Lines

	// Assume vscale may be larger than 1 (or the value being tuned for),			// Assume vscale may be larger than 1 (or the value being tuned for),
	// so that scalable vectorization is slightly favorable over fixed-width			// so that scalable vectorization is slightly favorable over fixed-width
	// vectorization.			// vectorization.
	if (A.Width.isScalable() && !B.Width.isScalable())			if (A.Width.isScalable() && !B.Width.isScalable())
	return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);			return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);

	// To avoid the need for FP division:			// To avoid the need for FP division:
	// (CostA / A.Width) < (CostB / B.Width)			// (CostA / A.Width) <= (CostB / B.Width)
	// <=> (CostA * B.Width) < (CostB * A.Width)			// <=> (CostA * B.Width) <= (CostB * A.Width)
	return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);			// Choose the wider VF where they have same cost.
				return (CostA * EstimatedWidthB) <= (CostB * EstimatedWidthA);
	}			}

	static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,			static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
	OptimizationRemarkEmitter *ORE,			OptimizationRemarkEmitter *ORE,
	Loop *TheLoop) {			Loop *TheLoop) {
	if (InvalidCosts.empty())			if (InvalidCosts.empty())
	return;			return;

	▲ Show 20 Lines • Show All 91 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/AArch64/pr64986.ll

This file was added.

				; RUN: opt -mtriple=aarch64-unknown-linux-gnu -mattr=+sve -passes=loop-vectorize -pass-remarks=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s

				target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
				target triple = "aarch64-unknown-linux-gnu"

				; prefer vscale x 16 when vscale x 16 and vscale x 8 have same cost.
				; CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: vscale x 16, interleaved count: 2)

				; Two-level loop nest: the outer loop runs %i_height times and the inner
				; loop runs %i_width times. Each inner iteration loads one i8 from %src1 and
				; one from %src2, computes (a + b + 1) >> 1 in i16, truncates the result to
				; i8 and stores it to %dst. After each outer iteration the three pointers are
				; advanced by their respective (sign-extended) strides.
				define void @pixel_avg(ptr noalias nocapture %dst, i32 %i_dst_stride, ptr noalias nocapture %src1, i32 %i_src1_stride, ptr noalias nocapture %src2, i32 %i_src2_stride, i32 %i_width, i32 %i_height) {
				entry:
				; Skip everything when %i_height is not positive.
				%cmp29 = icmp sgt i32 %i_height, 0
				br i1 %cmp29, label %for.preheader.lr.ph, label %cleanup

				for.preheader.lr.ph: ; preds = %entry
				; Widen the i32 strides and the width to i64 once, ahead of both loops,
				; and skip the loop nest when %i_width is not positive.
				%cmp227 = icmp sgt i32 %i_width, 0
				%idx.ext = sext i32 %i_dst_stride to i64
				%idx.ext12 = sext i32 %i_src1_stride to i64
				%idx.ext14 = sext i32 %i_src2_stride to i64
				%wide.trip.count = zext i32 %i_width to i64
				br i1 %cmp227, label %for.preheader, label %cleanup

				for.preheader: ; preds = %for.preheader.lr.ph, %for.latch
				; Outer-loop header: row counter plus the current dst/src1/src2 pointers.
				%y.033.us = phi i32 [ %inc17.us, %for.latch ], [ 0, %for.preheader.lr.ph ]
				%dst.addr.032.us = phi ptr [ %add.ptr.us, %for.latch ], [ %dst, %for.preheader.lr.ph ]
				%src1.addr.031.us = phi ptr [ %add.ptr13.us, %for.latch ], [ %src1, %for.preheader.lr.ph ]
				%src2.addr.030.us = phi ptr [ %add.ptr15.us, %for.latch ], [ %src2, %for.preheader.lr.ph ]
				br label %for.body

				for.body: ; preds = %for.preheader, %for.body
				; Inner loop: i8 loads are zero-extended to i16 so that a + b + 1 is
				; computed at i16 width before the shift and the truncation back to i8.
				%indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %for.body ]
				%arrayidx.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %indvars.iv
				%0 = load i8, ptr %arrayidx.us, align 1
				%conv.us = zext i8 %0 to i16
				%arrayidx6.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %indvars.iv
				%1 = load i8, ptr %arrayidx6.us, align 1
				%conv7.us = zext i8 %1 to i16
				%add.us = add nuw nsw i16 %conv.us, 1
				%add8.us = add nuw nsw i16 %add.us, %conv7.us
				%shr.us = lshr i16 %add8.us, 1
				%conv9.us = trunc i16 %shr.us to i8
				%arrayidx11.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %indvars.iv
				store i8 %conv9.us, ptr %arrayidx11.us, align 1
				%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
				%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
				br i1 %exitcond.not, label %for.latch, label %for.body

				for.latch: ; preds = %for.body
				; Outer-loop latch: advance each pointer by its stride and bump the row
				; counter; exit once it equals %i_height.
				%add.ptr.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %idx.ext
				%add.ptr13.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %idx.ext12
				%add.ptr15.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %idx.ext14
				%inc17.us = add nuw nsw i32 %y.033.us, 1
				%exitcond36.not = icmp eq i32 %inc17.us, %i_height
				br i1 %exitcond36.not, label %cleanup, label %for.preheader

				cleanup: ; preds = %for.latch, %for.preheader.lr.ph, %entry
				ret void
				}