This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/
-
IR/
-
IRBuilder.cpp
-
Transforms/Vectorize/
-
Vectorize/
-
LoopVectorize.cpp
-
test/Transforms/LoopVectorize/
-
Transforms/
-
LoopVectorize/
1/2
scalable-loop-invariant-store-unpredicated-body-scalar-tail.ll

Differential D90344

[POC][LoopVectorizer] Allow invariant loads/stores using masked gather/scatter for a scalable VF.
AbandonedPublic

Authored by sdesmalen on Oct 28 2020, 2:09 PM.

Download Raw Diff

Details

Reviewers

ctetreau

Summary

This patch is part of a proof of concept for vectorising a loop using
scalable vectors. The patch is shared for reference and there is no
expectation for this patch to land in the current form.

For fixed-width vectors, the loopvectorizer assumes that certain operations
can be scalarized. For example, loads/stores from uniform pointers without
masking are scalarized, which is not possible for scalable vectors. For
these, use gather/scatter instructions instead until we've found a way to
properly widen these types.

void loop(int N, double *a, double *b) {
  #pragma clang loop vectorize_width(4, scalable)
  for (int i = 0; i < N; i++) {
    a[42] = b[i] + 1.0;   // uses llvm.masked.scatter for the store
  }   
}

Diff Detail

Event Timeline

sdesmalen created this revision.Oct 28 2020, 2:09 PM

Herald added a project: Restricted Project. · View Herald TranscriptOct 28 2020, 2:09 PM

Herald added subscribers: dexonsmith, hiraditya. · View Herald Transcript

sdesmalen requested review of this revision.Oct 28 2020, 2:09 PM

sdesmalen added a parent revision: D90343: [POC][LoopVectorizer] Vectorize a simple loop with a scalable VF..Oct 28 2020, 2:12 PM

Harbormaster completed remote builds in B76811: Diff 301424.Oct 28 2020, 2:38 PM

steleman added a subscriber: steleman.Oct 28 2020, 4:41 PM

khchen added a subscriber: khchen.Oct 28 2020, 5:45 PM

dancgr added a subscriber: dancgr.Nov 3 2020, 9:55 AM

ctetreau added a subscriber: ctetreau.Nov 5 2020, 2:21 PM

ctetreau added inline comments.

llvm/test/Transforms/LoopVectorize/scalable-loop-invariant-store-unpredicated-body-scalar-tail.ll
3	I suppose this is why you don't want to actually merge this currently? What happens if it's not aarch64?

ctetreau added a reviewer: ctetreau.Nov 5 2020, 2:22 PM

sdesmalen added inline comments.Nov 9 2020, 7:13 AM

llvm/test/Transforms/LoopVectorize/scalable-loop-invariant-store-unpredicated-body-scalar-tail.ll
3	The `REQUIRES: aarch64-registered-target` is actually unnecessary, not sure why I thought this was needed. This patch is probably simple enough to be reviewed as-is. The other POC patches I've split up into smaller NFC patches, but there is little to simplify here.

I have no objections to this, but you should probably get some more eyes on it.

ctetreau resigned from this revision.Feb 1 2021, 9:54 AM

This has since been superseded by other patches.

Revision Contents

Path

Size

llvm/

lib/

IR/

IRBuilder.cpp

18 lines

Transforms/

Vectorize/

LoopVectorize.cpp

22 lines

test/

Transforms/

LoopVectorize/

scalable-loop-invariant-store-unpredicated-body-scalar-tail.ll

68 lines

Diff 301424

llvm/lib/IR/IRBuilder.cpp

	Show First 20 Lines • Show All 522 Lines • ▼ Show 20 Lines
	/// \p Mask - vector of booleans which indicates what vector lanes should			/// \p Mask - vector of booleans which indicates what vector lanes should
	/// be accessed in memory			/// be accessed in memory
	/// \p PassThru - pass-through value that is used to fill the masked-off lanes			/// \p PassThru - pass-through value that is used to fill the masked-off lanes
	/// of the result			/// of the result
	/// \p Name - name of the result variable			/// \p Name - name of the result variable
	CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, Align Alignment,			CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, Align Alignment,
	Value *Mask, Value *PassThru,			Value *Mask, Value *PassThru,
	const Twine &Name) {			const Twine &Name) {
	auto *PtrsTy = cast<FixedVectorType>(Ptrs->getType());			auto *PtrsTy = cast<VectorType>(Ptrs->getType());
	auto *PtrTy = cast<PointerType>(PtrsTy->getElementType());			auto *PtrTy = cast<PointerType>(PtrsTy->getElementType());
	unsigned NumElts = PtrsTy->getNumElements();			ElementCount EC = PtrsTy->getElementCount();
	auto *DataTy = FixedVectorType::get(PtrTy->getElementType(), NumElts);			auto *DataTy = VectorType::get(PtrTy->getElementType(), EC);

	if (!Mask)			if (!Mask)
	Mask = Constant::getAllOnesValue(			Mask = Constant::getAllOnesValue(
	FixedVectorType::get(Type::getInt1Ty(Context), NumElts));			VectorType::get(Type::getInt1Ty(Context), EC));

	if (!PassThru)			if (!PassThru)
	PassThru = UndefValue::get(DataTy);			PassThru = UndefValue::get(DataTy);

	Type *OverloadedTypes[] = {DataTy, PtrsTy};			Type *OverloadedTypes[] = {DataTy, PtrsTy};
	Value *Ops[] = {Ptrs, getInt32(Alignment.value()), Mask, PassThru};			Value *Ops[] = {Ptrs, getInt32(Alignment.value()), Mask, PassThru};

	// We specify only one type when we create this intrinsic. Types of other			// We specify only one type when we create this intrinsic. Types of other
	// arguments are derived from this type.			// arguments are derived from this type.
	return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, OverloadedTypes,			return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, OverloadedTypes,
	Name);			Name);
	}			}

	/// Create a call to a Masked Scatter intrinsic.			/// Create a call to a Masked Scatter intrinsic.
	/// \p Data - data to be stored,			/// \p Data - data to be stored,
	/// \p Ptrs - the vector of pointers, where the \p Data elements should be			/// \p Ptrs - the vector of pointers, where the \p Data elements should be
	/// stored			/// stored
	/// \p Align - alignment for one element			/// \p Align - alignment for one element
	/// \p Mask - vector of booleans which indicates what vector lanes should			/// \p Mask - vector of booleans which indicates what vector lanes should
	/// be accessed in memory			/// be accessed in memory
	CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs,			CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs,
	Align Alignment, Value *Mask) {			Align Alignment, Value *Mask) {
	auto *PtrsTy = cast<FixedVectorType>(Ptrs->getType());			auto *PtrsTy = cast<VectorType>(Ptrs->getType());
	auto *DataTy = cast<FixedVectorType>(Data->getType());			auto *DataTy = cast<VectorType>(Data->getType());
	unsigned NumElts = PtrsTy->getNumElements();			ElementCount EC = PtrsTy->getElementCount();

	#ifndef NDEBUG			#ifndef NDEBUG
	auto PtrTy = cast<PointerType>(PtrsTy->getElementType());			auto PtrTy = cast<PointerType>(PtrsTy->getElementType());
	assert(NumElts == DataTy->getNumElements() &&			assert(EC == DataTy->getElementCount() &&
	PtrTy->getElementType() == DataTy->getElementType() &&			PtrTy->getElementType() == DataTy->getElementType() &&
	"Incompatible pointer and data types");			"Incompatible pointer and data types");
	#endif			#endif

	if (!Mask)			if (!Mask)
	Mask = Constant::getAllOnesValue(			Mask = Constant::getAllOnesValue(
	FixedVectorType::get(Type::getInt1Ty(Context), NumElts));			VectorType::get(Type::getInt1Ty(Context), EC));

	Type *OverloadedTypes[] = {DataTy, PtrsTy};			Type *OverloadedTypes[] = {DataTy, PtrsTy};
	Value *Ops[] = {Data, Ptrs, getInt32(Alignment.value()), Mask};			Value *Ops[] = {Data, Ptrs, getInt32(Alignment.value()), Mask};

	// We specify only one type when we create this intrinsic. Types of other			// We specify only one type when we create this intrinsic. Types of other
	// arguments are derived from this type.			// arguments are derived from this type.
	return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, OverloadedTypes);			return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, OverloadedTypes);
	}			}
	▲ Show 20 Lines • Show All 576 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 6,422 Lines • ▼ Show 20 Lines	for (Instruction &I : *BB) {
// Conditional loads and stores should be scalarized and predicated.		// Conditional loads and stores should be scalarized and predicated.
// isScalarWithPredication cannot be used here since masked		// isScalarWithPredication cannot be used here since masked
// gather/scatters are not considered scalar with predication.		// gather/scatters are not considered scalar with predication.
!Legal->blockNeedsPredication(I.getParent())) {		!Legal->blockNeedsPredication(I.getParent())) {
// TODO: Avoid replicating loads and stores instead of		// TODO: Avoid replicating loads and stores instead of
// relying on instcombine to remove them.		// relying on instcombine to remove them.
// Load: Scalar load + broadcast		// Load: Scalar load + broadcast
// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract		// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract

		// For Scalable vectors, use G/S instructions instead of a scalarised
		// store for now, until we add a mechanism to extract the last lane of
		// the vector. This is allowed, since llvm.masked.scatter is guaranteed
		// to be an ordered store (from first to last element).
		if (VF.isScalable())
		setWideningDecision(&I, VF, CM_GatherScatter,
		/*Cost=*/VF.getKnownMinValue());
		else {
unsigned Cost = getUniformMemOpCost(&I, VF);		unsigned Cost = getUniformMemOpCost(&I, VF);
setWideningDecision(&I, VF, CM_Scalarize, Cost);		setWideningDecision(&I, VF, CM_Scalarize, Cost);
		}
continue;		continue;
}		}

// We assume that widening is the best solution when possible.		// We assume that widening is the best solution when possible.
if (memoryInstructionCanBeWidened(&I, VF)) {		if (memoryInstructionCanBeWidened(&I, VF)) {
unsigned Cost = getConsecutiveMemOpCost(&I, VF);		unsigned Cost = getConsecutiveMemOpCost(&I, VF);
int ConsecutiveStride =		int ConsecutiveStride =
Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));		Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&		assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Expected consecutive stride.");		"Expected consecutive stride.");
InstWidening Decision =		InstWidening Decision =
ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;		ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
setWideningDecision(&I, VF, Decision, Cost);		setWideningDecision(&I, VF, Decision, Cost);
continue;		continue;
}		}

		// For Scalable vectors, all nodes must be widened using gather/scatter
		// or interleaved loads/stores. For now, default to gather/scatter.
		if (VF.isScalable()) {
		setWideningDecision(&I, VF, CM_GatherScatter,
		/*Cost=*/VF.getKnownMinValue());
		continue;
		}

// Choose between Interleaving, Gather/Scatter or Scalarization.		// Choose between Interleaving, Gather/Scatter or Scalarization.
unsigned InterleaveCost = std::numeric_limits<unsigned>::max();		unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
unsigned NumAccesses = 1;		unsigned NumAccesses = 1;
if (isAccessInterleaved(&I)) {		if (isAccessInterleaved(&I)) {
auto Group = getInterleavedAccessGroup(&I);		auto Group = getInterleavedAccessGroup(&I);
assert(Group && "Fail to get an interleaved access group.");		assert(Group && "Fail to get an interleaved access group.");

// Make one decision for the whole group.		// Make one decision for the whole group.
▲ Show 20 Lines • Show All 2,245 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/scalable-loop-invariant-store-unpredicated-body-scalar-tail.ll

This file was added.

				; For now this test requires aarch64-registered-target, until we can
				; also pass the loop hint as a 'force-vector-width' flag to opt.
				; REQUIRES: aarch64-registered-target
				ctetreauUnsubmitted Not Done Reply Inline Actions I suppose this is why you don't want to actually merge this currently? What happens if it's not aarch64? ctetreau: I suppose this is why you don't want to actually merge this currently? What happens if it's…
				sdesmalenAuthorUnsubmitted Done Reply Inline Actions The `REQUIRES: aarch64-registered-target` is actually unnecessary, not sure why I thought this was needed. This patch is probably simple enough to be reviewed as-is. The other POC patches I've split up into smaller NFC patches, but there is little to simplify here. sdesmalen: The `REQUIRES: aarch64-registered-target` is actually unnecessary, not sure why I thought this…

				; RUN: opt -S -loop-vectorize -instcombine < %s | FileCheck %s

				source_filename = "loop.c"
				target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
				target triple = "aarch64-unknown-linux-gnu"

				; This test has an invariant store to a[42], which it vectorizes using a masked.scatter operation.

				; CHECK: for.body.preheader:
				; CHECK-DAG: %wide.trip.count = zext i32 %N to i64
				; CHECK-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
				; CHECK-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
				; CHECK-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count

				; CHECK: vector.ph:
				; CHECK-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
				; CHECK-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
				; CHECK-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
				; CHECK: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf

				; CHECK: vector.body:
				; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				; CHECK: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
				; CHECK: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
				; CHECK: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
				; CHECK: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
				; CHECK: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 42
				; CHECK: %[[SPLATINSERT:.*]] = insertelement <vscale x 4 x double*> undef, double* %[[IDXA]], i32 0
				; CHECK: %[[SPLAT:.*]] = shufflevector <vscale x 4 x double*> %[[SPLATINSERT]], <vscale x 4 x double*> undef, <vscale x 4 x i32> zeroinitializer
				; CHECK: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> %[[FADD]], <vscale x 4 x double*> %[[SPLAT]], i32 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> undef, i1 true, i32 0), <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer))
				; CHECK: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
				; CHECK: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
				; CHECK: %index.next = add i64 %index, %[[VSCALEX4]]
				; CHECK: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
				; CHECK: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5

				define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
				entry:
				%cmp7 = icmp sgt i32 %N, 0
				br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

				for.body.preheader: ; preds = %entry
				%wide.trip.count = zext i32 %N to i64
				br label %for.body

				for.cond.cleanup: ; preds = %for.body, %entry
				ret void

				for.body: ; preds = %for.body.preheader, %for.body
				%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
				%arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
				%0 = load double, double* %arrayidx, align 8
				%add = fadd double %0, 1.000000e+00
				%arrayidx2 = getelementptr inbounds double, double* %a, i64 42
				store double %add, double* %arrayidx2, align 8
				%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
				%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
				br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
				}

				!1 = distinct !{!1, !2, !3}
				!2 = !{!"llvm.loop.vectorize.width", !4}
				!3 = !{!"llvm.loop.interleave.count", i32 1}
				!4 = !{i32 4, i1 1}