This is an archive of the discontinued LLVM Phabricator instance.

We define a generic intrinsic, argue about the name and the exact semantics, adding details to the language ref.
Add a target hook from the vectorizer to opt into using them for runtime checks.
Lower them generically to a series of compares and whatnot in DAG (this may be difficult depending on the exact semantics)
Under AArch64 we expand it to a whilewr and a csel last (I think), which can then hopefully be optimized to use b.last.

At least that is how I think I would expect it to work, with an intrinsic that accepts two pointers or integers of pointer size and produces an i1. The alternative would be to just match it in the backend. Unfortunately the semantics of whilewr don't seem super obvious. I think the b variant performs ((VL - 1) < zext(B) - zext(A)) | (zext(B) - zext(A)) > 0 for the last lane, which is a little odd for values where A+VL wraps around 0 and probably makes direct matching difficult.

We would also need to account for UF correctly, which might be possible using a different element size.

Thanks very much for the detailed idea.

Allen abandoned this revision. Sep 20 2023, 7:19 PM

Herald added a subscriber: wangpc. · View Herald Transcript · Sep 20 2023, 7:19 PM

Revision Contents

Path

Size

llvm/

include/

llvm/

Analysis/

TargetTransformInfo.h

11 lines

TargetTransformInfoImpl.h

3 lines

CodeGen/

BasicTTIImpl.h

4 lines

IR/

Intrinsics.td

5 lines

Transforms/

Utils/

LoopUtils.h

9 lines

lib/

Analysis/

TargetTransformInfo.cpp

5 lines

Target/

AArch64/

AArch64TargetTransformInfo.h

3 lines

AArch64TargetTransformInfo.cpp

24 lines

Transforms/

Utils/

LoopUtils.cpp

40 lines

Vectorize/

LoopVectorize.cpp

2 lines

test/

Transforms/

LoopVectorize/

AArch64/

sve2-runtime-check-size-based-threshold.ll

151 lines

Diff 477736

llvm/include/llvm/Analysis/TargetTransformInfo.h

Show First 20 Lines • Show All 1,539 Lines • ▼ Show 20 Lines

/// \returns How the target needs this vector-predicated operation to be		/// \returns How the target needs this vector-predicated operation to be
/// transformed.		/// transformed.
VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const;		VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const;
/// @}		/// @}

/// @}		/// @}

		/// Check whether the corresponding target-specific intrinsic is supported;
		/// returns zero when it is not supported.
		unsigned getTargetSupportedIntrinsic(const Intrinsic::ID IID, int VF = 0) const;

private:		private:
/// The abstract base class used to type erase specific TTI		/// The abstract base class used to type erase specific TTI
/// implementations.		/// implementations.
class Concept;		class Concept;

/// The template model for the base class which wraps a concrete		/// The template model for the base class which wraps a concrete
/// implementation in a type erased interface.		/// implementation in a type erased interface.
template <typename T> class Model;		template <typename T> class Model;
▲ Show 20 Lines • Show All 335 Lines • ▼ Show 20 Lines	public:
virtual unsigned getGISelRematGlobalCost() const = 0;		virtual unsigned getGISelRematGlobalCost() const = 0;
virtual unsigned getMinTripCountTailFoldingThreshold() const = 0;		virtual unsigned getMinTripCountTailFoldingThreshold() const = 0;
virtual bool enableScalableVectorization() const = 0;		virtual bool enableScalableVectorization() const = 0;
virtual bool supportsScalableVectors() const = 0;		virtual bool supportsScalableVectors() const = 0;
virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType,		virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
Align Alignment) const = 0;		Align Alignment) const = 0;
virtual VPLegalization		virtual VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;		getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
		virtual unsigned getTargetSupportedIntrinsic(const Intrinsic::ID IID,
		int VF = 0) const = 0;
};		};

template <typename T>		template <typename T>
class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {		class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
T Impl;		T Impl;

public:		public:
Model(T Impl) : Impl(std::move(Impl)) {}		Model(T Impl) : Impl(std::move(Impl)) {}
▲ Show 20 Lines • Show All 653 Lines • ▼ Show 20 Lines	bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
Align Alignment) const override {		Align Alignment) const override {
return Impl.hasActiveVectorLength(Opcode, DataType, Alignment);		return Impl.hasActiveVectorLength(Opcode, DataType, Alignment);
}		}

VPLegalization		VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const override {		getVPLegalizationStrategy(const VPIntrinsic &PI) const override {
return Impl.getVPLegalizationStrategy(PI);		return Impl.getVPLegalizationStrategy(PI);
}		}

		unsigned getTargetSupportedIntrinsic(const Intrinsic::ID IID,
		int VF) const override {
		return Impl.getTargetSupportedIntrinsic(IID, VF);
		}
};		};

template <typename T>		template <typename T>
TargetTransformInfo::TargetTransformInfo(T Impl)		TargetTransformInfo::TargetTransformInfo(T Impl)
: TTIImpl(new Model<T>(Impl)) {}		: TTIImpl(new Model<T>(Impl)) {}

/// Analysis pass providing the \c TargetTransformInfo.		/// Analysis pass providing the \c TargetTransformInfo.
///		///
▲ Show 20 Lines • Show All 94 Lines • Show Last 20 Lines

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Show First 20 Lines • Show All 847 Lines • ▼ Show 20 Lines	public:
}		}

TargetTransformInfo::VPLegalization		TargetTransformInfo::VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const {		getVPLegalizationStrategy(const VPIntrinsic &PI) const {
return TargetTransformInfo::VPLegalization(		return TargetTransformInfo::VPLegalization(
/* EVLParamStrategy */ TargetTransformInfo::VPLegalization::Discard,		/* EVLParamStrategy */ TargetTransformInfo::VPLegalization::Discard,
/* OperatorStrategy */ TargetTransformInfo::VPLegalization::Convert);		/* OperatorStrategy */ TargetTransformInfo::VPLegalization::Convert);
}		}
		unsigned getTargetSupportedIntrinsic(const Intrinsic::ID IID, int VF) const {
		return 0;
		}

protected:		protected:
// Obtain the minimum required size to hold the value (without the sign)		// Obtain the minimum required size to hold the value (without the sign)
// In case of a vector it returns the min required size for one element.		// In case of a vector it returns the min required size for one element.
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const {		unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const {
if (isa<ConstantDataVector>(Val) \|\| isa<ConstantVector>(Val)) {		if (isa<ConstantDataVector>(Val) \|\| isa<ConstantVector>(Val)) {
const auto *VectorValue = cast<Constant>(Val);		const auto *VectorValue = cast<Constant>(Val);

▲ Show 20 Lines • Show All 429 Lines • Show Last 20 Lines

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Show First 20 Lines • Show All 683 Lines • ▼ Show 20 Lines	public:
virtual bool enableWritePrefetching() const {		virtual bool enableWritePrefetching() const {
return getST()->enableWritePrefetching();		return getST()->enableWritePrefetching();
}		}

virtual bool shouldPrefetchAddressSpace(unsigned AS) const {		virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
return getST()->shouldPrefetchAddressSpace(AS);		return getST()->shouldPrefetchAddressSpace(AS);
}		}

		virtual unsigned getTargetSupportedIntrinsic(const Intrinsic::ID IID,
		int VF) const {
		return 0;
		}
/// @}		/// @}

/// \name Vector TTI Implementations		/// \name Vector TTI Implementations
/// @{		/// @{

TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {		TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
return TypeSize::getFixed(32);		return TypeSize::getFixed(32);
}		}
▲ Show 20 Lines • Show All 1,717 Lines • Show Last 20 Lines

llvm/include/llvm/IR/Intrinsics.td

	Show First 20 Lines • Show All 1,846 Lines • ▼ Show 20 Lines

	def int_masked_compressstore:			def int_masked_compressstore:
	DefaultAttrsIntrinsic<[],			DefaultAttrsIntrinsic<[],
	[llvm_anyvector_ty, LLVMPointerToElt<0>,			[llvm_anyvector_ty, LLVMPointerToElt<0>,
	LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],			LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
	[IntrWriteMem, IntrArgMemOnly, IntrWillReturn,			[IntrWriteMem, IntrArgMemOnly, IntrWillReturn,
	NoCapture<ArgIndex<1>>]>;			NoCapture<ArgIndex<1>>]>;

				// Test whether two pointers are free of write-after-read/write conflicts.
				def int_whilewr_test : DefaultAttrsIntrinsic<[llvm_i1_ty],
				[llvm_ptr_ty, llvm_ptr_ty],
				[IntrNoMem, IntrWillReturn, IntrSpeculatable]>;

	// Test whether a pointer is associated with a type metadata identifier.			// Test whether a pointer is associated with a type metadata identifier.
	def int_type_test : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty],			def int_type_test : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty],
	[IntrNoMem, IntrWillReturn, IntrSpeculatable]>;			[IntrNoMem, IntrWillReturn, IntrSpeculatable]>;

	// Safely loads a function pointer from a virtual table pointer using type metadata.			// Safely loads a function pointer from a virtual table pointer using type metadata.
	def int_type_checked_load : DefaultAttrsIntrinsic<[llvm_ptr_ty, llvm_i1_ty],			def int_type_checked_load : DefaultAttrsIntrinsic<[llvm_ptr_ty, llvm_i1_ty],
	[llvm_ptr_ty, llvm_i32_ty, llvm_metadata_ty],			[llvm_ptr_ty, llvm_i32_ty, llvm_metadata_ty],
	[IntrNoMem, IntrWillReturn]>;			[IntrNoMem, IntrWillReturn]>;
	▲ Show 20 Lines • Show All 296 Lines • Show Last 20 Lines

llvm/include/llvm/Transforms/Utils/LoopUtils.h

	Show First 20 Lines • Show All 502 Lines • ▼ Show 20 Lines

	/// Add code that checks at runtime if the accessed arrays in \p PointerChecks			/// Add code that checks at runtime if the accessed arrays in \p PointerChecks
	/// overlap. Returns the final comparator value or NULL if no check is needed.			/// overlap. Returns the final comparator value or NULL if no check is needed.
	Value *			Value *
	addRuntimeChecks(Instruction Loc, Loop TheLoop,			addRuntimeChecks(Instruction Loc, Loop TheLoop,
	const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,			const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
	SCEVExpander &Expander);			SCEVExpander &Expander);

	Value *addDiffRuntimeChecks(			Value *
	Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,			addDiffRuntimeChecks(Instruction *Loc, ArrayRef<PointerDiffInfo> Checks,
	function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC);			SCEVExpander &Expander,
				function_ref<Value *(IRBuilderBase &, unsigned)> GetVF,
				unsigned IC, const TargetTransformInfo *TTI,
				bool Scalable = 0);

	/// Struct to hold information about a partially invariant condition.			/// Struct to hold information about a partially invariant condition.
	struct IVConditionInfo {			struct IVConditionInfo {
	/// Instructions that need to be duplicated and checked for the unswitching			/// Instructions that need to be duplicated and checked for the unswitching
	/// condition.			/// condition.
	SmallVector<Instruction *> InstToDuplicate;			SmallVector<Instruction *> InstToDuplicate;

	/// Constant to indicate for which value the condition is invariant.			/// Constant to indicate for which value the condition is invariant.
	Show All 30 Lines

llvm/lib/Analysis/TargetTransformInfo.cpp

Show First 20 Lines • Show All 1,174 Lines • ▼ Show 20 Lines	bool TargetTransformInfo::enableScalableVectorization() const {
return TTIImpl->enableScalableVectorization();		return TTIImpl->enableScalableVectorization();
}		}

bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,		bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
Align Alignment) const {		Align Alignment) const {
return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);		return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
}		}

		unsigned TargetTransformInfo::getTargetSupportedIntrinsic(
		const Intrinsic::ID IID, int VF) const {
		return TTIImpl->getTargetSupportedIntrinsic(IID, VF);
		}

TargetTransformInfo::Concept::~Concept() = default;		TargetTransformInfo::Concept::~Concept() = default;

TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}		TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}

TargetIRAnalysis::TargetIRAnalysis(		TargetIRAnalysis::TargetIRAnalysis(
std::function<Result(const Function &)> TTICallback)		std::function<Result(const Function &)> TTICallback)
: TTICallback(std::move(TTICallback)) {}		: TTICallback(std::move(TTICallback)) {}

▲ Show 20 Lines • Show All 41 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Show First 20 Lines • Show All 380 Lines • ▼ Show 20 Lines	public:
/// Return the cost of the scaling factor used in the addressing		/// Return the cost of the scaling factor used in the addressing
/// mode represented by AM for this target, for a load/store		/// mode represented by AM for this target, for a load/store
/// of the specified type.		/// of the specified type.
/// If the AM is supported, the return value must be >= 0.		/// If the AM is supported, the return value must be >= 0.
/// If the AM is not supported, it returns a negative value.		/// If the AM is not supported, it returns a negative value.
InstructionCost getScalingFactorCost(Type Ty, GlobalValue BaseGV,		InstructionCost getScalingFactorCost(Type Ty, GlobalValue BaseGV,
int64_t BaseOffset, bool HasBaseReg,		int64_t BaseOffset, bool HasBaseReg,
int64_t Scale, unsigned AddrSpace) const;		int64_t Scale, unsigned AddrSpace) const;

		unsigned getTargetSupportedIntrinsic(const Intrinsic::ID IID,
		int VF = 0) const override;
/// @}		/// @}
};		};

} // end namespace llvm		} // end namespace llvm

#endif // LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H		#endif // LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Show First 20 Lines • Show All 3,229 Lines • ▼ Show 20 Lines	AArch64TTIImpl::getScalingFactorCost(Type Ty, GlobalValue BaseGV,
AM.HasBaseReg = HasBaseReg;		AM.HasBaseReg = HasBaseReg;
AM.Scale = Scale;		AM.Scale = Scale;
if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))		if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
// Scale represents reg2 * scale, thus account for 1 if		// Scale represents reg2 * scale, thus account for 1 if
// it is not equal to 0 or 1.		// it is not equal to 0 or 1.
return AM.Scale != 0 && AM.Scale != 1;		return AM.Scale != 0 && AM.Scale != 1;
return -1;		return -1;
}		}

		unsigned AArch64TTIImpl::getTargetSupportedIntrinsic(const Intrinsic::ID IID,
		int VF) const {
		switch (IID) {
		case Intrinsic::whilewr_test:
		if (!ST->hasSVE2())
		return 0;
		switch (VF) {
		case 2:
		return Intrinsic::aarch64_sve_whilewr_d;
		case 4:
		return Intrinsic::aarch64_sve_whilewr_s;
		case 8:
		return Intrinsic::aarch64_sve_whilewr_h;
		case 16:
		return Intrinsic::aarch64_sve_whilewr_b;
		default:
		return 0;
		}
		default:
		break;
		}
		return 0;
		}

llvm/lib/Transforms/Utils/LoopUtils.cpp

Show All 29 Lines
#include "llvm/Analysis/MemorySSAUpdater.h"		#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/ScalarEvolution.h"		#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"		#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"		#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/DIBuilder.h"		#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Dominators.h"		#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"		#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"		#include "llvm/IR/IntrinsicInst.h"
		#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/MDBuilder.h"		#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"		#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"		#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"		#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/ValueHandle.h"		#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"		#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"		#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"		#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"		#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"		#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"

using namespace llvm;		using namespace llvm;
using namespace llvm::PatternMatch;		using namespace llvm::PatternMatch;

#define DEBUG_TYPE "loop-utils"		#define DEBUG_TYPE "loop-utils"

static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced";		static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced";
static const char *LLVMLoopDisableLICM = "llvm.licm.disable";		static const char *LLVMLoopDisableLICM = "llvm.licm.disable";
		// This feature requires the backend to support the sve-whilewr instruction.
		static cl::opt<bool> SVEWhileWR("sve-whilewr", cl::Hidden,
		cl::init(true),
		cl::desc("Enable whilewr instruction"));


bool llvm::formDedicatedExitBlocks(Loop L, DominatorTree DT, LoopInfo *LI,		bool llvm::formDedicatedExitBlocks(Loop L, DominatorTree DT, LoopInfo *LI,
MemorySSAUpdater *MSSAU,		MemorySSAUpdater *MSSAU,
bool PreserveLCSSA) {		bool PreserveLCSSA) {
bool Changed = false;		bool Changed = false;

// We re-use a vector for the in-loop predecesosrs.		// We re-use a vector for the in-loop predecesosrs.
SmallVector<BasicBlock *, 4> InLoopPredecessors;		SmallVector<BasicBlock *, 4> InLoopPredecessors;
▲ Show 20 Lines • Show All 1,598 Lines • ▼ Show 20 Lines	for (const auto &Check : ExpandedChecks) {
MemoryRuntimeCheck = IsConflict;		MemoryRuntimeCheck = IsConflict;
}		}

return MemoryRuntimeCheck;		return MemoryRuntimeCheck;
}		}

Value *llvm::addDiffRuntimeChecks(		Value *llvm::addDiffRuntimeChecks(
Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,		Instruction *Loc, ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC) {		function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC,
		const TargetTransformInfo *TTI, bool Scalable) {

LLVMContext &Ctx = Loc->getContext();		LLVMContext &Ctx = Loc->getContext();
IRBuilder<InstSimplifyFolder> ChkBuilder(Ctx,		IRBuilder<InstSimplifyFolder> ChkBuilder(Ctx,
Loc->getModule()->getDataLayout());		Loc->getModule()->getDataLayout());
		auto &DL = Loc->getModule()->getDataLayout();
ChkBuilder.SetInsertPoint(Loc);		ChkBuilder.SetInsertPoint(Loc);
// Our instructions might fold to a constant.		// Our instructions might fold to a constant.
Value *MemoryRuntimeCheck = nullptr;		Value *MemoryRuntimeCheck = nullptr;

for (const auto &C : Checks) {		for (const auto &C : Checks) {
Type *Ty = C.SinkStart->getType();		Type *Ty = C.SinkStart->getType();
// Compute VF * IC * AccessSize.		// Compute VF * IC * AccessSize.
auto *VFTimesUFTimesSize =		auto *VFTimesUFTimesSize =
ChkBuilder.CreateMul(GetVF(ChkBuilder, Ty->getScalarSizeInBits()),		ChkBuilder.CreateMul(GetVF(ChkBuilder, Ty->getScalarSizeInBits()),
ConstantInt::get(Ty, IC * C.AccessSize));		ConstantInt::get(Ty, IC * C.AccessSize));
Value *Sink = Expander.expandCodeFor(C.SinkStart, Ty, Loc);		Value *Sink = Expander.expandCodeFor(C.SinkStart, Ty, Loc);
Value *Src = Expander.expandCodeFor(C.SrcStart, Ty, Loc);		Value *Src = Expander.expandCodeFor(C.SrcStart, Ty, Loc);
if (C.NeedsFreeze) {		if (C.NeedsFreeze) {
IRBuilder<> Builder(Loc);		IRBuilder<> Builder(Loc);
Sink = Builder.CreateFreeze(Sink, Sink->getName() + ".fr");		Sink = Builder.CreateFreeze(Sink, Sink->getName() + ".fr");
Src = Builder.CreateFreeze(Src, Src->getName() + ".fr");		Src = Builder.CreateFreeze(Src, Src->getName() + ".fr");
}		}
Value *Diff = ChkBuilder.CreateSub(Sink, Src);		Value *Diff = ChkBuilder.CreateSub(Sink, Src);
Value *IsConflict =		Value *IsConflict;

		auto *SinkCI = dyn_cast<CastInst>(Sink);
		auto *SrcCI = dyn_cast<CastInst>(Src);
		Intrinsic::ID TargetIID =
		TTI->getTargetSupportedIntrinsic(Intrinsic::whilewr_test, C.AccessSize);
		if (SVEWhileWR && Scalable == 1 && SinkCI && SrcCI &&
		SinkCI->getOpcode() == Instruction::PtrToInt &&
		SrcCI->getOpcode() == Instruction::PtrToInt && TargetIID) {
		ElementCount VF = ElementCount::get(C.AccessSize, true);
		auto *M = ChkBuilder.GetInsertBlock()->getModule();
		Type *BoolVecTy = VectorType::get(ChkBuilder.getInt1Ty(), VF);
		Type *Ptr = PointerType::get(ChkBuilder.getInt32Ty(), 0);
		Function *ActiveMaskFunc =
		Intrinsic::getDeclaration(M, TargetIID, {BoolVecTy, Ptr});
		Value *Pred = ChkBuilder.CreateCall(
		ActiveMaskFunc, {SinkCI->getOperand(0), SrcCI->getOperand(0)});
		// %vscale = call i64 @llvm.vscale.i64()
		// %shl = shl nuw nsw i64 %vscale, 3
		// %idx = add nuw nsw i64 %shl, -1
		// %bit = extractelement <vscale x 8 x i1> %a, i64 %idx
		auto *VFSize = GetVF(ChkBuilder, C.AccessSize);
		Value *LastIdx = ChkBuilder.CreateSub(VFSize, ConstantInt::get(Ty, 1));
		IsConflict = ChkBuilder.CreateExtractElement(Pred, LastIdx, "LastElt");
		} else {
		IsConflict =
ChkBuilder.CreateICmpULT(Diff, VFTimesUFTimesSize, "diff.check");		ChkBuilder.CreateICmpULT(Diff, VFTimesUFTimesSize, "diff.check");
		}

if (MemoryRuntimeCheck) {		if (MemoryRuntimeCheck) {
IsConflict =		IsConflict =
ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");		ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
}		}
MemoryRuntimeCheck = IsConflict;		MemoryRuntimeCheck = IsConflict;
}		}

▲ Show 20 Lines • Show All 176 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,928 Lines • ▼ Show 20 Lines	if (RtPtrChecking.Need) {
Value *RuntimeVF = nullptr;		Value *RuntimeVF = nullptr;
MemRuntimeCheckCond = addDiffRuntimeChecks(		MemRuntimeCheckCond = addDiffRuntimeChecks(
MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,		MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
[VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {		[VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
if (!RuntimeVF)		if (!RuntimeVF)
RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);		RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
return RuntimeVF;		return RuntimeVF;
},		},
IC);		IC, TTI, VF.isScalable());
} else {		} else {
MemRuntimeCheckCond =		MemRuntimeCheckCond =
addRuntimeChecks(MemCheckBlock->getTerminator(), L,		addRuntimeChecks(MemCheckBlock->getTerminator(), L,
RtPtrChecking.getChecks(), MemCheckExp);		RtPtrChecking.getChecks(), MemCheckExp);
}		}
assert(MemRuntimeCheckCond &&		assert(MemRuntimeCheckCond &&
"no RT checks generated although RtPtrChecking "		"no RT checks generated although RtPtrChecking "
"claimed checks are required");		"claimed checks are required");
▲ Show 20 Lines • Show All 8,722 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/AArch64/sve2-runtime-check-size-based-threshold.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt -passes=loop-vectorize -mattr=+sve2 -prefer-predicate-over-epilogue=scalar-epilogue -S %s \| FileCheck %s

				target triple = "aarch64-unknown-linux-gnu"

				; Test case where the minimum profitable trip count due to runtime checks
				; exceeds VF.getKnownMinValue() * UF.
				; FIXME: The code currently incorrectly is missing a umax(VF * UF, 28).
				define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr %src.1, ptr %src.2, i64 %n) {
				; CHECK-LABEL: @min_trip_count_due_to_runtime_checks_1(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[SRC_25:%.]] = ptrtoint ptr [[SRC_2:%.]] to i64
				; CHECK-NEXT: [[SRC_13:%.]] = ptrtoint ptr [[SRC_1:%.]] to i64
				; CHECK-NEXT: [[DST_12:%.]] = ptrtoint ptr [[DST_1:%.]] to i64
				; CHECK-NEXT: [[DST_21:%.]] = ptrtoint ptr [[DST_2:%.]] to i64
				; CHECK-NEXT: [[UMAX:%.]] = call i64 @llvm.umax.i64(i64 [[N:%.]], i64 1)
				; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
				; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
				; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 40, i64 [[TMP1]])
				; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], [[TMP2]]
				; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.]], label [[VECTOR_MEMCHECK:%.]]
				; CHECK: vector.memcheck:
				; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
				; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
				; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
				; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[DST_21]], [[DST_12]]
				; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.whilewr.h.nxv8i1.p0(ptr [[DST_2]], ptr [[DST_1]])
				; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP4]], 1
				; CHECK-NEXT: [[LASTELT:%.*]] = extractelement <vscale x 8 x i1> [[TMP7]], i64 [[TMP8]]
				; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP4]], 16
				; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[DST_12]], [[SRC_13]]
				; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.whilewr.h.nxv8i1.p0(ptr [[DST_1]], ptr [[SRC_1]])
				; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[TMP4]], 1
				; CHECK-NEXT: [[LASTELT4:%.*]] = extractelement <vscale x 8 x i1> [[TMP11]], i64 [[TMP12]]
				; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[LASTELT]], [[LASTELT4]]
				; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP4]], 16
				; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[DST_12]], [[SRC_25]]
				; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.whilewr.h.nxv8i1.p0(ptr [[DST_1]], ptr [[SRC_2]])
				; CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP4]], 1
				; CHECK-NEXT: [[LASTELT6:%.*]] = extractelement <vscale x 8 x i1> [[TMP15]], i64 [[TMP16]]
				; CHECK-NEXT: [[CONFLICT_RDX7:%.*]] = or i1 [[CONFLICT_RDX]], [[LASTELT6]]
				; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP4]], 16
				; CHECK-NEXT: [[TMP18:%.*]] = sub i64 [[DST_21]], [[SRC_13]]
				; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.whilewr.h.nxv8i1.p0(ptr [[DST_2]], ptr [[SRC_1]])
				; CHECK-NEXT: [[TMP20:%.*]] = sub i64 [[TMP4]], 1
				; CHECK-NEXT: [[LASTELT8:%.*]] = extractelement <vscale x 8 x i1> [[TMP19]], i64 [[TMP20]]
				; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX7]], [[LASTELT8]]
				; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP4]], 16
				; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[DST_21]], [[SRC_25]]
				; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.whilewr.h.nxv8i1.p0(ptr [[DST_2]], ptr [[SRC_2]])
				; CHECK-NEXT: [[TMP24:%.*]] = sub i64 [[TMP4]], 1
				; CHECK-NEXT: [[LASTELT10:%.*]] = extractelement <vscale x 8 x i1> [[TMP23]], i64 [[TMP24]]
				; CHECK-NEXT: [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX9]], [[LASTELT10]]
				; CHECK-NEXT: br i1 [[CONFLICT_RDX11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
				; CHECK: vector.ph:
				; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
				; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4
				; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], [[TMP26]]
				; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 0
				; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
				; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 2
				; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[TMP29]], 0
				; CHECK-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 1
				; CHECK-NEXT: [[TMP32:%.*]] = add i64 [[INDEX]], [[TMP31]]
				; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i64, ptr [[SRC_1]], i64 [[TMP27]]
				; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i64, ptr [[SRC_1]], i64 [[TMP32]]
				; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[TMP27]]
				; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[TMP32]]
				; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i64, ptr [[TMP33]], i32 0
				; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP37]], align 4
				; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.vscale.i32()
				; CHECK-NEXT: [[TMP39:%.*]] = mul i32 [[TMP38]], 2
				; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i64, ptr [[TMP33]], i32 [[TMP39]]
				; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 2 x i64>, ptr [[TMP40]], align 4
				; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i64, ptr [[TMP35]], i32 0
				; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <vscale x 2 x i64>, ptr [[TMP41]], align 4
				; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vscale.i32()
				; CHECK-NEXT: [[TMP43:%.*]] = mul i32 [[TMP42]], 2
				; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i64, ptr [[TMP35]], i32 [[TMP43]]
				; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 2 x i64>, ptr [[TMP44]], align 4
				; CHECK-NEXT: [[TMP45:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD13]]
				; CHECK-NEXT: [[TMP46:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD12]], [[WIDE_LOAD14]]
				; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i64, ptr [[DST_1]], i64 [[TMP27]]
				; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i64, ptr [[DST_1]], i64 [[TMP32]]
				; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[TMP27]]
				; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[TMP32]]
				; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i64, ptr [[TMP47]], i32 0
				; CHECK-NEXT: store <vscale x 2 x i64> [[TMP45]], ptr [[TMP51]], align 4
				; CHECK-NEXT: [[TMP52:%.*]] = call i32 @llvm.vscale.i32()
				; CHECK-NEXT: [[TMP53:%.*]] = mul i32 [[TMP52]], 2
				; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i64, ptr [[TMP47]], i32 [[TMP53]]
				; CHECK-NEXT: store <vscale x 2 x i64> [[TMP46]], ptr [[TMP54]], align 4
				; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i64, ptr [[TMP49]], i32 0
				; CHECK-NEXT: store <vscale x 2 x i64> [[TMP45]], ptr [[TMP55]], align 4
				; CHECK-NEXT: [[TMP56:%.*]] = call i32 @llvm.vscale.i32()
				; CHECK-NEXT: [[TMP57:%.*]] = mul i32 [[TMP56]], 2
				; CHECK-NEXT: [[TMP58:%.*]] = getelementptr i64, ptr [[TMP49]], i32 [[TMP57]]
				; CHECK-NEXT: store <vscale x 2 x i64> [[TMP46]], ptr [[TMP58]], align 4
				; CHECK-NEXT: [[TMP59:%.*]] = call i64 @llvm.vscale.i64()
				; CHECK-NEXT: [[TMP60:%.*]] = mul i64 [[TMP59]], 4
				; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP60]]
				; CHECK-NEXT: [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
				; CHECK-NEXT: br i1 [[TMP61]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
				; CHECK: middle.block:
				; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
				; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
				; CHECK: scalar.ph:
				; CHECK-NEXT: [[BC_RESUME_VAL:%.]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.]] ], [ 0, [[VECTOR_MEMCHECK]] ]
				; CHECK-NEXT: br label [[LOOP:%.*]]
				; CHECK: loop:
				; CHECK-NEXT: [[IV:%.]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.]], [[LOOP]] ]
				; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i64, ptr [[SRC_1]], i64 [[IV]]
				; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[IV]]
				; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 4
				; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[GEP_SRC_2]], align 4
				; CHECK-NEXT: [[ADD:%.*]] = add i64 [[L_1]], [[L_2]]
				; CHECK-NEXT: [[GEP_DST_1:%.*]] = getelementptr i64, ptr [[DST_1]], i64 [[IV]]
				; CHECK-NEXT: [[GEP_DST_2:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[IV]]
				; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP_DST_1]], align 4
				; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP_DST_2]], align 4
				; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
				; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]]
				; CHECK-NEXT: br i1 [[CMP10]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
				; CHECK: exit:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %loop

				loop:
				%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
				%gep.src.1 = getelementptr i64, ptr %src.1, i64 %iv
				%gep.src.2 = getelementptr i64, ptr %src.2, i64 %iv
				%l.1 = load i64, ptr %gep.src.1
				%l.2 = load i64, ptr %gep.src.2
				%add = add i64 %l.1, %l.2
				%gep.dst.1 = getelementptr i64, ptr %dst.1, i64 %iv
				%gep.dst.2 = getelementptr i64, ptr %dst.2, i64 %iv
				store i64 %add, ptr %gep.dst.1
				store i64 %add, ptr %gep.dst.2
				%iv.next = add nsw i64 %iv, 1
				%cmp10 = icmp ult i64 %iv.next, %n
				br i1 %cmp10, label %loop, label %exit

				exit:
				ret void
				}

This is an archive of the discontinued LLVM Phabricator instance.

[SVE][AArch64] Use WHILEWR to check write-after-read conflicts · Abandoned · Public

Details

Diff Detail

Unit Tests: Failed

Event Timeline

Revision Contents

Diff 477736

llvm/include/llvm/Analysis/TargetTransformInfo.h

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

llvm/include/llvm/CodeGen/BasicTTIImpl.h

llvm/include/llvm/IR/Intrinsics.td

llvm/include/llvm/Transforms/Utils/LoopUtils.h

llvm/lib/Analysis/TargetTransformInfo.cpp

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

llvm/lib/Transforms/Utils/LoopUtils.cpp

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/test/Transforms/LoopVectorize/AArch64/sve2-runtime-check-size-based-threshold.ll

[SVE][AArch64] Use WHILEWR to check write-after-read conflicts
Abandoned · Public