Diff 310485

llvm/include/llvm/IR/IRBuilder.h

Show First 20 Lines • Show All 873 Lines • ▼ Show 20 Lines	#endif
/// Create a call to the experimental.gc.relocate intrinsics to		/// Create a call to the experimental.gc.relocate intrinsics to
/// project the relocated value of one pointer from the statepoint.		/// project the relocated value of one pointer from the statepoint.
CallInst CreateGCRelocate(Instruction Statepoint,		CallInst CreateGCRelocate(Instruction Statepoint,
int BaseOffset,		int BaseOffset,
int DerivedOffset,		int DerivedOffset,
Type *ResultType,		Type *ResultType,
const Twine &Name = "");		const Twine &Name = "");

		/// Create a call to llvm.vscale, multiplied by \p Scaling. The type of VScale
		/// will be the same type as that of \p Scaling.
		///
		/// \p Scaling is expected to be a ConstantInt. When it is equal to 1 the
		/// plain llvm.vscale call is returned without emitting a multiply.
		Value *CreateVScale(Constant *Scaling, const Twine &Name = "");

/// Create a call to intrinsic \p ID with 1 operand which is mangled on its		/// Create a call to intrinsic \p ID with 1 operand which is mangled on its
/// type.		/// type.
CallInst CreateUnaryIntrinsic(Intrinsic::ID ID, Value V,		CallInst CreateUnaryIntrinsic(Intrinsic::ID ID, Value V,
Instruction *FMFSource = nullptr,		Instruction *FMFSource = nullptr,
const Twine &Name = "");		const Twine &Name = "");

/// Create a call to intrinsic \p ID with 2 operands which is mangled on the		/// Create a call to intrinsic \p ID with 2 operands which is mangled on the
/// first type.		/// first type.
▲ Show 20 Lines • Show All 1,769 Lines • Show Last 20 Lines

llvm/lib/IR/IRBuilder.cpp

Show First 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	static CallInst createCallHelper(Function Callee, ArrayRef<Value *> Ops,
Instruction *FMFSource = nullptr,		Instruction *FMFSource = nullptr,
ArrayRef<OperandBundleDef> OpBundles = {}) {		ArrayRef<OperandBundleDef> OpBundles = {}) {
CallInst *CI = Builder->CreateCall(Callee, Ops, OpBundles, Name);		CallInst *CI = Builder->CreateCall(Callee, Ops, OpBundles, Name);
if (FMFSource)		if (FMFSource)
CI->copyFastMathFlags(FMFSource);		CI->copyFastMathFlags(FMFSource);
return CI;		return CI;
}		}

		Value IRBuilderBase::CreateVScale(Constant Scaling, const Twine &Name) {
		Module *M = GetInsertBlock()->getParent()->getParent();
		assert(isa<ConstantInt>(Scaling) && "Expected constant integer");
		dmgreen (unsubmitted, done): This could do with a quick clang-format.
		Function *TheFn =
		Intrinsic::getDeclaration(M, Intrinsic::vscale, {Scaling->getType()});
		CallInst *CI = createCallHelper(TheFn, {}, this, Name);
		return cast<ConstantInt>(Scaling)->getSExtValue() == 1
		? CI
		: CreateMul(CI, Scaling);
		}

CallInst IRBuilderBase::CreateMemSet(Value Ptr, Value Val, Value Size,		CallInst IRBuilderBase::CreateMemSet(Value Ptr, Value Val, Value Size,
MaybeAlign Align, bool isVolatile,		MaybeAlign Align, bool isVolatile,
MDNode TBAATag, MDNode ScopeTag,		MDNode TBAATag, MDNode ScopeTag,
MDNode *NoAliasTag) {		MDNode *NoAliasTag) {
Ptr = getCastedInt8PtrValue(Ptr);		Ptr = getCastedInt8PtrValue(Ptr);
Value *Ops[] = {Ptr, Val, Size, getInt1(isVolatile)};		Value *Ops[] = {Ptr, Val, Size, getInt1(isVolatile)};
Type *Tys[] = { Ptr->getType(), Size->getType() };		Type *Tys[] = { Ptr->getType(), Size->getType() };
Module *M = BB->getParent()->getParent();		Module *M = BB->getParent()->getParent();
▲ Show 20 Lines • Show All 1,056 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,115 Lines • ▼ Show 20 Lines	if (I->getDebugLoc())
DL = I->getDebugLoc();		DL = I->getDebugLoc();
}		}

OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);		OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
R << "loop not vectorized: ";		R << "loop not vectorized: ";
return R;		return R;
}		}

		/// Return a value for \p Step multiplied by \p VF.
		///
		/// \p Step must be a ConstantInt. It is first multiplied by the known
		/// minimum element count of \p VF. For a fixed-width \p VF that product is
		/// returned directly as a Constant; for a scalable \p VF the product is
		/// passed to IRBuilder::CreateVScale on \p B, so the result is a value
		/// computed at runtime.
		static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
		  assert(isa<ConstantInt>(Step) && "Expected an integer step");
		  // Fold the compile-time part (Step * known-min VF) into a single constant.
		  Constant *StepVal = ConstantInt::get(
		      Step->getType(),
		      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
		  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
		}

namespace llvm {		namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,		void reportVectorizationFailure(const StringRef DebugMsg,
const StringRef OREMsg, const StringRef ORETag,		const StringRef OREMsg, const StringRef ORETag,
OptimizationRemarkEmitter ORE, Loop TheLoop, Instruction *I) {		OptimizationRemarkEmitter ORE, Loop TheLoop, Instruction *I) {
LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));		LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
LoopVectorizeHints Hints(TheLoop, true /* doesn't matter /, ORE);		LoopVectorizeHints Hints(TheLoop, true /* doesn't matter /, ORE);
ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),		ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
▲ Show 20 Lines • Show All 1,140 Lines • ▼ Show 20 Lines	Value InnerLoopVectorizer::getStepVector(Value Val, int StartIdx, Value *Step,
return BOp;		return BOp;
}		}

void InnerLoopVectorizer::buildScalarSteps(Value ScalarIV, Value Step,		void InnerLoopVectorizer::buildScalarSteps(Value ScalarIV, Value Step,
Instruction *EntryVal,		Instruction *EntryVal,
const InductionDescriptor &ID) {		const InductionDescriptor &ID) {
// We shouldn't have to build scalar steps if we aren't vectorizing.		// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF.isVector() && "VF should be greater than one");		assert(VF.isVector() && "VF should be greater than one");
assert(!VF.isScalable() &&
"the code below assumes a fixed number of elements at compile time");
// Get the value type and ensure it and the step have the same integer type.		// Get the value type and ensure it and the step have the same integer type.
Type *ScalarIVTy = ScalarIV->getType()->getScalarType();		Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
assert(ScalarIVTy == Step->getType() &&		assert(ScalarIVTy == Step->getType() &&
"Val and Step should have the same type");		"Val and Step should have the same type");

// We build scalar steps for both integer and floating-point induction		// We build scalar steps for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.		// variables. Here, we determine the kind of arithmetic we will perform.
Instruction::BinaryOps AddOp;		Instruction::BinaryOps AddOp;
Instruction::BinaryOps MulOp;		Instruction::BinaryOps MulOp;
if (ScalarIVTy->isIntegerTy()) {		if (ScalarIVTy->isIntegerTy()) {
AddOp = Instruction::Add;		AddOp = Instruction::Add;
MulOp = Instruction::Mul;		MulOp = Instruction::Mul;
} else {		} else {
AddOp = ID.getInductionOpcode();		AddOp = ID.getInductionOpcode();
MulOp = Instruction::FMul;		MulOp = Instruction::FMul;
}		}

// Determine the number of scalars we need to generate for each unroll		// Determine the number of scalars we need to generate for each unroll
// iteration. If EntryVal is uniform, we only need to generate the first		// iteration. If EntryVal is uniform, we only need to generate the first
// lane. Otherwise, we generate all VF values.		// lane. Otherwise, we generate all VF values.
unsigned Lanes =		unsigned Lanes =
Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)		Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
? 1		? 1
: VF.getKnownMinValue();		: VF.getKnownMinValue();
		assert((!VF.isScalable() \|\| Lanes == 1) &&
		"Should never scalarize a scalable vector");
// Compute the scalar steps and save the results in VectorLoopValueMap.		// Compute the scalar steps and save the results in VectorLoopValueMap.
for (unsigned Part = 0; Part < UF; ++Part) {		for (unsigned Part = 0; Part < UF; ++Part) {
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {		for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
auto *StartIdx = getSignedIntOrFpConstant(		auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
ScalarIVTy, VF.getKnownMinValue() * Part + Lane);		ScalarIVTy->getScalarSizeInBits());
		Value *StartIdx =
		createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
		if (ScalarIVTy->isFloatingPointTy())
		StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
		StartIdx = addFastMathFlag(Builder.CreateBinOp(
		AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
		// The step returned by `createStepForVF` is a runtime-evaluated value
		// when VF is scalable. Otherwise, it should be folded into a Constant.
		assert((VF.isScalable() \|\| isa<Constant>(StartIdx)) &&
		"Expected StartIdx to be folded to a constant when VF is not "
		"scalable");
auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));		auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
		dmgreen (unsubmitted, done): Can you add a comment saying this has to potentially construct a scalable step, and that otherwise it should be folded to a constant?
auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));		auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);		VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);		recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
}		}
}		}
}		}

Value InnerLoopVectorizer::getOrCreateVectorValue(Value V, unsigned Part) {		Value InnerLoopVectorizer::getOrCreateVectorValue(Value V, unsigned Part) {
Show All 25 Lines	if (VF.isScalar()) {
VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);		VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
return ScalarValue;		return ScalarValue;
}		}

// Get the last scalar instruction we generated for V and Part. If the value		// Get the last scalar instruction we generated for V and Part. If the value
// is known to be uniform after vectorization, this corresponds to lane zero		// is known to be uniform after vectorization, this corresponds to lane zero
// of the Part unroll iteration. Otherwise, the last instruction is the one		// of the Part unroll iteration. Otherwise, the last instruction is the one
// we created for the last vector lane of the Part unroll iteration.		// we created for the last vector lane of the Part unroll iteration.
assert(!VF.isScalable() && "scalable vectors not yet supported.");
unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)		unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
? 0		? 0
: VF.getKnownMinValue() - 1;		: VF.getKnownMinValue() - 1;
		assert((!VF.isScalable() \|\| LastLane == 0) &&
		"Scalable vectorization can't lead to any scalarized values.");
auto *LastInst = cast<Instruction>(		auto *LastInst = cast<Instruction>(
VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));		VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

// Set the insert point after the last scalarized instruction. This ensures		// Set the insert point after the last scalarized instruction. This ensures
// the insertelement sequence will directly follow the scalar definitions.		// the insertelement sequence will directly follow the scalar definitions.
auto OldIP = Builder.saveIP();		auto OldIP = Builder.saveIP();
auto NewIP = std::next(BasicBlock::iterator(LastInst));		auto NewIP = std::next(BasicBlock::iterator(LastInst));
Builder.SetInsertPoint(&*NewIP);		Builder.SetInsertPoint(&*NewIP);
▲ Show 20 Lines • Show All 325 Lines • ▼ Show 20 Lines	LoopVectorizationCostModel::InstWidening Decision =
Cost->getWideningDecision(Instr, VF);		Cost->getWideningDecision(Instr, VF);
assert((Decision == LoopVectorizationCostModel::CM_Widen \|\|		assert((Decision == LoopVectorizationCostModel::CM_Widen \|\|
Decision == LoopVectorizationCostModel::CM_Widen_Reverse \|\|		Decision == LoopVectorizationCostModel::CM_Widen_Reverse \|\|
Decision == LoopVectorizationCostModel::CM_GatherScatter) &&		Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
"CM decision is not to widen the memory instruction");		"CM decision is not to widen the memory instruction");

Type *ScalarDataTy = getMemInstValueType(Instr);		Type *ScalarDataTy = getMemInstValueType(Instr);

assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto *DataTy = VectorType::get(ScalarDataTy, VF);		auto *DataTy = VectorType::get(ScalarDataTy, VF);
const Align Alignment = getLoadStoreAlignment(Instr);		const Align Alignment = getLoadStoreAlignment(Instr);

// Determine if the pointer operand of the access is either consecutive or		// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.		// reverse consecutive.
bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);		bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
bool ConsecutiveStride =		bool ConsecutiveStride =
Reverse \|\| (Decision == LoopVectorizationCostModel::CM_Widen);		Reverse \|\| (Decision == LoopVectorizationCostModel::CM_Widen);
Show All 16 Lines	const auto CreateVecPtr = [&](unsigned Part, Value Ptr) -> Value {
// Calculate the pointer for the specific unroll-part.		// Calculate the pointer for the specific unroll-part.
GetElementPtrInst *PartPtr = nullptr;		GetElementPtrInst *PartPtr = nullptr;

bool InBounds = false;		bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))		if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
InBounds = gep->isInBounds();		InBounds = gep->isInBounds();

if (Reverse) {		if (Reverse) {
		assert(!VF.isScalable() &&
		"Reversing vectors is not yet supported for scalable vectors.");

// If the address is consecutive but reversed, then the		// If the address is consecutive but reversed, then the
// wide store needs to start at the last vector element.		// wide store needs to start at the last vector element.
PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(		PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));		ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
PartPtr->setIsInBounds(InBounds);		PartPtr->setIsInBounds(InBounds);
PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(		PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));		ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
PartPtr->setIsInBounds(InBounds);		PartPtr->setIsInBounds(InBounds);
if (isMaskRequired) // Reverse of a null all-one mask is a null mask.		if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);		BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
} else {		} else {
PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(		Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));		PartPtr = cast<GetElementPtrInst>(
		Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
PartPtr->setIsInBounds(InBounds);		PartPtr->setIsInBounds(InBounds);
}		}

unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();		unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));		return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
};		};

// Handle Stores:		// Handle Stores:
▲ Show 20 Lines • Show All 188 Lines • ▼ Show 20 Lines	Value InnerLoopVectorizer::getOrCreateVectorTripCount(Loop L) {
if (VectorTripCount)		if (VectorTripCount)
return VectorTripCount;		return VectorTripCount;

Value *TC = getOrCreateTripCount(L);		Value *TC = getOrCreateTripCount(L);
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());		IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

Type *Ty = TC->getType();		Type *Ty = TC->getType();
// This is where we can make the step a runtime constant.		// This is where we can make the step a runtime constant.
assert(!VF.isScalable() && "scalable vectorization is not supported yet");		Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
Constant Step = ConstantInt::get(Ty, VF.getKnownMinValue() UF);

// If the tail is to be folded by masking, round the number of iterations N		// If the tail is to be folded by masking, round the number of iterations N
// up to a multiple of Step instead of rounding down. This is done by first		// up to a multiple of Step instead of rounding down. This is done by first
// adding Step-1 and then rounding down. Note that it's ok if this addition		// adding Step-1 and then rounding down. Note that it's ok if this addition
// overflows: the vector induction variable will eventually wrap to zero given		// overflows: the vector induction variable will eventually wrap to zero given
// that it starts at zero and its Step is a power of two; the loop will then		// that it starts at zero and its Step is a power of two; the loop will then
// exit, with the last early-exit vector comparison also producing all-true.		// exit, with the last early-exit vector comparison also producing all-true.
if (Cost->foldTailByMasking()) {		if (Cost->foldTailByMasking()) {
assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&		assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
"VF*UF must be a power of 2 when folding tail by masking");		"VF*UF must be a power of 2 when folding tail by masking");
		assert(!VF.isScalable() &&
		"Tail folding not yet supported for scalable vectors");
TC = Builder.CreateAdd(		TC = Builder.CreateAdd(
TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");		TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
}		}

// Now we need to generate the expression for the part of the loop that the		// Now we need to generate the expression for the part of the loop that the
// vectorized body will execute. This is equal to N - (N % Step) if scalar		// vectorized body will execute. This is equal to N - (N % Step) if scalar
// iterations are not required for correctness, or N - Step, otherwise. Step		// iterations are not required for correctness, or N - Step, otherwise. Step
// is equal to the vectorization factor (number of SIMD elements) times the		// is equal to the vectorization factor (number of SIMD elements) times the
▲ Show 20 Lines • Show All 62 Lines • ▼ Show 20 Lines	void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
// to the backedge-taken count overflowed leading to an incorrect trip count		// to the backedge-taken count overflowed leading to an incorrect trip count
// of zero. In this case we will also jump to the scalar loop.		// of zero. In this case we will also jump to the scalar loop.
auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE		auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
: ICmpInst::ICMP_ULT;		: ICmpInst::ICMP_ULT;

// If tail is to be folded, vector loop takes care of all iterations.		// If tail is to be folded, vector loop takes care of all iterations.
Value *CheckMinIters = Builder.getFalse();		Value *CheckMinIters = Builder.getFalse();
if (!Cost->foldTailByMasking()) {		if (!Cost->foldTailByMasking()) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");		Value *Step =
CheckMinIters = Builder.CreateICmp(		createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
P, Count,		CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
"min.iters.check");
}		}
// Create new preheader for vector loop.		// Create new preheader for vector loop.
LoopVectorPreHeader =		LoopVectorPreHeader =
SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,		SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
"vector.ph");		"vector.ph");

assert(DT->properlyDominates(DT->getNode(TCCheckBlock),		assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
DT->getNode(Bypass)->getIDom()) &&		DT->getNode(Bypass)->getIDom()) &&
▲ Show 20 Lines • Show All 462 Lines • ▼ Show 20 Lines	BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// - counts from zero, stepping by one		// - counts from zero, stepping by one
// - is the size of the widest induction variable type		// - is the size of the widest induction variable type
// then we create a new one.		// then we create a new one.
OldInduction = Legal->getPrimaryInduction();		OldInduction = Legal->getPrimaryInduction();
Type *IdxTy = Legal->getWidestInductionType();		Type *IdxTy = Legal->getWidestInductionType();
Value *StartIdx = ConstantInt::get(IdxTy, 0);		Value *StartIdx = ConstantInt::get(IdxTy, 0);
// The loop step is equal to the vectorization factor (num of SIMD elements)		// The loop step is equal to the vectorization factor (num of SIMD elements)
// times the unroll factor (num of SIMD instructions).		// times the unroll factor (num of SIMD instructions).
assert(!VF.isScalable() && "scalable vectors not yet supported.");		Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
Constant Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() UF);		Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
Value *CountRoundDown = getOrCreateVectorTripCount(Lp);		Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
Induction =		Induction =
createInductionVariable(Lp, StartIdx, CountRoundDown, Step,		createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
getDebugLocFromInstOrOperands(OldInduction));		getDebugLocFromInstOrOperands(OldInduction));

// Emit phis for the new starting index of the scalar loop.		// Emit phis for the new starting index of the scalar loop.
createInductionResumeValues(Lp, CountRoundDown);		createInductionResumeValues(Lp, CountRoundDown);

▲ Show 20 Lines • Show All 829 Lines • ▼ Show 20 Lines	for (User *U : Cur->users()) {
if ((Cur != LoopExitInstr \|\| OrigLoop->contains(UI->getParent())) &&		if ((Cur != LoopExitInstr \|\| OrigLoop->contains(UI->getParent())) &&
Visited.insert(UI).second)		Visited.insert(UI).second)
Worklist.push_back(UI);		Worklist.push_back(UI);
}		}
}		}
}		}

void InnerLoopVectorizer::fixLCSSAPHIs() {		void InnerLoopVectorizer::fixLCSSAPHIs() {
assert(!VF.isScalable() && "the code below assumes fixed width vectors");
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {		for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
if (LCSSAPhi.getNumIncomingValues() == 1) {		if (LCSSAPhi.getNumIncomingValues() == 1) {
auto *IncomingValue = LCSSAPhi.getIncomingValue(0);		auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
// Non-instruction incoming values will have only one value.		// Non-instruction incoming values will have only one value.
unsigned LastLane = 0;		unsigned LastLane = 0;
if (isa<Instruction>(IncomingValue))		if (isa<Instruction>(IncomingValue))
LastLane = Cost->isUniformAfterVectorization(		LastLane = Cost->isUniformAfterVectorization(
cast<Instruction>(IncomingValue), VF)		cast<Instruction>(IncomingValue), VF)
? 0		? 0
: VF.getKnownMinValue() - 1;		: VF.getKnownMinValue() - 1;
		assert((!VF.isScalable() \|\| LastLane == 0) &&
		"scalable vectors dont support non-uniform scalars yet");
// Can be a loop invariant incoming value or the last scalar value to be		// Can be a loop invariant incoming value or the last scalar value to be
// extracted from the vectorized loop.		// extracted from the vectorized loop.
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());		Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
Value *lastIncomingValue =		Value *lastIncomingValue =
getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });		getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);		LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
}		}
}		}
▲ Show 20 Lines • Show All 1,136 Lines • ▼ Show 20 Lines	reportVectorizationFailure(
"vectorize(enable)' when compiling with -Os/-Oz",		"vectorize(enable)' when compiling with -Os/-Oz",
"NoTailLoopWithOptForSize", ORE, TheLoop);		"NoTailLoopWithOptForSize", ORE, TheLoop);
return None;		return None;
}		}

ElementCount		ElementCount
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,		LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
ElementCount UserVF) {		ElementCount UserVF) {
assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);		MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;		unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();		std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
unsigned WidestRegister = TTI.getRegisterBitWidth(true);		unsigned WidestRegister = TTI.getRegisterBitWidth(true);

// Get the maximum safe dependence distance in bits computed by LAA.		// Get the maximum safe dependence distance in bits computed by LAA.
// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from		// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
// the memory accesses that is most restrictive (involved in the smallest		// the memory accesses that is most restrictive (involved in the smallest
// dependence distance).		// dependence distance).
unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();		unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();

if (UserVF.isNonZero()) {		if (UserVF.isNonZero()) {
		// For now, don't verify legality of scalable vectors.
		// This will be addressed properly in https://reviews.llvm.org/D91718.
		if (UserVF.isScalable())
		return UserVF;

// If legally unsafe, clamp the user vectorization factor to a safe value.		// If legally unsafe, clamp the user vectorization factor to a safe value.
unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);		unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
if (UserVF.getFixedValue() <= MaxSafeVF)		if (UserVF.getFixedValue() <= MaxSafeVF)
return UserVF;		return UserVF;

LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF		LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
<< " is unsafe, clamping to max safe VF=" << MaxSafeVF		<< " is unsafe, clamping to max safe VF=" << MaxSafeVF
<< ".\n");		<< ".\n");
▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines	if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
}		}
}		}
}		}
return ElementCount::getFixed(MaxVF);		return ElementCount::getFixed(MaxVF);
}		}

VectorizationFactor		VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {		LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
		// FIXME: This can be fixed for scalable vectors later, because at this stage
		dmgreen (unsubmitted, done): Perhaps add FIXME:
		// the LoopVectorizer will only consider vectorizing a loop with scalable
		// vectors when the loop has a hint to enable vectorization for a given VF.
assert(!MaxVF.isScalable() && "scalable vectors not yet supported");		assert(!MaxVF.isScalable() && "scalable vectors not yet supported");

float Cost = expectedCost(ElementCount::getFixed(1)).first;		float Cost = expectedCost(ElementCount::getFixed(1)).first;
const float ScalarCost = Cost;		const float ScalarCost = Cost;
unsigned Width = 1;		unsigned Width = 1;
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");		LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;		bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
▲ Show 20 Lines • Show All 293 Lines • ▼ Show 20 Lines	if (EnableIndVarRegisterHeur) {
PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /		PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
std::max(1U, (MaxLocalUsers - 1)));		std::max(1U, (MaxLocalUsers - 1)));
}		}

IC = std::min(IC, TmpIC);		IC = std::min(IC, TmpIC);
}		}

// Clamp the interleave ranges to reasonable counts.		// Clamp the interleave ranges to reasonable counts.
assert(!VF.isScalable() && "scalable vectors not yet supported.");
unsigned MaxInterleaveCount =		unsigned MaxInterleaveCount =
TTI.getMaxInterleaveFactor(VF.getKnownMinValue());		TTI.getMaxInterleaveFactor(VF.getKnownMinValue());

// Check if the user has overridden the max.		// Check if the user has overridden the max.
if (VF.isScalar()) {		if (VF.isScalar()) {
if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)		if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;		MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
} else {		} else {
if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)		if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;		MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}		}

// If trip count is known or estimated compile time constant, limit the		// If trip count is known or estimated compile time constant, limit the
// interleave count to be less than the trip count divided by VF, provided it		// interleave count to be less than the trip count divided by VF, provided it
// is at least 1.		// is at least 1.
		//
		dmgreen (unsubmitted, not done): Is it worth just not automatically interleaving for the moment? It might simplify things until the cost model is doing better, at least.
		sdesmalen (author, unsubmitted, done): From a functional perspective, interleaving should be no different for scalable VFs than it is for fixed-width VFs. That means that if we keep interleaving enabled for scalable vectors, we'll be testing more functionality and code-paths in the vectorizer. We're still a way off from cost-modelling this properly, so until then I'd like to suggest we keep this enabled by default, mostly for testing purposes. Does that make sense?
		dmgreen (unsubmitted, done): I was hoping that if we remove the need for this, we could remove the need for the "if scalable return 1" in the cost model. That way we get the testing of the cost model, which seems like a larger issue to test to me. Can we remove that cost model change already? Or are there too many paths that do not work yet? The simple loop you have in the commit message might work already.
		sdesmalen (author, unsubmitted, done): It seems I can actually leave this in and still remove the "if scalable return 1" (at least for the unit test in this patch).
		// For scalable vectors we can't know if interleaving is beneficial. It may
		// not be beneficial for small loops if none of the lanes in the second vector
		// iterations is enabled. However, for larger loops, there is likely to be a
		// similar benefit as for fixed-width vectors. For now, we choose to leave
		// the InterleaveCount as if vscale is '1', although if some information about
		// the vector is known (e.g. min vector size), we can make a better decision.
if (BestKnownTC) {		if (BestKnownTC) {
MaxInterleaveCount =		MaxInterleaveCount =
std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);		std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
// Make sure MaxInterleaveCount is greater than 0.		// Make sure MaxInterleaveCount is greater than 0.
MaxInterleaveCount = std::max(1u, MaxInterleaveCount);		MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
}		}

assert(MaxInterleaveCount > 0 &&		assert(MaxInterleaveCount > 0 &&
Show All 27 Lines	unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// runtime check and so interleaving won't require further checks.		// runtime check and so interleaving won't require further checks.
bool InterleavingRequiresRuntimePointerCheck =		bool InterleavingRequiresRuntimePointerCheck =
(VF.isScalar() && Legal->getRuntimePointerChecking()->Need);		(VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

// We want to interleave small loops in order to reduce the loop overhead and		// We want to interleave small loops in order to reduce the loop overhead and
// potentially expose ILP opportunities.		// potentially expose ILP opportunities.
LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'		LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
<< "LV: IC is " << IC << '\n'		<< "LV: IC is " << IC << '\n'
<< "LV: VF is " << VF.getKnownMinValue() << '\n');		<< "LV: VF is " << VF << '\n');
		dmgreenUnsubmitted Done Reply Inline Actions Is there a better way to print this? dmgreen: Is there a better way to print this?
		sdesmalenAuthorUnsubmitted Done Reply Inline Actions Good point. I believe this can just print VF since ElementCount defines `operator<<` that adds `"vscale x "`. sdesmalen: Good point. I believe this can just print VF since ElementCount defines `operator<<` that adds…
const bool AggressivelyInterleaveReductions =		const bool AggressivelyInterleaveReductions =
TTI.enableAggressiveInterleaving(HasReductions);		TTI.enableAggressiveInterleaving(HasReductions);
if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {		if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
// We assume that the cost overhead is 1 and we use the cost model		// We assume that the cost overhead is 1 and we use the cost model
// to estimate the cost of the loop and interleave until the cost of the		// to estimate the cost of the loop and interleave until the cost of the
// loop overhead is about 5% of the cost of the loop.		// loop overhead is about 5% of the cost of the loop.
unsigned SmallIC =		unsigned SmallIC =
std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));		std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
▲ Show 20 Lines • Show All 650 Lines • ▼ Show 20 Lines	return TTI.getAddressComputationCost(ValTy) +
TTI::TCK_RecipThroughput, I);		TTI::TCK_RecipThroughput, I);
}		}
return getWideningCost(I, VF);		return getWideningCost(I, VF);
}		}

LoopVectorizationCostModel::VectorizationCostTy		LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I,		LoopVectorizationCostModel::getInstructionCost(Instruction *I,
ElementCount VF) {		ElementCount VF) {
assert(!VF.isScalable() &&
"the cost model is not yet implemented for scalable vectorization");
// If we know that this instruction will remain uniform, check the cost of		// If we know that this instruction will remain uniform, check the cost of
// the scalar version.		// the scalar version.
if (isUniformAfterVectorization(I, VF))		if (isUniformAfterVectorization(I, VF))
VF = ElementCount::getFixed(1);		VF = ElementCount::getFixed(1);

if (VF.isVector() && isProfitableToScalarize(I, VF))		if (VF.isVector() && isProfitableToScalarize(I, VF))
return VectorizationCostTy(InstsToScalarize[VF][I], false);		return VectorizationCostTy(InstsToScalarize[VF][I], false);

▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,

// Skip operands that do not require extraction/scalarization and do not incur		// Skip operands that do not require extraction/scalarization and do not incur
// any overhead.		// any overhead.
return Cost + TTI.getOperandsScalarizationOverhead(		return Cost + TTI.getOperandsScalarizationOverhead(
filterExtractingOperands(Ops, VF), VF.getKnownMinValue());		filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
}		}

void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {		void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
if (VF.isScalar())		if (VF.isScalar())
return;		return;
NumPredStores = 0;		NumPredStores = 0;
for (BasicBlock *BB : TheLoop->blocks()) {		for (BasicBlock *BB : TheLoop->blocks()) {
// For each instruction in the old loop.		// For each instruction in the old loop.
for (Instruction &I : *BB) {		for (Instruction &I : *BB) {
Value *Ptr = getLoadStorePointerOperand(&I);		Value *Ptr = getLoadStorePointerOperand(&I);
if (!Ptr)		if (!Ptr)
▲ Show 20 Lines • Show All 570 Lines • ▼ Show 20 Lines	LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
LLVM_DEBUG(		LLVM_DEBUG(
dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "		dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
"VPlan-native path.\n");		"VPlan-native path.\n");
return VectorizationFactor::Disabled();		return VectorizationFactor::Disabled();
}		}

Optional<VectorizationFactor>		Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {		LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
assert(OrigLoop->isInnermost() && "Inner loop expected.");		assert(OrigLoop->isInnermost() && "Inner loop expected.");
Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);		Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.		if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
return None;		return None;

// Invalidate interleave groups if all blocks of loop will be predicated.		// Invalidate interleave groups if all blocks of loop will be predicated.
if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&		if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
!useMaskedInterleavedAccesses(*TTI)) {		!useMaskedInterleavedAccesses(*TTI)) {
LLVM_DEBUG(		LLVM_DEBUG(
dbgs()		dbgs()
<< "LV: Invalidate all interleaved groups due to fold-tail by masking "		<< "LV: Invalidate all interleaved groups due to fold-tail by masking "
"which requires masked-interleaved support.\n");		"which requires masked-interleaved support.\n");
if (CM.InterleaveInfo.invalidateGroups())		if (CM.InterleaveInfo.invalidateGroups())
// Invalidating interleave groups also requires invalidating all decisions		// Invalidating interleave groups also requires invalidating all decisions
// based on them, which includes widening decisions and uniform and scalar		// based on them, which includes widening decisions and uniform and scalar
// values.		// values.
CM.invalidateCostModelingDecisions();		CM.invalidateCostModelingDecisions();
}		}

ElementCount MaxVF = MaybeMaxVF.getValue();		ElementCount MaxVF = MaybeMaxVF.getValue();
assert(MaxVF.isNonZero() && "MaxVF is zero.");		assert(MaxVF.isNonZero() && "MaxVF is zero.");

if (!UserVF.isZero() && UserVF.getFixedValue() <= MaxVF.getFixedValue()) {		if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) {
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");		LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
assert(isPowerOf2_32(UserVF.getFixedValue()) &&		assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
"VF needs to be a power of two");		"VF needs to be a power of two");
// Collect the instructions (and their associated costs) that will be more		// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.		// profitable to scalarize.
CM.selectUserVectorizationFactor(UserVF);		CM.selectUserVectorizationFactor(UserVF);
CM.collectInLoopReductions();		CM.collectInLoopReductions();
buildVPlansWithVPRecipes(UserVF, UserVF);		buildVPlansWithVPRecipes(UserVF, UserVF);
LLVM_DEBUG(printPlans(dbgs()));		LLVM_DEBUG(printPlans(dbgs()));
return {{UserVF, 0}};		return {{UserVF, 0}};
}		}

		assert(!MaxVF.isScalable() &&
		"Scalable vectors not yet supported beyond this point");

for (ElementCount VF = ElementCount::getFixed(1);		for (ElementCount VF = ElementCount::getFixed(1);
ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {		ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
// Collect Uniform and Scalar instructions after vectorization with VF.		// Collect Uniform and Scalar instructions after vectorization with VF.
CM.collectUniformsAndScalars(VF);		CM.collectUniformsAndScalars(VF);

// Collect the instructions (and their associated costs) that will be more		// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.		// profitable to scalarize.
if (VF.isVector())		if (VF.isVector())
▲ Show 20 Lines • Show All 1,327 Lines • ▼ Show 20 Lines	if (Kind == RecurrenceDescriptor::RK_IntegerMinMax \|\|
PrevInChain);		PrevInChain);
}		}
State.set(this, getUnderlyingInstr(), NextInChain, Part);		State.set(this, getUnderlyingInstr(), NextInChain, Part);
}		}
}		}

void VPReplicateRecipe::execute(VPTransformState &State) {		void VPReplicateRecipe::execute(VPTransformState &State) {
if (State.Instance) { // Generate a single instance.		if (State.Instance) { // Generate a single instance.
		assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
		cameron.mcinallyUnsubmitted Done Reply Inline Actions Nit: Yank spelling of `scalarize`? cameron.mcinally: Nit: Yank spelling of `scalarize`?
		sdesmalenAuthorUnsubmitted Done Reply Inline Actions You're right, I'm mixing up American and British spelling in this patch, thanks for spotting! sdesmalen: You're right, I'm mixing up American and British spelling in this patch, thanks for spotting!
State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,		State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
*State.Instance, IsPredicated, State);		*State.Instance, IsPredicated, State);
// Insert scalar instance packing it into a vector.		// Insert scalar instance packing it into a vector.
if (AlsoPack && State.VF.isVector()) {		if (AlsoPack && State.VF.isVector()) {
// If we're constructing lane 0, initialize to start from undef.		// If we're constructing lane 0, initialize to start from undef.
if (State.Instance->Lane == 0) {		if (State.Instance->Lane == 0) {
assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");		assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
Value *Undef = UndefValue::get(		Value *Undef = UndefValue::get(
VectorType::get(getUnderlyingValue()->getType(), State.VF));		VectorType::get(getUnderlyingValue()->getType(), State.VF));
State.ValueMap.setVectorValue(getUnderlyingInstr(),		State.ValueMap.setVectorValue(getUnderlyingInstr(),
State.Instance->Part, Undef);		State.Instance->Part, Undef);
}		}
State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),		State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
*State.Instance);		*State.Instance);
}		}
return;		return;
}		}

// Generate scalar instances for all VF lanes of all UF parts, unless the		// Generate scalar instances for all VF lanes of all UF parts, unless the
// instruction is uniform in which case generate only the first lane for each		// instruction is uniform in which case generate only the first lane for each
// of the UF parts.		// of the UF parts.
unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();		unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
		assert((!State.VF.isScalable() \|\| IsUniform) &&
		"Can't scalarize a scalable vector");
		cameron.mcinallyUnsubmitted Done Reply Inline Actions Same here. cameron.mcinally: Same here.
for (unsigned Part = 0; Part < State.UF; ++Part)		for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)		for (unsigned Lane = 0; Lane < EndLane; ++Lane)
State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},		State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
IsPredicated, State);		IsPredicated, State);
}		}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {		void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
assert(State.Instance && "Branch on Mask works only on single instance.");		assert(State.Instance && "Branch on Mask works only on single instance.");
▲ Show 20 Lines • Show All 137 Lines • ▼ Show 20 Lines	LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
&Hints, IAI);		&Hints, IAI);
// Use the planner for outer loop vectorization.		// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an		// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.		// optional argument if we don't need it in the future.
LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);		LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);

// Get user vectorization factor.		// Get user vectorization factor.
ElementCount UserVF = Hints.getWidth();		ElementCount UserVF = Hints.getWidth();
if (UserVF.isScalable()) {
// TODO: Use scalable UserVF once we've added initial support for scalable
// vectorization. For now we convert it to fixed width, but this will be
// removed in a later patch.
UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
}

// Plan how to best vectorize, return the best VF and its cost.		// Plan how to best vectorize, return the best VF and its cost.
const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);		const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

// If we are stress testing VPlan builds, do not attempt to generate vector		// If we are stress testing VPlan builds, do not attempt to generate vector
// code. Masked vector code generation support will follow soon.		// code. Masked vector code generation support will follow soon.
// Also, do not attempt to vectorize if no vector code will be produced.		// Also, do not attempt to vectorize if no vector code will be produced.
if (VPlanBuildStressTest \|\| EnableVPlanPredication \|\|		if (VPlanBuildStressTest \|\| EnableVPlanPredication \|\|
▲ Show 20 Lines • Show All 149 Lines • ▼ Show 20 Lines	LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
F, &Hints, IAI);		F, &Hints, IAI);
CM.collectValuesToIgnore();		CM.collectValuesToIgnore();

// Use the planner for vectorization.		// Use the planner for vectorization.
LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);		LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

// Get user vectorization factor and interleave count.		// Get user vectorization factor and interleave count.
ElementCount UserVF = Hints.getWidth();		ElementCount UserVF = Hints.getWidth();
if (UserVF.isScalable()) {
// TODO: Use scalable UserVF once we've added initial support for scalable
// vectorization. For now we convert it to fixed width, but this will be
// removed in a later patch.
UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
}

unsigned UserIC = Hints.getInterleave();		unsigned UserIC = Hints.getInterleave();

// Plan how to best vectorize, return the best VF and its cost.		// Plan how to best vectorize, return the best VF and its cost.
Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);		Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

VectorizationFactor VF = VectorizationFactor::Disabled();		VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;		unsigned IC = 1;

▲ Show 20 Lines • Show All 302 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/VPlan.h

Show First 20 Lines • Show All 157 Lines • ▼ Show 20 Lines	bool hasAnyScalarValue(Value *Key) const {
return ScalarMapStorage.count(Key);		return ScalarMapStorage.count(Key);
}		}

/// \return True if the map has a scalar entry for \p Key and \p Instance.		/// \return True if the map has a scalar entry for \p Key and \p Instance.
bool hasScalarValue(Value *Key, const VPIteration &Instance) const {		bool hasScalarValue(Value *Key, const VPIteration &Instance) const {
assert(Instance.Part < UF && "Queried Scalar Part is too large.");		assert(Instance.Part < UF && "Queried Scalar Part is too large.");
assert(Instance.Lane < VF.getKnownMinValue() &&		assert(Instance.Lane < VF.getKnownMinValue() &&
"Queried Scalar Lane is too large.");		"Queried Scalar Lane is too large.");
assert(!VF.isScalable() && "VF is assumed to be non scalable.");

if (!hasAnyScalarValue(Key))		if (!hasAnyScalarValue(Key))
return false;		return false;
const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;		const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");		assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
assert(Entry[Instance.Part].size() == VF.getKnownMinValue() &&		assert(Entry[Instance.Part].size() == VF.getKnownMinValue() &&
"ScalarParts has wrong dimensions.");		"ScalarParts has wrong dimensions.");
return Entry[Instance.Part][Instance.Lane] != nullptr;		return Entry[Instance.Part][Instance.Lane] != nullptr;
▲ Show 20 Lines • Show All 1,947 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/metadata-width.ll

	; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S \| FileCheck %s			; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S \| FileCheck %s

	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"			target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

	; CHECK-LABEL: @test1(			; CHECK-LABEL: @test1(
	; CHECK: store <8 x i32>			; CHECK: store <8 x i32>
	; CHECK: ret void			; CHECK: ret void
	define void @test1(i32* nocapture %a, i32 %n) #0 {			define void @test1(i32* nocapture %a, i32 %n) #0 {
	entry:			entry:
	%cmp4 = icmp sgt i32 %n, 0			%cmp4 = icmp sgt i32 %n, 0
	br i1 %cmp4, label %for.body, label %for.end			br i1 %cmp4, label %for.body, label %for.end

	for.body: ; preds = %entry, %for.body			for.body: ; preds = %entry, %for.body
	%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]			%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
	%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv			%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
	%0 = trunc i64 %indvars.iv to i32			store i32 42, i32* %arrayidx, align 4
	store i32 %0, i32* %arrayidx, align 4
	%indvars.iv.next = add i64 %indvars.iv, 1			%indvars.iv.next = add i64 %indvars.iv, 1
	%lftr.wideiv = trunc i64 %indvars.iv.next to i32			%lftr.wideiv = trunc i64 %indvars.iv.next to i32
	%exitcond = icmp eq i32 %lftr.wideiv, %n			%exitcond = icmp eq i32 %lftr.wideiv, %n
	br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0			br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0

	for.end: ; preds = %for.body, %entry			for.end: ; preds = %for.body, %entry
	ret void			ret void
	}			}

	; CHECK-LABEL: @test2(			; CHECK-LABEL: @test2(
	; CHECK: store <8 x i32>			; CHECK: store <vscale x 8 x i32>
	; CHECK: ret void			; CHECK: ret void
	define void @test2(i32* nocapture %a, i32 %n) #0 {			define void @test2(i32* nocapture %a, i32 %n) #0 {
	entry:			entry:
	%cmp4 = icmp sgt i32 %n, 0			%cmp4 = icmp sgt i32 %n, 0
	br i1 %cmp4, label %for.body, label %for.end			br i1 %cmp4, label %for.body, label %for.end

	for.body: ; preds = %entry, %for.body			for.body: ; preds = %entry, %for.body
	%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]			%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
	%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv			%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
	%0 = trunc i64 %indvars.iv to i32			store i32 42, i32* %arrayidx, align 4
	store i32 %0, i32* %arrayidx, align 4
	%indvars.iv.next = add i64 %indvars.iv, 1			%indvars.iv.next = add i64 %indvars.iv, 1
	%lftr.wideiv = trunc i64 %indvars.iv.next to i32			%lftr.wideiv = trunc i64 %indvars.iv.next to i32
	%exitcond = icmp eq i32 %lftr.wideiv, %n			%exitcond = icmp eq i32 %lftr.wideiv, %n
	br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2			br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2

	for.end: ; preds = %for.body, %entry			for.end: ; preds = %for.body, %entry
	ret void			ret void
	}			}

	; CHECK-LABEL: @test3(			; CHECK-LABEL: @test3(
	; CHECK: store <8 x i32>			; CHECK: store <8 x i32>
	; CHECK: ret void			; CHECK: ret void
	define void @test3(i32* nocapture %a, i32 %n) #0 {			define void @test3(i32* nocapture %a, i32 %n) #0 {
	entry:			entry:
	%cmp4 = icmp sgt i32 %n, 0			%cmp4 = icmp sgt i32 %n, 0
	br i1 %cmp4, label %for.body, label %for.end			br i1 %cmp4, label %for.body, label %for.end

	for.body: ; preds = %entry, %for.body			for.body: ; preds = %entry, %for.body
	%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]			%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
	%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv			%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
	%0 = trunc i64 %indvars.iv to i32			store i32 42, i32* %arrayidx, align 4
	store i32 %0, i32* %arrayidx, align 4
	%indvars.iv.next = add i64 %indvars.iv, 1			%indvars.iv.next = add i64 %indvars.iv, 1
	%lftr.wideiv = trunc i64 %indvars.iv.next to i32			%lftr.wideiv = trunc i64 %indvars.iv.next to i32
	%exitcond = icmp eq i32 %lftr.wideiv, %n			%exitcond = icmp eq i32 %lftr.wideiv, %n
	br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4			br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4

	for.end: ; preds = %for.body, %entry			for.end: ; preds = %for.body, %entry
	ret void			ret void
	}			}
	Show All 9 Lines

llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll

This file was added.

				; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 < %s | FileCheck %s --check-prefix=CHECKUF1
				; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 < %s | FileCheck %s --check-prefix=CHECKUF2

				; CHECKUF1: for.body.preheader:
				; CHECKUF1-DAG: %wide.trip.count = zext i32 %N to i64
				; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
				; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
				; CHECKUF1-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count

				; CHECKUF1: vector.ph:
				; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
				; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
				; CHECKUF1-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
				; CHECKUF1: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf

				; CHECKUF1: vector.body:
				; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
				; CHECKUF1: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
				; CHECKUF1: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
				; CHECKUF1: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
				; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
				; CHECKUF1: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
				; CHECKUF1: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
				; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
				; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
				; CHECKUF1: %index.next = add i64 %index, %[[VSCALEX4]]
				; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
				; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5


				; For an interleave factor of 2, vscale is scaled by 8 instead of 4 (and thus shifted left by 3 instead of 2).
				; There is also the increment for the next iteration, e.g. instead of indexing IDXB, it indexes at IDXB + vscale * 4.

				; CHECKUF2: for.body.preheader:
				; CHECKUF2-DAG: %wide.trip.count = zext i32 %N to i64
				; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
				; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
				; CHECKUF2-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX8]], %wide.trip.count

				; CHECKUF2: vector.ph:
				; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
				; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
				; CHECKUF2-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]]
				; CHECKUF2: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf

				; CHECKUF2: vector.body:
				; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
				; CHECKUF2: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
				; CHECKUF2: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
				; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
				; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
				; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
				; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXB]], i64 %[[VSCALE2_EXT]]
				; CHECKUF2: %[[IDXB_NEXT_CAST:.*]] = bitcast double* %[[IDXB_NEXT]] to <vscale x 4 x double>*
				; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_NEXT_CAST]], align 8, !alias.scope !0
				; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
				; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
				; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
				; CHECKUF2: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
				; CHECKUF2: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
				; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
				; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
				; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
				; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXA]], i64 %[[VSCALE2_EXT]]
				; CHECKUF2: %[[IDXA_NEXT_CAST:.*]] = bitcast double* %[[IDXA_NEXT]] to <vscale x 4 x double>*
				; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], <vscale x 4 x double>* %[[IDXA_NEXT_CAST]], align 8, !alias.scope !3, !noalias !0
				; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
				; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
				; CHECKUF2: %index.next = add i64 %index, %[[VSCALEX8]]
				; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
				; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5

				; Scalar input loop for this test: for each index i in [0, N) it loads b[i],
				; adds 1.0 and stores the result to a[i]. The back-edge branch carries
				; !llvm.loop !1, which requests a vectorization width of 4 with scalable
				; vectorization enabled.
				define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
				entry:
				; Skip the loop entirely unless N > 0 (signed compare).
				%cmp7 = icmp sgt i32 %N, 0
				br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

				for.body.preheader: ; preds = %entry
				; Trip count widened to i64 so it can be compared against the i64 index.
				%wide.trip.count = zext i32 %N to i64
				br label %for.body

				for.cond.cleanup: ; preds = %for.body, %entry
				ret void

				for.body: ; preds = %for.body.preheader, %for.body
				%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
				; a[i] = b[i] + 1.0
				%arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
				%0 = load double, double* %arrayidx, align 8
				%add = fadd double %0, 1.000000e+00
				%arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
				store double %add, double* %arrayidx2, align 8
				; Advance the index and leave the loop once it equals the trip count.
				%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
				%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
				br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
				}

				!1 = distinct !{!1, !2, !3}
				!2 = !{!"llvm.loop.vectorize.width", i32 4}
				!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}

This is an archive of the discontinued LLVM Phabricator instance.

[LoopVectorizer][SVE] Vectorize a simple loop with a scalable VF.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 310485

llvm/include/llvm/IR/IRBuilder.h

llvm/lib/IR/IRBuilder.cpp

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/lib/Transforms/Vectorize/VPlan.h

llvm/test/Transforms/LoopVectorize/metadata-width.ll

llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll

This is an archive of the discontinued LLVM Phabricator instance.

[LoopVectorizer][SVE] Vectorize a simple loop with a scalable VF. ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 310485

llvm/include/llvm/IR/IRBuilder.h

llvm/lib/IR/IRBuilder.cpp

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/lib/Transforms/Vectorize/VPlan.h

llvm/test/Transforms/LoopVectorize/metadata-width.ll

llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll

[LoopVectorizer][SVE] Vectorize a simple loop with a scalable VF.
ClosedPublic