Diff 222118

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 109 Lines • ▼ Show 20 Lines
#include "llvm/IR/Function.h"		#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"		#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"		#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"		#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"		#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"		#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"		#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"		#include "llvm/IR/LLVMContext.h"
		#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"		#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"		#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"		#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"		#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"		#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"		#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"		#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"		#include "llvm/IR/ValueHandle.h"
▲ Show 20 Lines • Show All 69 Lines • ▼ Show 20 Lines	static cl::opt<bool> EnableInterleavedMemAccesses(
cl::desc("Enable vectorization on interleaved memory accesses in a loop"));		cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs		/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.		/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(		static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
"enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,		"enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));		cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// We don't interleave loops with a known constant trip count below this		static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
/// number.		"tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
static const unsigned TinyTripCountInterleaveThreshold = 128;		cl::desc("We don't interleave loops with a estimated constant trip count "
		"below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(		static cl::opt<unsigned> ForceTargetNumScalarRegs(
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,		"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of scalar registers."));		cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(		static cl::opt<unsigned> ForceTargetNumVectorRegs(
"force-target-num-vector-regs", cl::init(0), cl::Hidden,		"force-target-num-vector-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of vector registers."));		cl::desc("A flag that overrides the target's number of vector registers."));
▲ Show 20 Lines • Show All 283 Lines • ▼ Show 20 Lines	public:

/// Set the debug location in the builder using the debug location in		/// Set the debug location in the builder using the debug location in
/// the instruction.		/// the instruction.
void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);		void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

/// Fix the non-induction PHIs in the OrigPHIsToFix vector.		/// Fix the non-induction PHIs in the OrigPHIsToFix vector.
void fixNonInductionPHIs(void);		void fixNonInductionPHIs(void);

		/// Update profile info of the vector and prolog/epilog loops.
		void fixProfileInfo();

protected:		protected:
friend class LoopVectorizationPlanner;		friend class LoopVectorizationPlanner;

/// A small list of PHINodes.		/// A small list of PHINodes.
using PhiVector = SmallVector<PHINode *, 4>;		using PhiVector = SmallVector<PHINode *, 4>;

/// A type for scalarized values in the new loop. Each value from the		/// A type for scalarized values in the new loop. Each value from the
/// original loop, when scalarized, is represented by UF x VF scalar values		/// original loop, when scalarized, is represented by UF x VF scalar values
▲ Show 20 Lines • Show All 421 Lines • ▼ Show 20 Lines
enum ScalarEpilogueLowering {		enum ScalarEpilogueLowering {

// The default: allowing scalar epilogues.		// The default: allowing scalar epilogues.
CM_ScalarEpilogueAllowed,		CM_ScalarEpilogueAllowed,

// Vectorization with OptForSize: don't allow epilogues.		// Vectorization with OptForSize: don't allow epilogues.
CM_ScalarEpilogueNotAllowedOptSize,		CM_ScalarEpilogueNotAllowedOptSize,

// A special case of vectorization with OptForSize: loops with a very small		// A special case of vectorization with OptForSize: loops with a very small
		ebrevnovAuthorUnsubmitted Done Reply Inline Actions just to be consistent with the next line :-) ebrevnov: just to be consistent with the next line :-)
// trip count are considered for vectorization under OptForSize, thereby		// trip count are considered for vectorization under OptForSize, thereby
// making sure the cost of their loop body is dominant, free of runtime		// making sure the cost of their loop body is dominant, free of runtime
// guards and scalar iteration overheads.		// guards and scalar iteration overheads.
CM_ScalarEpilogueNotAllowedLowTripLoop,		CM_ScalarEpilogueNotAllowedLowTripLoop,

// Loop hint predicate indicating an epilogue is undesired.		// Loop hint predicate indicating an epilogue is undesired.
CM_ScalarEpilogueNotNeededUsePredicate		CM_ScalarEpilogueNotNeededUsePredicate
};		};
▲ Show 20 Lines • Show All 2,501 Lines • ▼ Show 20 Lines	fixupIVUsers(Entry.first, Entry.second,
IVEndValues[Entry.first], LoopMiddleBlock);		IVEndValues[Entry.first], LoopMiddleBlock);

fixLCSSAPHIs();		fixLCSSAPHIs();
for (Instruction *PI : PredicatedInstructions)		for (Instruction *PI : PredicatedInstructions)
sinkScalarOperands(&*PI);		sinkScalarOperands(&*PI);

// Remove redundant induction instructions.		// Remove redundant induction instructions.
cse(LoopVectorBody);		cse(LoopVectorBody);

		fixProfileInfo();
}		}

void InnerLoopVectorizer::fixCrossIterationPHIs() {		void InnerLoopVectorizer::fixCrossIterationPHIs() {
// In order to support recurrences we need to be able to vectorize Phi nodes.		// In order to support recurrences we need to be able to vectorize Phi nodes.
// Phi nodes have cycles, so we need to vectorize them in two stages. This is		// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #2: We now need to fix the recurrences by adding incoming edges to		// stage #2: We now need to fix the recurrences by adding incoming edges to
// the currently empty PHI nodes. At this point every instruction in the		// the currently empty PHI nodes. At this point every instruction in the
// original loop is widened to a vector form so we can use them to construct		// original loop is widened to a vector form so we can use them to construct
▲ Show 20 Lines • Show All 493 Lines • ▼ Show 20 Lines	for (unsigned i = 0; i < NumIncomingValues; ++i) {

// Scalar incoming value may need a broadcast		// Scalar incoming value may need a broadcast
Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);		Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
NewPhi->addIncoming(NewIncV, NewPredBB);		NewPhi->addIncoming(NewIncV, NewPredBB);
}		}
}		}
}		}

		// Update profile info since the expected TC of the vectorized loop is
		// smaller than the original TC by a factor of VF x UF. At the same time the
		// original loop becomes the prolog/epilog loop and may not have more than
		// VF x UF iterations.
		//
		// The branch weights of the original latch are normalized so that
		// OrigTakenCount is the back-edge weight and OrigFallThroughCount is the
		// exit weight. The estimated iteration count (back-edge weight divided by
		// exit weight) is then split between the vector loop and the remainder
		// loop, and new branch weights are attached to both latch terminators.
		// Nothing is changed when the original latch has no usable profile
		// metadata or its exit weight is zero.
		void InnerLoopVectorizer::fixProfileInfo() {
		  uint64_t OrigTakenCount = 0;
		  uint64_t OrigFallThroughCount = 0;
		  Loop *OrigLoop = LI->getLoopFor(LoopScalarBody);
		  auto *OrigBackBranchI = OrigLoop->getLoopLatch()->getTerminator();
		  if (!OrigBackBranchI->extractProfMetadata(OrigTakenCount,
		                                            OrigFallThroughCount))
		    return;

		  // The back edge is taken on "true" iff the first successor of the latch
		  // stays inside the loop.
		  const bool IsTrueBackEdgeOrigLoop =
		      OrigLoop->contains(*succ_begin(OrigLoop->getLoopLatch()));
		  if (!IsTrueBackEdgeOrigLoop)
		    std::swap(OrigTakenCount, OrigFallThroughCount);

		  // OrigFallThroughCount is used as a divisor below, so it has to be
		  // validated after the swap above. Checking it only before the swap lets
		  // a zero divisor through when the back edge is taken on "false".
		  if (OrigFallThroughCount == 0)
		    return;

		  Loop *VecLoop = LI->getLoopFor(LoopVectorBody);
		  MDBuilder MDB(OrigBackBranchI->getContext());
		  auto *VectorBackBranchI = VecLoop->getLoopLatch()->getTerminator();
		  const bool IsTrueBackEdgeVecLoop =
		      VecLoop->contains(*succ_begin(VecLoop->getLoopLatch()));

		  // Uses of OrigIterCount below should not be simplified as it will
		  // produce a different value. In other words: (A mod N) * B != (A*B) mod N
		  const uint64_t OrigIterCount = OrigTakenCount / OrigFallThroughCount;
		  // Calculate taken and fall through counts for vector loop.
		  uint64_t VecTakenCount = (OrigIterCount / (VF * UF)) * OrigFallThroughCount;
		  uint64_t VecFallThrough = OrigFallThroughCount;
		  // Now calculate counters for prolog/epilog loop.
		  OrigTakenCount = (OrigIterCount % (VF * UF)) * OrigFallThroughCount;

		  // Adjust number of iterations in vector and epilog loops if epilog
		  // iterations executed as part of the main loop. All counts are scaled by
		  // the exit weight, so one extra vector iteration contributes
		  // OrigFallThroughCount back edges, not a single one.
		  if (OrigTakenCount != 0 && Cost->foldTailByMasking()) {
		    VecTakenCount += OrigFallThroughCount;
		    OrigTakenCount = 0;
		  }
		  // Bottom test is never reached if the vector loop has zero iterations.
		  if (VecTakenCount == 0)
		    VecFallThrough = 0;
		  // Bottom test is never reached if the remainder loop has zero iterations.
		  if (OrigTakenCount == 0)
		    OrigFallThroughCount = 0;

		  // Make a swap if back edge is taken when condition "false".
		  if (!IsTrueBackEdgeVecLoop)
		    std::swap(VecTakenCount, VecFallThrough);
		  // Set new profile metadata on the vector loop latch.
		  VectorBackBranchI->setMetadata(
		      LLVMContext::MD_prof,
		      MDB.createBranchWeights(VecTakenCount, VecFallThrough));

		  // Restore the original operand order of the scalar loop latch.
		  if (!IsTrueBackEdgeOrigLoop)
		    std::swap(OrigTakenCount, OrigFallThroughCount);
		  // Set new profile metadata on the scalar (prolog/epilog) loop latch.
		  OrigBackBranchI->setMetadata(
		      LLVMContext::MD_prof,
		      MDB.createBranchWeights(OrigTakenCount, OrigFallThroughCount));
		}

void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,		void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
unsigned VF) {		unsigned VF) {
PHINode *P = cast<PHINode>(PN);		PHINode *P = cast<PHINode>(PN);
if (EnableVPlanNativePath) {		if (EnableVPlanNativePath) {
// Currently we enter here in the VPlan-native path for non-induction		// Currently we enter here in the VPlan-native path for non-induction
// PHIs where all control flow is uniform. We simply widen these PHIs.		// PHIs where all control flow is uniform. We simply widen these PHIs.
// Create a vector phi with no operands - the vector phi operands will be		// Create a vector phi with no operands - the vector phi operands will be
// set at the end of vector code generation.		// set at the end of vector code generation.
▲ Show 20 Lines • Show All 1,161 Lines • ▼ Show 20 Lines	unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
if (!isScalarEpilogueAllowed())		if (!isScalarEpilogueAllowed())
return 1;		return 1;

// We used the distance for the interleave count.		// We used the distance for the interleave count.
if (Legal->getMaxSafeDepDistBytes() != -1U)		if (Legal->getMaxSafeDepDistBytes() != -1U)
return 1;		return 1;

// Do not interleave loops with a relatively small trip count.		// Do not interleave loops with a relatively small trip count.
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);		auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
if (TC > 1 && TC < TinyTripCountInterleaveThreshold)		if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
return 1;		return 1;

unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);		unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters		LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
<< " registers\n");		<< " registers\n");

if (VF == 1) {		if (VF == 1) {
if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)		if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
Show All 36 Lines	if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;		MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
} else {		} else {
if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)		if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;		MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}		}

// If the trip count is constant, limit the interleave count to be less than		// If the trip count is constant, limit the interleave count to be less than
// the trip count divided by VF.		// the trip count divided by VF.
if (TC > 0) {		if (BestKnownTC) {
assert(TC >= VF && "VF exceeds trip count?");		MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
if ((TC / VF) < MaxInterleaveCount)
MaxInterleaveCount = (TC / VF);
}		}

// If we did not calculate the cost for VF (because the user selected the VF)		// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.		// then we calculate the cost of VF here.
if (LoopCost == 0)		if (LoopCost == 0)
LoopCost = expectedCost(VF).first;		LoopCost = expectedCost(VF).first;

assert(LoopCost && "Non-zero loop cost expected");		assert(LoopCost && "Non-zero loop cost expected");
▲ Show 20 Lines • Show All 2,171 Lines • ▼ Show 20 Lines	void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);		State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}		}

static ScalarEpilogueLowering		static ScalarEpilogueLowering
getScalarEpilogueLowering(Function F, Loop L, LoopVectorizeHints &Hints,		getScalarEpilogueLowering(Function F, Loop L, LoopVectorizeHints &Hints,
ProfileSummaryInfo PSI, BlockFrequencyInfo BFI,		ProfileSummaryInfo PSI, BlockFrequencyInfo BFI,
ScalarEvolution &SE) {		ScalarEvolution &SE) {
ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;		ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
auto IsColdByProfile = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI);		auto IsColdByProfile = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI);
		reamesUnsubmitted Not Done Reply Inline Actions Explicit type this please for readability. reames: Explicit type this please for readability.
if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&		if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
(F->hasOptSize() \|\| IsColdByProfile.getValueOr(false)))		(F->hasOptSize() \|\| IsColdByProfile.getValueOr(false)))
SEL = CM_ScalarEpilogueNotAllowedOptSize;		SEL = CM_ScalarEpilogueNotAllowedOptSize;
else if (PreferPredicateOverEpilog \|\| Hints.getPredicate())		else if (PreferPredicateOverEpilog \|\| Hints.getPredicate())
SEL = CM_ScalarEpilogueNotNeededUsePredicate;		SEL = CM_ScalarEpilogueNotNeededUsePredicate;
else {		else {
auto ExpectedTC = getSmallBestKnownTC(SE, L);		auto ExpectedTC = getSmallBestKnownTC(SE, L);
// Check the loop for a trip count threshold: vectorize loops with a tiny		// Check the loop for a trip count threshold: vectorize loops with a tiny
// trip count by optimizing for size, to minimize overheads.		// trip count by optimizing for size, to minimize overheads.
if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {		if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
// Even short trip count loops may be hot (part of hot region).		// Even short trip count loops may be hot (part of hot region).
// In absence of profile summary estimate loop hotness relative to		// In absence of profile summary estimate loop hotness relative to
// function entry using execution frequency information.		// function entry using execution frequency information.
if (!IsColdByProfile && LoopVectorizeWithBlockFrequency && BFI) {		if (!IsColdByProfile && LoopVectorizeWithBlockFrequency && BFI) {
Optional<uint64_t> LoopCount =		Optional<uint64_t> LoopCount =
BFI->getBlockProfileCount(L->getHeader(), true);		BFI->getBlockProfileCount(L->getHeader(), true);
Optional<uint64_t> FunctionCount =		Optional<uint64_t> FunctionCount =
BFI->getBlockProfileCount(&F->getEntryBlock(), true);		BFI->getBlockProfileCount(&F->getEntryBlock(), true);
if (LoopCount && FunctionCount &&		if (LoopCount && FunctionCount &&
(LoopCount > FunctionCount * LocalHotnessThreshold)) {		(LoopCount > FunctionCount * LocalHotnessThreshold)) {
		ebrevnovAuthorUnsubmitted Done Reply Inline Actions Some rationale for the chosen heuristic. In general, if nobody actually asked to optimize for size, it seems reasonable to rely on the cost model to decide whether vectorization is profitable or not, even for short trip count loops. If we still want to have some balance between code bloat and performance, we should decide based on potential gain and loop size for all loops. Even though the described approach looks simple and reasonable in theory, it most likely will have big implications on existing apps. That's why I decided to take a more conservative approach and give a chance for hot loops to be vectorized. ebrevnov: Some rationale for the chosen heuristic. In general, if nobody actually asked to optimize for…
LLVM_DEBUG(dbgs() << "Allow epilog for short trip count loop due to "		LLVM_DEBUG(dbgs() << "Allow epilog for short trip count loop due to "
"hotness considerations.");		"hotness considerations.");
return CM_ScalarEpilogueAllowed;		return CM_ScalarEpilogueAllowed;
}		}
}		}

LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "		LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
<< "This loop is worth vectorizing only if no scalar "		<< "This loop is worth vectorizing only if no scalar "
▲ Show 20 Lines • Show All 455 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll

	; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness 2>&1 \| FileCheck %s			; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness 2>&1 \| FileCheck %s
	; RUN: opt < %s -passes=loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness 2>&1 \| FileCheck %s			; RUN: opt < %s -passes=loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness 2>&1 \| FileCheck %s

	; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations (hotness: 300)			; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations (hotness: 300)
	; CHECK: remark: no_fpmath.c:6:14: loop not vectorized			; CHECK: remark: no_fpmath.c:6:14: loop not vectorized
	; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2) (hotness: 300)			; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 1) (hotness: 300)

	target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "x86_64-apple-macosx10.10.0"			target triple = "x86_64-apple-macosx10.10.0"

	; Function Attrs: nounwind readonly ssp uwtable			; Function Attrs: nounwind readonly ssp uwtable
	define double @cond_sum(i32* nocapture readonly %v, i32 %n) #0 !dbg !4 !prof !29 {			define double @cond_sum(i32* nocapture readonly %v, i32 %n) #0 !dbg !4 !prof !29 {
	entry:			entry:
	%cmp.7 = icmp sgt i32 %n, 0, !dbg !3			%cmp.7 = icmp sgt i32 %n, 0, !dbg !3
	▲ Show 20 Lines • Show All 99 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/check-prof-info.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 -S < %s \| FileCheck %s
				; RUN: opt -passes="print<block-freq>,loop-vectorize" -prefer-predicate-over-epilog=true -force-vector-width=4 -force-vector-interleave=1 -S < %s \| FileCheck %s -check-prefix=CHECK-MASKED

				target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
				target triple = "x86_64-unknown-linux-gnu"

				@a = dso_local global [1024 x i32] zeroinitializer, align 16
				@b = dso_local global [1024 x i32] zeroinitializer, align 16

				; Check correctness of profile info for vectorization without epilog.
				; Function Attrs: nofree norecurse nounwind uwtable
				define dso_local void @_Z3foov() local_unnamed_addr #0 {
				; CHECK-LABEL: @_Z3foov(
				; CHECK: [[VECTOR_BODY:vector\.body]]:
				; CHECK: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_256:\!.*]],
				; CHECK: [[FOR_BODY:for\.body]]:
				; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
				; CHECK-MASKED: [[VECTOR_BODY:vector\.body]]:
				; CHECK-MASKED: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_256:\!.*]],
				; CHECK-MASKED: [[FOR_BODY:for\.body]]:
				; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
				;
				entry:
				br label %for.body

				for.cond.cleanup: ; preds = %for.body
				ret void

				for.body: ; preds = %for.body, %entry
				%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
				%arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 %indvars.iv
				%0 = load i32, i32* %arrayidx, align 4, !tbaa !2
				%1 = trunc i64 %indvars.iv to i32
				%mul = mul nsw i32 %0, %1
				%arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv
				%2 = load i32, i32* %arrayidx2, align 4, !tbaa !2
				%add = add nsw i32 %2, %mul
				store i32 %add, i32* %arrayidx2, align 4, !tbaa !2
				%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
				%exitcond = icmp eq i64 %indvars.iv.next, 1024
				br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !6
				}

				; Check correctness of profile info for vectorization with epilog.
				; Function Attrs: nofree norecurse nounwind uwtable
				define dso_local void @_Z3foo2v() local_unnamed_addr #0 {
				; CHECK-LABEL: @_Z3foo2v(
				; CHECK: [[VECTOR_BODY:vector\.body]]:
				; CHECK: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_256:\!.*]],
				; CHECK: [[FOR_BODY:for\.body]]:
				; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_3:\!.*]],
				; CHECK-MASKED: [[VECTOR_BODY:vector\.body]]:
				; CHECK-MASKED: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_257:\!.*]],
				; CHECK-MASKED: [[FOR_BODY:for\.body]]:
				; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
				;
				entry:
				br label %for.body

				for.cond.cleanup: ; preds = %for.body
				ret void

				for.body: ; preds = %for.body, %entry
				%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
				%arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 %indvars.iv
				%0 = load i32, i32* %arrayidx, align 4, !tbaa !2
				%1 = trunc i64 %indvars.iv to i32
				%mul = mul nsw i32 %0, %1
				%arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv
				%2 = load i32, i32* %arrayidx2, align 4, !tbaa !2
				%add = add nsw i32 %2, %mul
				store i32 %add, i32* %arrayidx2, align 4, !tbaa !2
				%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
				%exitcond = icmp eq i64 %indvars.iv.next, 1027
				br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !7
				}

				attributes #0 = { "use-soft-float"="false" }

				!llvm.module.flags = !{!0}
				!llvm.ident = !{!1}

				; CHECK: [[LP1_256]] = !{!"branch_weights", i32 1, i32 256}
				; CHECK: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
				; CHECK-MASKED: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
				; CHECK: [[LP1_3]] = !{!"branch_weights", i32 1, i32 3}
				; CHECK-MASKED: [[LP1_257]] = !{!"branch_weights", i32 1, i32 257}

				!0 = !{i32 1, !"wchar_size", i32 4}
				!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project c292b5b5e059e6ce3e6449e6827ef7e1037c21c4)"}
				!2 = !{!3, !3, i64 0}
				!3 = !{!"int", !4, i64 0}
				!4 = !{!"omnipotent char", !5, i64 0}
				!5 = !{!"Simple C++ TBAA"}
				!6 = !{!"branch_weights", i32 1, i32 1024}
				!7 = !{!"branch_weights", i32 1, i32 1027}

llvm/test/Transforms/LoopVectorize/hot_short_tc_loop.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes="print<block-freq>,loop-vectorize" -S < %s 2>&1 \| FileCheck %s		; RUN: opt -passes="print<block-freq>,loop-vectorize" -S < %s 2>&1 \| FileCheck %s

; Check vectorization of hot short trip count with epilog. In this case inner		; Check vectorization of hot short trip count with epilog. In this case inner
; loop trip count is not constant and its value is estimated by profile.		; loop trip count is not constant and its value is estimated by profile.

; ModuleID = 'test.cpp'		; ModuleID = 'test.cpp'
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"		target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"		target triple = "x86_64-unknown-linux-gnu"

@a = dso_local global [5 x i32] zeroinitializer, align 16		@a = dso_local global [5 x i32] zeroinitializer, align 16
@b = dso_local global [5 x i32] zeroinitializer, align 16		@b = dso_local global [5 x i32] zeroinitializer, align 16

; Function Attrs: uwtable		; Function Attrs: uwtable
define dso_local void @_Z3fooi(i32 %M) local_unnamed_addr #0 !prof !11 {		define dso_local void @_Z3fooi(i32 %M) local_unnamed_addr #0 !prof !11 {
		; CHECK-LABEL: @_Z3fooi(
		; CHECK: [[VECTOR_BODY:vector\.body]]:
; CHECK: [[WIDE_LOAD:%.]] = load <4 x i32>, <4 x i32> [[TMP15:%.*]]		; CHECK: [[WIDE_LOAD:%.]] = load <4 x i32>, <4 x i32> [[TMP15:%.*]]
; CHECK: [[TMP18:%.]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND6:%.]]		; CHECK: [[TMP18:%.]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND6:%.]]
; CHECK: [[WIDE_LOAD10:%.]] = load <4 x i32>, <4 x i32> [[TMP23:%.*]]		; CHECK: [[WIDE_LOAD10:%.]] = load <4 x i32>, <4 x i32> [[TMP23:%.*]]
; CHECK: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP18]]		; CHECK: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP18]]
; CHECK: store <4 x i32> [[TMP26]], <4 x i32>* [[TMP28:%.*]]		; CHECK: store <4 x i32> [[TMP26]], <4 x i32>* [[TMP28:%.*]]
;		; CHECK: br i1 [[TMP30:%.]], label [[MIDDLE_BLOCK:%.]], label %[[VECTOR_BODY]], !prof [[LP12:\!.*]],
		; CHECK: [[SCAL_BODY:for\.body4\.us]]:
		; CHECK: br i1 [[EXITCOND:%.]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.]], label %[[SCAL_BODY]], !prof [[LP15:\!.*]],
entry:		entry:
%a = alloca [5 x i32], align 16		%a = alloca [5 x i32], align 16
%b = alloca [5 x i32], align 16		%b = alloca [5 x i32], align 16
%0 = bitcast [5 x i32]* %a to i8*		%0 = bitcast [5 x i32]* %a to i8*
call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3		call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3
%1 = bitcast [5 x i32]* %b to i8*		%1 = bitcast [5 x i32]* %b to i8*
call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3		call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3
%arraydecay = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 0		%arraydecay = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 0
Show All 39 Lines	for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit24, %for.cond.cleanup.loopexit
ret void		ret void
}		}

; Check vectorization of hot short trip count with epilog. In this case inner		; Check vectorization of hot short trip count with epilog. In this case inner
; loop trip count is known constant value.		; loop trip count is known constant value.

; Function Attrs: uwtable		; Function Attrs: uwtable
define dso_local void @_Z3fooi2() local_unnamed_addr #0 !prof !11 {		define dso_local void @_Z3fooi2() local_unnamed_addr #0 !prof !11 {
		; CHECK-LABEL: @_Z3fooi2(
; CHECK: [[WIDE_LOAD:%.]] = load <4 x i32>, <4 x i32> [[TMP15:%.*]]		; CHECK: [[WIDE_LOAD:%.]] = load <4 x i32>, <4 x i32> [[TMP15:%.*]]
; CHECK: [[TMP18:%.]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND6:%.]]		; CHECK: [[TMP18:%.]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND6:%.]]
; CHECK: [[WIDE_LOAD10:%.]] = load <4 x i32>, <4 x i32> [[TMP23:%.*]]		; CHECK: [[WIDE_LOAD10:%.]] = load <4 x i32>, <4 x i32> [[TMP23:%.*]]
; CHECK: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP18]]		; CHECK: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP18]]
; CHECK: store <4 x i32> [[TMP26]], <4 x i32>* [[TMP28:%.*]]		; CHECK: store <4 x i32> [[TMP26]], <4 x i32>* [[TMP28:%.*]]
;		;
entry:		entry:
br label %for.body		br label %for.body
Show All 28 Lines

; This is a negative test. Check that vectorization is not performed for COLD		; This is a negative test. Check that vectorization is not performed for COLD
; short trip count loop requiring epilog. Note that outer loop has only 20		; short trip count loop requiring epilog. Note that outer loop has only 20
; iterations and there is no associated profile info.		; iterations and there is no associated profile info.


; Function Attrs: uwtable		; Function Attrs: uwtable
define dso_local void @_Z3fooi3(i32 %M) local_unnamed_addr #0 !prof !11 {		define dso_local void @_Z3fooi3(i32 %M) local_unnamed_addr #0 !prof !11 {
		; CHECK-LABEL: @_Z3fooi3(
		; CHECK: [[SCAL_BODY:for\.body4\.us]]:
; CHECK: [[TMP2:%.]] = load i32, i32 [[ARRAYIDX_US:%.*]]		; CHECK: [[TMP2:%.]] = load i32, i32 [[ARRAYIDX_US:%.*]]
; CHECK: [[MUL_US:%.]] = mul nsw i32 [[TMP2]], [[TMP3:%.]]		; CHECK: [[MUL_US:%.]] = mul nsw i32 [[TMP2]], [[TMP3:%.]]
; CHECK: [[TMP4:%.]] = load i32, i32 [[ARRAYIDX6_US:%.*]]		; CHECK: [[TMP4:%.]] = load i32, i32 [[ARRAYIDX6_US:%.*]]
; CHECK: [[ADD_US:%.*]] = add nsw i32 [[TMP4]], [[MUL_US]]		; CHECK: [[ADD_US:%.*]] = add nsw i32 [[TMP4]], [[MUL_US]]
; CHECK: store i32 [[ADD_US]], i32* [[ARRAYIDX6_US]]		; CHECK: store i32 [[ADD_US]], i32* [[ARRAYIDX6_US]]
		; CHECK: br i1 [[EXITCOND:%.]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.]], label %[[SCAL_BODY]], !prof [[LP26:\!.*]]
;		;
entry:		entry:
%a = alloca [5 x i32], align 16		%a = alloca [5 x i32], align 16
%b = alloca [5 x i32], align 16		%b = alloca [5 x i32], align 16
%0 = bitcast [5 x i32]* %a to i8*		%0 = bitcast [5 x i32]* %a to i8*
call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3		call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3
%1 = bitcast [5 x i32]* %b to i8*		%1 = bitcast [5 x i32]* %b to i8*
call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3		call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3
▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1		declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1

attributes #0 = { "use-soft-float"="false" }		attributes #0 = { "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }		attributes #1 = { argmemonly nounwind willreturn }

!llvm.module.flags = !{!0}		!llvm.module.flags = !{!0}
!llvm.ident = !{!1}		!llvm.ident = !{!1}

		; CHECK: [[LP12]] = !{!"branch_weights", i32 0, i32 0}
		; CHECK: [[LP15]] = !{!"branch_weights", i32 999, i32 4995}
		; CHECK: [[LP26]] = !{!"branch_weights", i32 9, i32 45}

!0 = !{i32 1, !"wchar_size", i32 4}		!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project f379dd57b978c4e1483d721f422c79e3c0c5ccdc)"}		!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project f379dd57b978c4e1483d721f422c79e3c0c5ccdc)"}
!2 = !{!3, !3, i64 0}		!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}		!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}		!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C++ TBAA"}		!5 = !{!"Simple C++ TBAA"}
!6 = distinct !{!6, !7}		!6 = distinct !{!6, !7}
!7 = !{!"llvm.loop.isvectorized", i32 1}		!7 = !{!"llvm.loop.isvectorized", i32 1}
!8 = distinct !{!8, !9, !7}		!8 = distinct !{!8, !9, !7}
!9 = !{!"llvm.loop.unroll.runtime.disable"}		!9 = !{!"llvm.loop.unroll.runtime.disable"}
!10 = !{!"branch_weights", i32 999, i32 4995}		!10 = !{!"branch_weights", i32 999, i32 4995}
!11 = !{!"function_entry_count", i64 1}		!11 = !{!"function_entry_count", i64 1}
!12 = !{!"branch_weights", i32 1, i32 999}		!12 = !{!"branch_weights", i32 1, i32 999}
!13 = !{!"branch_weights", i32 1000, i32 1}		!13 = !{!"branch_weights", i32 1000, i32 1}
!14 = !{!"branch_weights", i32 9, i32 45}		!14 = !{!"branch_weights", i32 9, i32 45}

llvm/test/Transforms/LoopVectorize/interleave_short_tc.ll

This file was added.

				; Check that we won't interleave by more than "best known" estimated trip count.

				; The loop is expected to be vectorized by 4 and interleaving suppressed due to
				; short trip count which is controlled by "tiny-trip-count-interleave-threshold".
				; RUN: opt -passes=loop-vectorize -force-vector-width=4 -vectorizer-min-trip-count=4 -S < %s | FileCheck %s
				;
				; The loop is expected to be vectorized by 4 and computed interleaving factor is 1.
				; Thus the resulting step is 4.
				; RUN: opt -passes=loop-vectorize -force-vector-width=4 -vectorizer-min-trip-count=4 -tiny-trip-count-interleave-threshold=4 -S < %s | FileCheck %s

				; The loop is expected to be vectorized by 2 and computed interleaving factor is 2.
				; Thus the resulting step is 4.
				; RUN: opt -passes=loop-vectorize -force-vector-width=2 -vectorizer-min-trip-count=4 -tiny-trip-count-interleave-threshold=4 -S < %s | FileCheck %s

				; Check that we won't interleave by more than "best known" estimated trip count.

				; Target description the test is run against (x86-64 Linux triple).
				target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
				target triple = "x86_64-unknown-linux-gnu"

				; Two zero-initialized 5-element i32 arrays used by the loop in @_Z3fooi:
				; @a is both loaded and stored, @b is only loaded.
				@a = dso_local global [5 x i32] zeroinitializer, align 16
				@b = dso_local global [5 x i32] zeroinitializer, align 16

				; Function Attrs: nofree norecurse nounwind uwtable
				; Test input: for i in [0, %M) computes a[i] = a[i] + b[i] * i on the globals
				; @a and @b. The loop is bypassed entirely when %M <= 0.
				; The FileCheck lines below verify that the vector loop's induction variable
				; advances by 4 per iteration (VF * IC == 4 for every RUN configuration).
				define dso_local void @_Z3fooi(i32 %M) local_unnamed_addr #0 {
				; CHECK-LABEL: @_Z3fooi(
				; CHECK: [[VECTOR_BODY:vector\.body]]:
				; CHECK: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
				; CHECK: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
				;
				entry:
				; Guard: only enter the loop when %M is strictly positive.
				%cmp8 = icmp sgt i32 %M, 0
				br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup

				for.body.preheader: ; preds = %entry
				; Widen the (known positive) trip count to the i64 induction variable type.
				%wide.trip.count = zext i32 %M to i64
				br label %for.body

				for.cond.cleanup.loopexit: ; preds = %for.body
				br label %for.cond.cleanup

				for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
				ret void

				for.body: ; preds = %for.body, %for.body.preheader
				%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
				; %mul = b[i] * i
				%arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 %indvars.iv
				%0 = load i32, i32* %arrayidx, align 4, !tbaa !2
				%1 = trunc i64 %indvars.iv to i32
				%mul = mul nsw i32 %0, %1
				; a[i] = a[i] + %mul
				%arrayidx2 = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 %indvars.iv
				%2 = load i32, i32* %arrayidx2, align 4, !tbaa !2
				%add = add nsw i32 %2, %mul
				store i32 %add, i32* %arrayidx2, align 4, !tbaa !2
				%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
				; Latch: exit once the next index equals the trip count. The !prof !6 weights
				; (1 : 5) presumably supply the short "best known" estimated trip count that
				; the vectorizer is expected to honour -- confirm against the pass.
				%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
				br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !prof !6
				}

				; Attribute group #0, referenced by the definition of @_Z3fooi.
				attributes #0 = { "use-soft-float"="false" }

				!llvm.module.flags = !{!0}
				!llvm.ident = !{!1}

				!0 = !{i32 1, !"wchar_size", i32 4}
				!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project c292b5b5e059e6ce3e6449e6827ef7e1037c21c4)"}
				; !2 is the TBAA access tag attached to the loop's loads and store;
				; !3..!5 are the type nodes it is built from.
				!2 = !{!3, !3, i64 0}
				!3 = !{!"int", !4, i64 0}
				!4 = !{!"omnipotent char", !5, i64 0}
				!5 = !{!"Simple C++ TBAA"}
				; Branch weights attached to the loop latch branch in @_Z3fooi:
				; weight 1 for the first successor (loop exit), weight 5 for the back edge.
				!6 = !{!"branch_weights", i32 1, i32 5}

llvm/test/Transforms/LoopVectorize/tripcount.ll

Show First 20 Lines • Show All 55 Lines • ▼ Show 20 Lines	for.end: ; preds = %for.body
ret i32 0		ret i32 0
}		}

define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {		define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
; The loop has a low invocation count compared to the function invocation count,		; The loop has a low invocation count compared to the function invocation count,
; but has a high trip count per invocation. Vectorize it.		; but has a high trip count per invocation. Vectorize it.

; CHECK-LABEL: @foo_low_trip_count3(		; CHECK-LABEL: @foo_low_trip_count3(
; CHECK: vector.body:		; CHECK: [[VECTOR_BODY:vector\.body]]:
		; CHECK: br i1 [[TMP9:%.]], label [[MIDDLE_BLOCK:%.]], label %[[VECTOR_BODY]], !prof [[LP3:\!.*]],
		; CHECK: [[FOR_BODY:for\.body]]:
		; CHECK: br i1 [[EXITCOND:%.]], label [[FOR_END_LOOPEXIT:%.]], label %[[FOR_BODY]], !prof [[LP6:\!.*]],
entry:		entry:
br i1 %cond, label %for.preheader, label %for.end, !prof !2		br i1 %cond, label %for.preheader, label %for.end, !prof !2

for.preheader:		for.preheader:
br label %for.body		br label %for.body

for.body: ; preds = %for.body, %entry		for.body: ; preds = %for.body, %entry
%i.08 = phi i32 [ 0, %for.preheader ], [ %inc, %for.body ]		%i.08 = phi i32 [ 0, %for.preheader ], [ %inc, %for.body ]
▲ Show 20 Lines • Show All 126 Lines • ▼ Show 20 Lines	for.body: ; preds = %for.body, %entry
%inc = add nsw i32 %i.08, 1		%inc = add nsw i32 %i.08, 1
%exitcond = icmp slt i32 %i.08, 1000		%exitcond = icmp slt i32 %i.08, 1000
br i1 %exitcond, label %for.body, label %for.end, !prof !1		br i1 %exitcond, label %for.body, label %for.end, !prof !1

for.end: ; preds = %for.body		for.end: ; preds = %for.body
ret i32 0		ret i32 0
}		}

		; CHECK: [[LP3]] = !{!"branch_weights", i32 10, i32 2500}
		; CHECK: [[LP6]] = !{!"branch_weights", i32 0, i32 0}

!0 = !{!"function_entry_count", i64 100}		!0 = !{!"function_entry_count", i64 100}
!1 = !{!"branch_weights", i32 100, i32 0}		!1 = !{!"branch_weights", i32 100, i32 0}
!2 = !{!"branch_weights", i32 10, i32 90}		!2 = !{!"branch_weights", i32 10, i32 90}
!3 = !{!"branch_weights", i32 10, i32 10000}		!3 = !{!"branch_weights", i32 10, i32 10000}

This is an archive of the discontinued LLVM Phabricator instance.

[LV] Allow vectorization of hot short trip count loops with epilog
AbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 222118

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll

llvm/test/Transforms/LoopVectorize/check-prof-info.ll

llvm/test/Transforms/LoopVectorize/hot_short_tc_loop.ll

llvm/test/Transforms/LoopVectorize/interleave_short_tc.ll

llvm/test/Transforms/LoopVectorize/tripcount.ll

This is an archive of the discontinued LLVM Phabricator instance.

[LV] Allow vectorization of hot short trip count loops with epilogAbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 222118

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll

llvm/test/Transforms/LoopVectorize/check-prof-info.ll

llvm/test/Transforms/LoopVectorize/hot_short_tc_loop.ll

llvm/test/Transforms/LoopVectorize/interleave_short_tc.ll

llvm/test/Transforms/LoopVectorize/tripcount.ll

[LV] Allow vectorization of hot short trip count loops with epilog
AbandonedPublic