Diff 45538

llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp

Show All 9 Lines
// stores that can be put together into vector-stores. Next, it attempts to		// stores that can be put together into vector-stores. Next, it attempts to
// construct vectorizable tree using the use-def chains. If a profitable tree		// construct vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.		// was found, the SLP vectorizer performs vectorization on the tree.
//		//
// The pass is inspired by the work described in the paper:		// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.		// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/MapVector.h"		#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Optional.h"		#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"		#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"		#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"		#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"		#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"		#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"		#include "llvm/Analysis/CodeMetrics.h"
		#include "llvm/Analysis/DemandedBits.h"
		#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"		#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"		#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"		#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"		#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"		#include "llvm/Analysis/ValueTracking.h"
		#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"		#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"		#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"		#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"		#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"		#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"		#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"		#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Type.h"		#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"		#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"		#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"		#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"		#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"		#include "llvm/Support/raw_ostream.h"
#include "llvm/Analysis/VectorUtils.h"		#include "llvm/Transforms/Vectorize.h"
#include <algorithm>		#include <algorithm>
#include <map>		#include <map>
#include <memory>		#include <memory>

using namespace llvm;		using namespace llvm;

#define SV_NAME "slp-vectorizer"		#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"		#define DEBUG_TYPE "SLP"
▲ Show 20 Lines • Show All 302 Lines • ▼ Show 20 Lines
public:		public:
typedef SmallVector<Value *, 8> ValueList;		typedef SmallVector<Value *, 8> ValueList;
typedef SmallVector<Instruction *, 16> InstrList;		typedef SmallVector<Instruction *, 16> InstrList;
typedef SmallPtrSet<Value *, 16> ValueSet;		typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList;		typedef SmallVector<StoreInst *, 8> StoreList;

BoUpSLP(Function Func, ScalarEvolution Se, TargetTransformInfo *Tti,		BoUpSLP(Function Func, ScalarEvolution Se, TargetTransformInfo *Tti,
TargetLibraryInfo TLi, AliasAnalysis Aa, LoopInfo *Li,		TargetLibraryInfo TLi, AliasAnalysis Aa, LoopInfo *Li,
DominatorTree Dt, AssumptionCache AC)		DominatorTree Dt, AssumptionCache AC, DemandedBits *DB)
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),		: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),		SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
Builder(Se->getContext()) {		Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);		CodeMetrics::collectEphemeralValues(F, AC, EphValues);
		MaxRequiredIntegerTy = nullptr;
}		}

/// \brief Vectorize the tree that starts with the elements in \p VL.		/// \brief Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.		/// Returns the vectorized root.
Value *vectorizeTree();		Value *vectorizeTree();

/// \returns the cost incurred by unwanted spills and fills, caused by		/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.		/// holding live values over call sites.
Show All 15 Lines	void deleteTree() {
MustGather.clear();		MustGather.clear();
ExternalUses.clear();		ExternalUses.clear();
NumLoadsWantToKeepOrder = 0;		NumLoadsWantToKeepOrder = 0;
NumLoadsWantToChangeOrder = 0;		NumLoadsWantToChangeOrder = 0;
for (auto &Iter : BlocksSchedules) {		for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();		BlockScheduling *BS = Iter.second.get();
BS->clear();		BS->clear();
}		}
		MaxRequiredIntegerTy = nullptr;
}		}

/// \returns true if the memory operations A and B are consecutive.		/// \returns true if the memory operations A and B are consecutive.
bool isConsecutiveAccess(Value A, Value B, const DataLayout &DL);		bool isConsecutiveAccess(Value A, Value B, const DataLayout &DL);

/// \brief Perform LICM and CSE on the newly generated gather sequences.		/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();		void optimizeGatherSequence();

/// \returns true if it is beneficial to reverse the vector order.		/// \returns true if it is beneficial to reverse the vector order.
bool shouldReorder() const {		bool shouldReorder() const {
return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;		return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
}		}

/// \return The vector element size in bits to use when vectorizing the		/// \return The vector element size in bits to use when vectorizing the
/// expression tree ending at \p V. If V is a store, the size is the width of		/// expression tree ending at \p V. If V is a store, the size is the width of
/// the stored value. Otherwise, the size is the width of the largest loaded		/// the stored value. Otherwise, the size is the width of the largest loaded
/// value reaching V. This method is used by the vectorizer to calculate		/// value reaching V. This method is used by the vectorizer to calculate
/// vectorization factors.		/// vectorization factors.
unsigned getVectorElementSize(Value *V);		unsigned getVectorElementSize(Value *V);

		/// Compute the maximum width integer type required to represent the result
		/// of a scalar expression, if such a type exists.
		void computeMaxRequiredIntegerTy();

private:		private:
struct TreeEntry;		struct TreeEntry;

/// \returns the cost of the vectorizable entry.		/// \returns the cost of the vectorizable entry.
int getEntryCost(TreeEntry *E);		int getEntryCost(TreeEntry *E);

/// This is the recursive part of buildTree.		/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);		void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
▲ Show 20 Lines • Show All 489 Lines • ▼ Show 20 Lines	#endif
// Analysis and block reference.		// Analysis and block reference.
Function *F;		Function *F;
ScalarEvolution *SE;		ScalarEvolution *SE;
TargetTransformInfo *TTI;		TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;		TargetLibraryInfo *TLI;
AliasAnalysis *AA;		AliasAnalysis *AA;
LoopInfo *LI;		LoopInfo *LI;
DominatorTree *DT;		DominatorTree *DT;
		AssumptionCache *AC;
		DemandedBits *DB;
/// Instruction builder to construct the vectorized tree.		/// Instruction builder to construct the vectorized tree.
IRBuilder<> Builder;		IRBuilder<> Builder;

		// The maximum width integer type required to represent a scalar expression.
		IntegerType *MaxRequiredIntegerTy;
};		};

#ifndef NDEBUG		#ifndef NDEBUG
raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {		raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
SD.dump(os);		SD.dump(os);
return os;		return os;
}		}
#endif		#endif
▲ Show 20 Lines • Show All 539 Lines • ▼ Show 20 Lines
int BoUpSLP::getEntryCost(TreeEntry *E) {		int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;		ArrayRef<Value*> VL = E->Scalars;

Type *ScalarTy = VL[0]->getType();		Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))		if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();		ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());		VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

		// If we have computed a smaller type for the expression, update VecTy so
		// that the costs will be accurate.
		if (MaxRequiredIntegerTy) {
		auto *IT = dyn_cast<IntegerType>(ScalarTy);
		assert(IT && "Computed smaller type for non-integer value?");
		if (MaxRequiredIntegerTy->getBitWidth() < IT->getBitWidth())
		VecTy = VectorType::get(MaxRequiredIntegerTy, VL.size());
		}

if (E->NeedToGather) {		if (E->NeedToGather) {
if (allConstant(VL))		if (allConstant(VL))
return 0;		return 0;
if (isSplat(VL)) {		if (isSplat(VL)) {
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);		return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
}		}
return getGatherCost(E->Scalars);		return getGatherCost(E->Scalars);
}		}
▲ Show 20 Lines • Show All 312 Lines • ▼ Show 20 Lines	if (!ExtractCostCalculated.insert(EU.Scalar).second)
continue;		continue;

// Uses by ephemeral values are free (because the ephemeral value will be		// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be		// removed prior to code generation, and so the extraction will be
// removed as well).		// removed as well).
if (EphValues.count(EU.User))		if (EphValues.count(EU.User))
continue;		continue;

VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);		// If we plan to rewrite the tree in a smaller type, we will need to sign
ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,		// extend the extracted value back to the original type. Here, we account
EU.Lane);		// for the extract and the added cost of the sign extend if needed.
		auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
		if (MaxRequiredIntegerTy) {
		VecTy = VectorType::get(MaxRequiredIntegerTy, BundleWidth);
		ExtractCost += TTI->getCastInstrCost(
		Instruction::SExt, EU.Scalar->getType(), MaxRequiredIntegerTy);
		}
		ExtractCost +=
		TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
}		}

Cost += getSpillCost();		Cost += getSpillCost();

DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");		DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
return Cost + ExtractCost;		return Cost + ExtractCost;
}		}

▲ Show 20 Lines • Show All 738 Lines • ▼ Show 20 Lines
Value *BoUpSLP::vectorizeTree() {		Value *BoUpSLP::vectorizeTree() {

// All blocks must be scheduled before any instructions are inserted.		// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {		for (auto &BSIter : BlocksSchedules) {
scheduleBlock(BSIter.second.get());		scheduleBlock(BSIter.second.get());
}		}

Builder.SetInsertPoint(&F->getEntryBlock().front());		Builder.SetInsertPoint(&F->getEntryBlock().front());
vectorizeTree(&VectorizableTree[0]);		auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);

		// If the vectorized tree can be rewritten in a smaller type, we truncate the
		// vectorized root. InstCombine will then rewrite the entire expression. We
		// sign extend the extracted values below.
		if (MaxRequiredIntegerTy) {
		BasicBlock::iterator I(cast<Instruction>(VectorRoot));
		Builder.SetInsertPoint(&*++I);
		auto BundleWidth = VectorizableTree[0].Scalars.size();
		auto *SmallerTy = VectorType::get(MaxRequiredIntegerTy, BundleWidth);
		auto *Trunc = Builder.CreateTrunc(VectorRoot, SmallerTy);
		VectorizableTree[0].VectorizedValue = Trunc;
		}

DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");		DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");

// Extract all of the elements with the external uses.		// Extract all of the elements with the external uses.
for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();		for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
it != e; ++it) {		it != e; ++it) {
Value *Scalar = it->Scalar;		Value *Scalar = it->Scalar;
llvm::User *User = it->User;		llvm::User *User = it->User;
Show All 16 Lines	for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
// Generate extracts for out-of-tree users.		// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.		// Find the insertion point for the extractelement lane.
if (isa<Instruction>(Vec)){		if (isa<Instruction>(Vec)){
if (PHINode *PH = dyn_cast<PHINode>(User)) {		if (PHINode *PH = dyn_cast<PHINode>(User)) {
for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {		for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
if (PH->getIncomingValue(i) == Scalar) {		if (PH->getIncomingValue(i) == Scalar) {
Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());		Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);		Value *Ex = Builder.CreateExtractElement(Vec, Lane);
		if (MaxRequiredIntegerTy)
		Ex = Builder.CreateSExt(Ex, Scalar->getType());
CSEBlocks.insert(PH->getIncomingBlock(i));		CSEBlocks.insert(PH->getIncomingBlock(i));
PH->setOperand(i, Ex);		PH->setOperand(i, Ex);
}		}
}		}
} else {		} else {
Builder.SetInsertPoint(cast<Instruction>(User));		Builder.SetInsertPoint(cast<Instruction>(User));
Value *Ex = Builder.CreateExtractElement(Vec, Lane);		Value *Ex = Builder.CreateExtractElement(Vec, Lane);
		if (MaxRequiredIntegerTy)
		Ex = Builder.CreateSExt(Ex, Scalar->getType());
CSEBlocks.insert(cast<Instruction>(User)->getParent());		CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);		User->replaceUsesOfWith(Scalar, Ex);
}		}
} else {		} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());		Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);		Value *Ex = Builder.CreateExtractElement(Vec, Lane);
		if (MaxRequiredIntegerTy)
		Ex = Builder.CreateSExt(Ex, Scalar->getType());
CSEBlocks.insert(&F->getEntryBlock());		CSEBlocks.insert(&F->getEntryBlock());
User->replaceUsesOfWith(Scalar, Ex);		User->replaceUsesOfWith(Scalar, Ex);
}		}

DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");		DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
}		}

// For each vectorized value:		// For each vectorized value:
▲ Show 20 Lines • Show All 552 Lines • ▼ Show 20 Lines	while (!Worklist.empty() && !FoundUnknownInst) {
// instruction has a vector type, give up.		// instruction has a vector type, give up.
auto *Ty = I->getType();		auto *Ty = I->getType();
if (isa<VectorType>(Ty))		if (isa<VectorType>(Ty))
FoundUnknownInst = true;		FoundUnknownInst = true;

// If the current instruction is a load, update MaxWidth to reflect the		// If the current instruction is a load, update MaxWidth to reflect the
// width of the loaded value.		// width of the loaded value.
else if (isa<LoadInst>(I))		else if (isa<LoadInst>(I))
MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty));		MaxWidth = std::max<unsigned>(MaxWidth, DL.getTypeSizeInBits(Ty));

// Otherwise, we need to visit the operands of the instruction. We only		// Otherwise, we need to visit the operands of the instruction. We only
// handle the interesting cases from buildTree here. If an operand is an		// handle the interesting cases from buildTree here. If an operand is an
// instruction we haven't yet visited, we add it to the worklist.		// instruction we haven't yet visited, we add it to the worklist.
else if (isa<PHINode>(I) \|\| isa<CastInst>(I) \|\| isa<GetElementPtrInst>(I) \|\|		else if (isa<PHINode>(I) \|\| isa<CastInst>(I) \|\| isa<GetElementPtrInst>(I) \|\|
isa<CmpInst>(I) \|\| isa<SelectInst>(I) \|\| isa<BinaryOperator>(I)) {		isa<CmpInst>(I) \|\| isa<SelectInst>(I) \|\| isa<BinaryOperator>(I)) {
for (Use &U : I->operands())		for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))		if (auto *J = dyn_cast<Instruction>(U.get()))
Show All 10 Lines	unsigned BoUpSLP::getVectorElementSize(Value *V) {
// gave up for some reason, just return the width of V.		// gave up for some reason, just return the width of V.
if (!MaxWidth \|\| FoundUnknownInst)		if (!MaxWidth \|\| FoundUnknownInst)
return DL.getTypeSizeInBits(V->getType());		return DL.getTypeSizeInBits(V->getType());

// Otherwise, return the maximum width we found.		// Otherwise, return the maximum width we found.
return MaxWidth;		return MaxWidth;
}		}

		/// Decide whether the current vectorizable tree can be evaluated in a
		/// narrower integer type and, if so, record that type in
		/// MaxRequiredIntegerTy. MaxRequiredIntegerTy is left untouched whenever
		/// narrowing is not known to be safe or not useful.
		///
		/// Narrowing is only attempted when every external use refers to a root of
		/// the tree, the root is a single-use integer instruction, and the computed
		/// width is strictly smaller than the root's type.
		void BoUpSLP::computeMaxRequiredIntegerTy() {

		  // If there are no external uses, the expression tree must be rooted by a
		  // store. We can't demote in-memory values, so there is nothing to do here.
		  if (ExternalUses.empty())
		    return;

		  // If the expression is not rooted by a store, these roots should have
		  // external uses. We will rely on InstCombine to rewrite the expression in
		  // the narrower type. However, InstCombine only rewrites single-use values.
		  // This means that if a tree entry other than a root is used externally, it
		  // must have multiple uses and InstCombine will not rewrite it. The code
		  // below ensures that only the roots are used externally: each external
		  // use must consume exactly one distinct root, and no root may be left
		  // without an external use.
		  auto &TreeRoot = VectorizableTree[0].Scalars;
		  SmallPtrSet<Value *, 16> ScalarRoots(TreeRoot.begin(), TreeRoot.end());
		  for (auto &EU : ExternalUses)
		    if (!ScalarRoots.erase(EU.Scalar))
		      return;
		  if (!ScalarRoots.empty())
		    return;

		  // We only narrow single-use integer instructions.
		  auto *Root = dyn_cast<Instruction>(TreeRoot[0]);
		  if (!Root || !isa<IntegerType>(Root->getType()) || !Root->hasOneUse())
		    return;

		  // The maximum bit width required to represent all the instructions in the
		  // tree without loss of precision. It would be safe to truncate the
		  // expression to this width. We never go below 8 bits.
		  auto MaxBitWidth = 8u;

		  // We first check if all the bits of the root are demanded. If they're not,
		  // we can truncate the root to this narrower type. The 8-bit floor is kept
		  // so that we never ask for a sub-byte element type.
		  auto Mask = DB->getDemandedBits(Root);
		  if (Mask.countLeadingZeros() > 0) {
		    MaxBitWidth = std::max<unsigned>(
		        Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
		  } else {
		    // If all the bits of the root are demanded, we can try a little harder
		    // to compute a narrower type. This can happen, for example, if the
		    // roots are getelementptr indices. InstCombine promotes these indices
		    // to the pointer width. Thus, all their bits are technically demanded
		    // even though the address computation might be vectorized in a smaller
		    // type. We start by looking at each entry in the tree.
		    auto &DL = F->getParent()->getDataLayout();
		    for (auto &Entry : VectorizableTree) {

		      // Get a representative value for the vectorizable bundle. All values
		      // in Entry.Scalars should be isomorphic.
		      auto *Scalar = Entry.Scalars[0];

		      // If the scalar is used more than once, InstCombine will not rewrite
		      // it, so we should give up.
		      if (!Scalar->hasOneUse())
		        return;

		      // We only compute smaller integer types. If the scalar has a
		      // different type, give up.
		      auto *IT = dyn_cast<IntegerType>(Scalar->getType());
		      if (!IT)
		        return;

		      // Compute the maximum bit width required to store the scalar. We use
		      // ValueTracking to compute the number of high-order bits that are
		      // copies of the sign bit. The extracted values are sign extended
		      // back to the original type, so the narrow type must still hold one
		      // copy of the sign bit: hence the "+ 1". ComputeNumSignBits returns
		      // at least 1, so this never exceeds the width of the original type.
		      auto NumSignBits = ComputeNumSignBits(Scalar, DL, 0, AC, 0, DT);
		      auto NumTypeBits = IT->getBitWidth();
		      MaxBitWidth =
		          std::max<unsigned>(NumTypeBits - NumSignBits + 1, MaxBitWidth);
		    }
		  }

		  // Round up to the next power-of-two.
		  if (!isPowerOf2_64(MaxBitWidth))
		    MaxBitWidth = NextPowerOf2(MaxBitWidth);

		  // If the maximum bit width we compute is less than the width of the roots'
		  // type, we can proceed with the narrowing. Otherwise, do nothing.
		  // MaxBitWidth is at least 8 here, so no separate non-zero check is needed.
		  auto *RootIT = cast<IntegerType>(Root->getType());
		  if (MaxBitWidth < RootIT->getBitWidth())
		    MaxRequiredIntegerTy = IntegerType::get(F->getContext(), MaxBitWidth);
		}

/// The SLPVectorizer Pass.		/// The SLPVectorizer Pass.
struct SLPVectorizer : public FunctionPass {		struct SLPVectorizer : public FunctionPass {
typedef SmallVector<StoreInst *, 8> StoreList;		typedef SmallVector<StoreInst *, 8> StoreList;
typedef MapVector<Value *, StoreList> StoreListMap;		typedef MapVector<Value *, StoreList> StoreListMap;
typedef SmallVector<WeakVH, 8> WeakVHList;		typedef SmallVector<WeakVH, 8> WeakVHList;
typedef MapVector<Value *, WeakVHList> WeakVHListMap;		typedef MapVector<Value *, WeakVHList> WeakVHListMap;

/// Pass identification, replacement for typeid		/// Pass identification, replacement for typeid
static char ID;		static char ID;

explicit SLPVectorizer() : FunctionPass(ID) {		explicit SLPVectorizer() : FunctionPass(ID) {
initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());		initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
}		}

ScalarEvolution *SE;		ScalarEvolution *SE;
TargetTransformInfo *TTI;		TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;		TargetLibraryInfo *TLI;
AliasAnalysis *AA;		AliasAnalysis *AA;
LoopInfo *LI;		LoopInfo *LI;
DominatorTree *DT;		DominatorTree *DT;
AssumptionCache *AC;		AssumptionCache *AC;
		DemandedBits *DB;

bool runOnFunction(Function &F) override {		bool runOnFunction(Function &F) override {
if (skipOptnoneFunction(F))		if (skipOptnoneFunction(F))
return false;		return false;

SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();		SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);		TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();		auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
TLI = TLIP ? &TLIP->getTLI() : nullptr;		TLI = TLIP ? &TLIP->getTLI() : nullptr;
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();		AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();		LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();		DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);		AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
		DB = &getAnalysis<DemandedBits>();

Stores.clear();		Stores.clear();
GEPs.clear();		GEPs.clear();
bool Changed = false;		bool Changed = false;

// If the target claims to have no vector registers don't attempt		// If the target claims to have no vector registers don't attempt
// vectorization.		// vectorization.
if (!TTI->getNumberOfRegisters(true))		if (!TTI->getNumberOfRegisters(true))
Show All 13 Lines	bool runOnFunction(Function &F) override {
// Don't vectorize when the attribute NoImplicitFloat is used.		// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))		if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;		return false;

DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");		DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

// Use the bottom up slp vectorizer to construct chains that start with		// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.		// store instructions.
BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);		BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB);

// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to		// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.		// delete instructions.

// Scan the blocks in the function in post order.		// Scan the blocks in the function in post order.
for (auto BB : post_order(&F.getEntryBlock())) {		for (auto BB : post_order(&F.getEntryBlock())) {
collectSeedInstructions(BB);		collectSeedInstructions(BB);

Show All 26 Lines	struct SLPVectorizer : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {		void getAnalysisUsage(AnalysisUsage &AU) const override {
FunctionPass::getAnalysisUsage(AU);		FunctionPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();		AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolutionWrapperPass>();		AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();		AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();		AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();		AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();		AU.addRequired<DominatorTreeWrapperPass>();
		AU.addRequired<DemandedBits>();
AU.addPreserved<LoopInfoWrapperPass>();		AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();		AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();		AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();		AU.addPreserved<GlobalsAAWrapperPass>();
AU.setPreservesCFG();		AU.setPreservesCFG();
}		}

private:		private:
▲ Show 20 Lines • Show All 88 Lines • ▼ Show 20 Lines	for (unsigned i = 0, e = ChainLen; i < e; ++i) {
if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))		if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
continue;		continue;

DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i		DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
<< "\n");		<< "\n");
ArrayRef<Value *> Operands = Chain.slice(i, VF);		ArrayRef<Value *> Operands = Chain.slice(i, VF);

R.buildTree(Operands);		R.buildTree(Operands);
		R.computeMaxRequiredIntegerTy();

int Cost = R.getTreeCost();		int Cost = R.getTreeCost();

DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");		DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
if (Cost < CostThreshold) {		if (Cost < CostThreshold) {
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");		DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
R.vectorizeTree();		R.vectorizeTree();

▲ Show 20 Lines • Show All 183 Lines • ▼ Show 20 Lines	for (unsigned i = 0, e = VL.size(); i < e; ++i) {
// TODO: check if we can allow reordering also for other cases than		// TODO: check if we can allow reordering also for other cases than
// tryToVectorizePair()		// tryToVectorizePair()
if (allowReorder && R.shouldReorder()) {		if (allowReorder && R.shouldReorder()) {
assert(Ops.size() == 2);		assert(Ops.size() == 2);
assert(BuildVectorSlice.empty());		assert(BuildVectorSlice.empty());
Value *ReorderedOps[] = { Ops[1], Ops[0] };		Value *ReorderedOps[] = { Ops[1], Ops[0] };
R.buildTree(ReorderedOps, None);		R.buildTree(ReorderedOps, None);
}		}
		R.computeMaxRequiredIntegerTy();
int Cost = R.getTreeCost();		int Cost = R.getTreeCost();

if (Cost < -SLPCostThreshold) {		if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");		DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
Value *VectorizedRoot = R.vectorizeTree();		Value *VectorizedRoot = R.vectorizeTree();

// Reconstruct the build vector by extracting the vectorized root. This		// Reconstruct the build vector by extracting the vectorized root. This
// way we handle the case where some elements of the vector are undefined.		// way we handle the case where some elements of the vector are undefined.
▲ Show 20 Lines • Show All 250 Lines • ▼ Show 20 Lines	bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
IRBuilder<> Builder(ReductionRoot);		IRBuilder<> Builder(ReductionRoot);
FastMathFlags Unsafe;		FastMathFlags Unsafe;
Unsafe.setUnsafeAlgebra();		Unsafe.setUnsafeAlgebra();
Builder.setFastMathFlags(Unsafe);		Builder.setFastMathFlags(Unsafe);
unsigned i = 0;		unsigned i = 0;

for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {		for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);		V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
		V.computeMaxRequiredIntegerTy();

// Estimate cost.		// Estimate cost.
int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);		int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
if (Cost >= -SLPCostThreshold)		if (Cost >= -SLPCostThreshold)
break;		break;

DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost		DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
<< ". (HorRdx)\n");		<< ". (HorRdx)\n");
▲ Show 20 Lines • Show All 518 Lines • Show Last 20 Lines

llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll

; RUN: opt -S -slp-vectorizer -dce -instcombine < %s \| FileCheck %s		; RUN: opt -S -slp-vectorizer -dce -instcombine < %s \| FileCheck %s --check-prefix=PROFITABLE
		; RUN: opt -S -slp-vectorizer -slp-threshold=-12 -dce -instcombine < %s \| FileCheck %s --check-prefix=UNPROFITABLE

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"		target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"		target triple = "aarch64--linux-gnu"

; These tests check that we vectorize the index calculations in the		; These tests check that we vectorize the index calculations in the
; gather-reduce pattern shown below. We check cases having i32 and i64		; gather-reduce pattern shown below. We check cases having i32 and i64
; subtraction.		; subtraction.
;		;
; int gather_reduce_8x16(short a, short b, short *g, int n) {		; int gather_reduce_8x16(short a, short b, short *g, int n) {
; int sum = 0;		; int sum = 0;
; for (int i = 0; i < n ; ++i) {		; for (int i = 0; i < n ; ++i) {
; sum += g[a++ - b[0]]; sum += g[a++ - b[1]];		; sum += g[a++ - b[0]]; sum += g[a++ - b[1]];
; sum += g[a++ - b[2]]; sum += g[a++ - b[3]];		; sum += g[a++ - b[2]]; sum += g[a++ - b[3]];
; sum += g[a++ - b[4]]; sum += g[a++ - b[5]];		; sum += g[a++ - b[4]]; sum += g[a++ - b[5]];
; sum += g[a++ - b[6]]; sum += g[a++ - b[7]];		; sum += g[a++ - b[6]]; sum += g[a++ - b[7]];
; }		; }
; return sum;		; return sum;
; }		; }

; CHECK-LABEL: @gather_reduce_8x16_i32		; PROFITABLE-LABEL: @gather_reduce_8x16_i32
;		;
; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>		; PROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
; CHECK: zext <8 x i16> [[L]] to <8 x i32>		; PROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>		; PROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]		; PROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
; CHECK: sext i32 [[X]] to i64		; PROFITABLE: sext i32 [[X]] to i64
;		;
define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {		define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
entry:		entry:
%cmp.99 = icmp sgt i32 %n, 0		%cmp.99 = icmp sgt i32 %n, 0
br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup		br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:		for.body.preheader:
br label %for.body		br label %for.body
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines	for.body:
%23 = load i16, i16* %arrayidx64, align 2		%23 = load i16, i16* %arrayidx64, align 2
%conv65 = zext i16 %23 to i32		%conv65 = zext i16 %23 to i32
%add66 = add nsw i32 %add57, %conv65		%add66 = add nsw i32 %add57, %conv65
%inc = add nuw nsw i32 %i.0103, 1		%inc = add nuw nsw i32 %i.0103, 1
%exitcond = icmp eq i32 %inc, %n		%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body		br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}		}

; CHECK-LABEL: @gather_reduce_8x16_i64		; UNPROFITABLE-LABEL: @gather_reduce_8x16_i64
;		;
; CHECK-NOT: load <8 x i16>		; UNPROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
		; UNPROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
		; UNPROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
		; UNPROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
		; UNPROFITABLE: sext i32 [[X]] to i64
;		;
; FIXME: We are currently unable to vectorize the case with i64 subtraction		; TODO: Although we can now vectorize this case while converting the i64
; because the zero extensions are too expensive. The solution here is to		; subtractions to i32, the cost model currently finds vectorization to be
; convert the i64 subtractions to i32 subtractions during vectorization.		; unprofitable. The cost model is penalizing the sign and zero
; This would then match the case above.		; extensions in the vectorized version, but they are actually free.
;		;
define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {		define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
entry:		entry:
%cmp.99 = icmp sgt i32 %n, 0		%cmp.99 = icmp sgt i32 %n, 0
br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup		br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:		for.body.preheader:
br label %for.body		br label %for.body
▲ Show 20 Lines • Show All 103 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[SLP] Truncate expressions to minimum required bit width
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 45538

llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SLP] Truncate expressions to minimum required bit width
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 45538

llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll

[SLP] Truncate expressions to minimum required bit width
ClosedPublic