Diff 44212

lib/Transforms/Vectorize/SLPVectorizer.cpp

Show All 9 Lines
// stores that can be put together into vector-stores. Next, it attempts to		// stores that can be put together into vector-stores. Next, it attempts to
// construct vectorizable tree using the use-def chains. If a profitable tree		// construct vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.		// was found, the SLP vectorizer performs vectorization on the tree.
//		//
// The pass is inspired by the work described in the paper:		// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.		// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/MapVector.h"		#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Optional.h"		#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"		#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"		#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"		#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"		#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"		#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"		#include "llvm/Analysis/CodeMetrics.h"
		#include "llvm/Analysis/DemandedBits.h"
		#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"		#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"		#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"		#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"		#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"		#include "llvm/Analysis/ValueTracking.h"
		#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"		#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"		#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"		#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"		#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"		#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"		#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"		#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Type.h"		#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"		#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"		#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"		#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"		#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"		#include "llvm/Support/raw_ostream.h"
#include "llvm/Analysis/VectorUtils.h"		#include "llvm/Transforms/Vectorize.h"
#include <algorithm>		#include <algorithm>
#include <map>		#include <map>
#include <memory>		#include <memory>

using namespace llvm;		using namespace llvm;

#define SV_NAME "slp-vectorizer"		#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"		#define DEBUG_TYPE "SLP"
▲ Show 20 Lines • Show All 302 Lines • ▼ Show 20 Lines
public:		public:
typedef SmallVector<Value *, 8> ValueList;		typedef SmallVector<Value *, 8> ValueList;
typedef SmallVector<Instruction *, 16> InstrList;		typedef SmallVector<Instruction *, 16> InstrList;
typedef SmallPtrSet<Value *, 16> ValueSet;		typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList;		typedef SmallVector<StoreInst *, 8> StoreList;

BoUpSLP(Function Func, ScalarEvolution Se, TargetTransformInfo *Tti,		BoUpSLP(Function Func, ScalarEvolution Se, TargetTransformInfo *Tti,
TargetLibraryInfo TLi, AliasAnalysis Aa, LoopInfo *Li,		TargetLibraryInfo TLi, AliasAnalysis Aa, LoopInfo *Li,
DominatorTree Dt, AssumptionCache AC)		DominatorTree Dt, AssumptionCache AC, DemandedBits *DB)
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),		: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),		SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
Builder(Se->getContext()) {		Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);		CodeMetrics::collectEphemeralValues(F, AC, EphValues);
		MaxRequiredIntegerTy = nullptr;
}		}

/// \brief Vectorize the tree that starts with the elements in \p VL.		/// \brief Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.		/// Returns the vectorized root.
Value *vectorizeTree();		Value *vectorizeTree();

/// \returns the cost incurred by unwanted spills and fills, caused by		/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.		/// holding live values over call sites.
Show All 15 Lines	void deleteTree() {
MustGather.clear();		MustGather.clear();
ExternalUses.clear();		ExternalUses.clear();
NumLoadsWantToKeepOrder = 0;		NumLoadsWantToKeepOrder = 0;
NumLoadsWantToChangeOrder = 0;		NumLoadsWantToChangeOrder = 0;
for (auto &Iter : BlocksSchedules) {		for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();		BlockScheduling *BS = Iter.second.get();
BS->clear();		BS->clear();
}		}
		MaxRequiredIntegerTy = nullptr;
}		}

/// \returns true if the memory operations A and B are consecutive.		/// \returns true if the memory operations A and B are consecutive.
bool isConsecutiveAccess(Value A, Value B, const DataLayout &DL);		bool isConsecutiveAccess(Value A, Value B, const DataLayout &DL);

/// \brief Perform LICM and CSE on the newly generated gather sequences.		/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();		void optimizeGatherSequence();

/// \returns true if it is beneficial to reverse the vector order.		/// \returns true if it is beneficial to reverse the vector order.
bool shouldReorder() const {		bool shouldReorder() const {
return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;		return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
}		}

/// \return The vector element size in bits to use when vectorizing the		/// \return The vector element size in bits to use when vectorizing the
/// expression tree ending at \p V. If V is a store, the size is the width of		/// expression tree ending at \p V. If V is a store, the size is the width of
/// the stored value. Otherwise, the size is the width of the largest loaded		/// the stored value. Otherwise, the size is the width of the largest loaded
/// value reaching V. This method is used by the vectorizer to calculate		/// value reaching V. This method is used by the vectorizer to calculate
/// vectorization factors.		/// vectorization factors.
unsigned getVectorElementSize(Value *V);		unsigned getVectorElementSize(Value *V);

		/// Compute the maximum width integer type required to represent the result
		/// of a scalar expression, if such a type exists.
		void computeMaxRequiredIntegerTy();

private:		private:
struct TreeEntry;		struct TreeEntry;

/// \returns the cost of the vectorizable entry.		/// \returns the cost of the vectorizable entry.
int getEntryCost(TreeEntry *E);		int getEntryCost(TreeEntry *E);

/// This is the recursive part of buildTree.		/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);		void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
▲ Show 20 Lines • Show All 489 Lines • ▼ Show 20 Lines	#endif
// Analysis and block reference.		// Analysis and block reference.
Function *F;		Function *F;
ScalarEvolution *SE;		ScalarEvolution *SE;
TargetTransformInfo *TTI;		TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;		TargetLibraryInfo *TLI;
AliasAnalysis *AA;		AliasAnalysis *AA;
LoopInfo *LI;		LoopInfo *LI;
DominatorTree *DT;		DominatorTree *DT;
		AssumptionCache *AC;
		DemandedBits *DB;
/// Instruction builder to construct the vectorized tree.		/// Instruction builder to construct the vectorized tree.
IRBuilder<> Builder;		IRBuilder<> Builder;

		// The maximum width integer type required to represent a scalar expression.
		IntegerType *MaxRequiredIntegerTy;
};		};

#ifndef NDEBUG		#ifndef NDEBUG
raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {		raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
SD.dump(os);		SD.dump(os);
return os;		return os;
}		}
#endif		#endif
▲ Show 20 Lines • Show All 539 Lines • ▼ Show 20 Lines
int BoUpSLP::getEntryCost(TreeEntry *E) {		int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;		ArrayRef<Value*> VL = E->Scalars;

Type *ScalarTy = VL[0]->getType();		Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))		if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();		ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());		VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

		// If we have computed a smaller type for the expression, update VecTy so
		// that the costs will be accurate.
		if (MaxRequiredIntegerTy) {
		auto *IT = dyn_cast<IntegerType>(ScalarTy);
		assert(IT && "Computed smaller type for non-integer value?");
		if (MaxRequiredIntegerTy->getBitWidth() < IT->getBitWidth())
		VecTy = VectorType::get(MaxRequiredIntegerTy, VL.size());
		}

if (E->NeedToGather) {		if (E->NeedToGather) {
if (allConstant(VL))		if (allConstant(VL))
return 0;		return 0;
if (isSplat(VL)) {		if (isSplat(VL)) {
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);		return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
}		}
return getGatherCost(E->Scalars);		return getGatherCost(E->Scalars);
}		}
▲ Show 20 Lines • Show All 313 Lines • ▼ Show 20 Lines	if (!ExtractCostCalculated.insert(I->Scalar).second)
continue;		continue;

// Uses by ephemeral values are free (because the ephemeral value will be		// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be		// removed prior to code generation, and so the extraction will be
// removed as well).		// removed as well).
if (EphValues.count(I->User))		if (EphValues.count(I->User))
continue;		continue;

VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);		// If we plan to rewrite the tree in a smaller type, we will need to sign
ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,		// extend the extracted value back to the original type. Here, we account
I->Lane);		// for the extract and the added cost of the sign extend if needed.
		auto *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
		if (MaxRequiredIntegerTy) {
		VecTy = VectorType::get(MaxRequiredIntegerTy, BundleWidth);
		ExtractCost += TTI->getCastInstrCost(
		Instruction::SExt, I->Scalar->getType(), MaxRequiredIntegerTy);
		}
		ExtractCost +=
		TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I->Lane);
}		}

Cost += getSpillCost();		Cost += getSpillCost();

DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");		DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
return Cost + ExtractCost;		return Cost + ExtractCost;
}		}

▲ Show 20 Lines • Show All 738 Lines • ▼ Show 20 Lines
Value *BoUpSLP::vectorizeTree() {		Value *BoUpSLP::vectorizeTree() {

// All blocks must be scheduled before any instructions are inserted.		// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {		for (auto &BSIter : BlocksSchedules) {
scheduleBlock(BSIter.second.get());		scheduleBlock(BSIter.second.get());
}		}

Builder.SetInsertPoint(&F->getEntryBlock().front());		Builder.SetInsertPoint(&F->getEntryBlock().front());
vectorizeTree(&VectorizableTree[0]);		auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);

		// If the vectorized tree can be rewritten in a smaller type, we truncate the
		// vectorized root. InstCombine will then rewrite the entire expression. We
		// sign extend the extracted values below.
		if (MaxRequiredIntegerTy) {
		BasicBlock::iterator I(cast<Instruction>(VectorRoot));
		Builder.SetInsertPoint(&*++I);
		auto BundleWidth = VectorizableTree[0].Scalars.size();
		auto *SmallerTy = VectorType::get(MaxRequiredIntegerTy, BundleWidth);
		auto *Trunc = Builder.CreateTrunc(VectorRoot, SmallerTy);
		VectorizableTree[0].VectorizedValue = Trunc;
		}

DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");		DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");

// Extract all of the elements with the external uses.		// Extract all of the elements with the external uses.
for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();		for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
it != e; ++it) {		it != e; ++it) {
Value *Scalar = it->Scalar;		Value *Scalar = it->Scalar;
llvm::User *User = it->User;		llvm::User *User = it->User;
Show All 16 Lines	for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
// Generate extracts for out-of-tree users.		// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.		// Find the insertion point for the extractelement lane.
if (isa<Instruction>(Vec)){		if (isa<Instruction>(Vec)){
if (PHINode *PH = dyn_cast<PHINode>(User)) {		if (PHINode *PH = dyn_cast<PHINode>(User)) {
for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {		for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
if (PH->getIncomingValue(i) == Scalar) {		if (PH->getIncomingValue(i) == Scalar) {
Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());		Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);		Value *Ex = Builder.CreateExtractElement(Vec, Lane);
		if (MaxRequiredIntegerTy)
		Ex = Builder.CreateSExt(Ex, Scalar->getType());
CSEBlocks.insert(PH->getIncomingBlock(i));		CSEBlocks.insert(PH->getIncomingBlock(i));
PH->setOperand(i, Ex);		PH->setOperand(i, Ex);
}		}
}		}
} else {		} else {
Builder.SetInsertPoint(cast<Instruction>(User));		Builder.SetInsertPoint(cast<Instruction>(User));
Value *Ex = Builder.CreateExtractElement(Vec, Lane);		Value *Ex = Builder.CreateExtractElement(Vec, Lane);
		if (MaxRequiredIntegerTy)
		Ex = Builder.CreateSExt(Ex, Scalar->getType());
CSEBlocks.insert(cast<Instruction>(User)->getParent());		CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);		User->replaceUsesOfWith(Scalar, Ex);
}		}
} else {		} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());		Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);		Value *Ex = Builder.CreateExtractElement(Vec, Lane);
		if (MaxRequiredIntegerTy)
		Ex = Builder.CreateSExt(Ex, Scalar->getType());
CSEBlocks.insert(&F->getEntryBlock());		CSEBlocks.insert(&F->getEntryBlock());
User->replaceUsesOfWith(Scalar, Ex);		User->replaceUsesOfWith(Scalar, Ex);
}		}

DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");		DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
}		}

// For each vectorized value:		// For each vectorized value:
▲ Show 20 Lines • Show All 552 Lines • ▼ Show 20 Lines	while (!Worklist.empty() && !FoundUnknownInst) {
// instruction has a vector type, give up.		// instruction has a vector type, give up.
auto *Ty = I->getType();		auto *Ty = I->getType();
if (isa<VectorType>(Ty))		if (isa<VectorType>(Ty))
FoundUnknownInst = true;		FoundUnknownInst = true;

// If the current instruction is a load, update MaxWidth to reflect the		// If the current instruction is a load, update MaxWidth to reflect the
// width of the loaded value.		// width of the loaded value.
else if (isa<LoadInst>(I))		else if (isa<LoadInst>(I))
MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty));		MaxWidth = std::max<unsigned>(MaxWidth, DL.getTypeSizeInBits(Ty));

// Otherwise, we need to visit the operands of the instruction. We only		// Otherwise, we need to visit the operands of the instruction. We only
// handle the interesting cases from buildTree here. If an operand is an		// handle the interesting cases from buildTree here. If an operand is an
// instruction we haven't yet visited, we add it to the worklist.		// instruction we haven't yet visited, we add it to the worklist.
else if (isa<PHINode>(I) \|\| isa<CastInst>(I) \|\| isa<GetElementPtrInst>(I) \|\|		else if (isa<PHINode>(I) \|\| isa<CastInst>(I) \|\| isa<GetElementPtrInst>(I) \|\|
isa<CmpInst>(I) \|\| isa<SelectInst>(I) \|\| isa<BinaryOperator>(I)) {		isa<CmpInst>(I) \|\| isa<SelectInst>(I) \|\| isa<BinaryOperator>(I)) {
for (Use &U : I->operands())		for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))		if (auto *J = dyn_cast<Instruction>(U.get()))
Show All 10 Lines	unsigned BoUpSLP::getVectorElementSize(Value *V) {
// gave up for some reason, just return the width of V.		// gave up for some reason, just return the width of V.
if (!MaxWidth \|\| FoundUnknownInst)		if (!MaxWidth \|\| FoundUnknownInst)
return DL.getTypeSizeInBits(V->getType());		return DL.getTypeSizeInBits(V->getType());

// Otherwise, return the maximum width we found.		// Otherwise, return the maximum width we found.
return MaxWidth;		return MaxWidth;
}		}

		void BoUpSLP::computeMaxRequiredIntegerTy() {

		// If there are no external uses, the expression tree must be rooted by a
		jmolloyUnsubmitted Not Done Reply Inline Actions I'm not quite sure I understand why this is the case. jmolloy: I'm not quite sure I understand why this is the case.
		mssimpsoAuthorUnsubmitted Not Done Reply Inline Actions This work is a little more limited than what you've done in the loop vectorizer. Because we're working with expression trees, I thought it would be simpler to only truncate the roots of single-use chains. If the tree has no external users, it should be rooted by stores, which obviously can't be truncated. So we give up in that case. I've tried to explain my reasoning a little better in the comments, but I haven't yet thought much about the more general case. In particular, the current patch doesn't do anything useful for the single-use chains ending in stores that have unnecessary extends and truncations in them that prevent us from getting past the cost model. Even though the extends and truncations in those chains may be removed by InstCombine after vectorization, they will currently still be counted by the model. I was thinking of adding this functionality as a follow-on, but I can add it to this patch if you like. What do you think? mssimpso: This work is a little more limited than what you've done in the loop vectorizer. Because we're…
		jmolloyUnsubmitted Not Done Reply Inline Actions "In particular, the current patch doesn't do anything useful for the single-use chains ending in stores that have unnecessary extends and truncations" OK, that's what I was missing. "I was thinking of adding this functionality as a follow-on, but I can add it to this patch if you like. What do you think?" I'm happy with it as a follow-on. I just didn't understand. Thanks! jmolloy: > In particular, the current patch doesn't do anything useful for the single-use chains ending…
		// store. We can't demote in-memory values, so there is nothing to do here.
		if (ExternalUses.empty())
		return;

		// If the expression is not rooted by a store, these roots should have
		// external uses. We will rely on InstCombine to rewrite the expression in
		// the narrower type. However, InstCombine only rewrites single-use values.
		// This means that if a tree entry other than a root is used externally, it
		// must have multiple uses and InstCombine will not rewrite it. The code
		// below ensures that only the roots are used externally.
		auto &TreeRoot = VectorizableTree[0].Scalars;
		SmallPtrSet<Value *, 16> ScalarRoots(TreeRoot.begin(), TreeRoot.end());
		for (auto &EU : ExternalUses)
		if (!ScalarRoots.erase(EU.Scalar))
		return;
		if (!ScalarRoots.empty())
		return;

		// The maximum bit width required to represent all the instructions in the
		// tree without loss of precision. It would be safe to truncate the
		// expression to this width.
		auto MaxBitWidth = 8u;

		// We first check if all the bits of the root are demanded. If they're not,
		// we can truncate the root to this narrower type.
		auto *Root = dyn_cast<Instruction>(TreeRoot[0]);
		if (!Root \|\| !isa<IntegerType>(Root->getType()) \|\| !Root->hasOneUse())
		return;
		auto Mask = DB->getDemandedBits(Root);
		if (Mask.countLeadingZeros() > 0)
		MaxBitWidth = Mask.getBitWidth() - Mask.countLeadingZeros();

		// If all the bits of the root are demanded, we can try a little harder to
		// compute a narrower type. This can happen, for example, if the roots are
		// getelementptr indices. InstCombine promotes these indices to the pointer
		// width. Thus, all their bits are technically demanded even though the
		// address computation might be vectorized in a smaller type. We start by
		// looking at each entry in the tree.
		else
		for (auto &Entry : VectorizableTree) {

		// Get a representative value for the vectorizable bundle. All values in
		// Entry.Scalars should be isomorphic.
		jmolloyUnsubmitted Done Reply Inline Actions It feels like you should be able to benefit from DemandedBits here. That's what we do in the LoopVectorizer and it can trigger in many more situations than just ValueTracking. jmolloy: It feels like you should be able to benefit from DemandedBits here. That's what we do in the…
		mssimpsoAuthorUnsubmitted Not Done Reply Inline Actions My first inclination was to try and reuse the DemandedBits analysis you added to VectorUtils. However, demanded bits alone didn't help the case I was most interested in: the index computation of GEPs, which is promoted to 64 bits, all of them being demanded. Since all the bits are demanded by the GEP, we have to look at the number of leading sign bits instead. But I've updated the patch to also consider the demanded bits of the roots since they are all we truncate. mssimpso: My first inclination was to try and reuse the DemandedBits analysis you added to VectorUtils.
		jmolloyUnsubmitted Not Done Reply Inline Actions Ah I see. Yes, I can see that both approaches together would probably get you the best result. jmolloy: Ah I see. Yes, I can see that both approaches together would probably get you the best result.
		auto *Scalar = Entry.Scalars[0];

		// If the scalar is used more than once, InstCombine will not rewrite it,
		// so we should give up.
		if (!Scalar->hasOneUse())
		return;

		// We only compute smaller integer types. If the scalar has a different
		// type, give up.
		auto *IT = dyn_cast<IntegerType>(Scalar->getType());
		if (!IT)
		return;

		// Compute the maximum bit width required to store the scalar. We use
		// ValueTracking to compute the number of high-order bits we can
		// truncate. We then round up to the next power-of-two.
		auto &DL = F->getParent()->getDataLayout();
		auto NumSignBits = ComputeNumSignBits(Scalar, DL, 0, AC, 0, DT);
		auto NumTypeBits = IT->getBitWidth();
		MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
		}

		// Round up to the next power-of-two.
		if (!isPowerOf2_64(MaxBitWidth))
		MaxBitWidth = NextPowerOf2(MaxBitWidth);

		// If the maximum bit width we compute is less than the width of the roots'
		// type, we can proceed with the narrowing. Otherwise, do nothing.
		auto *RootIT = cast<IntegerType>(TreeRoot[0]->getType());
		if (MaxBitWidth > 0 && MaxBitWidth < RootIT->getBitWidth())
		MaxRequiredIntegerTy = IntegerType::get(F->getContext(), MaxBitWidth);
		}

/// The SLPVectorizer Pass.		/// The SLPVectorizer Pass.
struct SLPVectorizer : public FunctionPass {		struct SLPVectorizer : public FunctionPass {
typedef SmallVector<StoreInst *, 8> StoreList;		typedef SmallVector<StoreInst *, 8> StoreList;
typedef MapVector<Value *, StoreList> StoreListMap;		typedef MapVector<Value *, StoreList> StoreListMap;
typedef SmallVector<WeakVH, 8> WeakVHList;		typedef SmallVector<WeakVH, 8> WeakVHList;
typedef MapVector<Value *, WeakVHList> WeakVHListMap;		typedef MapVector<Value *, WeakVHList> WeakVHListMap;

/// Pass identification, replacement for typeid		/// Pass identification, replacement for typeid
static char ID;		static char ID;

explicit SLPVectorizer() : FunctionPass(ID) {		explicit SLPVectorizer() : FunctionPass(ID) {
initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());		initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
}		}

ScalarEvolution *SE;		ScalarEvolution *SE;
TargetTransformInfo *TTI;		TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;		TargetLibraryInfo *TLI;
AliasAnalysis *AA;		AliasAnalysis *AA;
LoopInfo *LI;		LoopInfo *LI;
DominatorTree *DT;		DominatorTree *DT;
AssumptionCache *AC;		AssumptionCache *AC;
		DemandedBits *DB;

bool runOnFunction(Function &F) override {		bool runOnFunction(Function &F) override {
if (skipOptnoneFunction(F))		if (skipOptnoneFunction(F))
return false;		return false;

SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();		SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);		TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();		auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
TLI = TLIP ? &TLIP->getTLI() : nullptr;		TLI = TLIP ? &TLIP->getTLI() : nullptr;
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();		AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();		LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();		DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);		AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
		DB = &getAnalysis<DemandedBits>();

Stores.clear();		Stores.clear();
GEPs.clear();		GEPs.clear();
bool Changed = false;		bool Changed = false;

// If the target claims to have no vector registers don't attempt		// If the target claims to have no vector registers don't attempt
// vectorization.		// vectorization.
if (!TTI->getNumberOfRegisters(true))		if (!TTI->getNumberOfRegisters(true))
Show All 13 Lines	bool runOnFunction(Function &F) override {
// Don't vectorize when the attribute NoImplicitFloat is used.		// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))		if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;		return false;

DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");		DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

// Use the bottom up slp vectorizer to construct chains that start with		// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.		// store instructions.
BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);		BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB);

// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to		// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.		// delete instructions.

// Scan the blocks in the function in post order.		// Scan the blocks in the function in post order.
for (auto BB : post_order(&F.getEntryBlock())) {		for (auto BB : post_order(&F.getEntryBlock())) {
collectSeedInstructions(BB);		collectSeedInstructions(BB);

Show All 26 Lines	struct SLPVectorizer : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {		void getAnalysisUsage(AnalysisUsage &AU) const override {
FunctionPass::getAnalysisUsage(AU);		FunctionPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();		AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolutionWrapperPass>();		AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();		AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();		AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();		AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();		AU.addRequired<DominatorTreeWrapperPass>();
		AU.addRequired<DemandedBits>();
AU.addPreserved<LoopInfoWrapperPass>();		AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();		AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();		AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();		AU.addPreserved<GlobalsAAWrapperPass>();
AU.setPreservesCFG();		AU.setPreservesCFG();
}		}

private:		private:
▲ Show 20 Lines • Show All 88 Lines • ▼ Show 20 Lines	for (unsigned i = 0, e = ChainLen; i < e; ++i) {
if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))		if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
continue;		continue;

DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i		DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
<< "\n");		<< "\n");
ArrayRef<Value *> Operands = Chain.slice(i, VF);		ArrayRef<Value *> Operands = Chain.slice(i, VF);

R.buildTree(Operands);		R.buildTree(Operands);
		R.computeMaxRequiredIntegerTy();

int Cost = R.getTreeCost();		int Cost = R.getTreeCost();

DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");		DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
if (Cost < CostThreshold) {		if (Cost < CostThreshold) {
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");		DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
R.vectorizeTree();		R.vectorizeTree();

▲ Show 20 Lines • Show All 183 Lines • ▼ Show 20 Lines	for (unsigned i = 0, e = VL.size(); i < e; ++i) {
// TODO: check if we can allow reordering also for other cases than		// TODO: check if we can allow reordering also for other cases than
// tryToVectorizePair()		// tryToVectorizePair()
if (allowReorder && R.shouldReorder()) {		if (allowReorder && R.shouldReorder()) {
assert(Ops.size() == 2);		assert(Ops.size() == 2);
assert(BuildVectorSlice.empty());		assert(BuildVectorSlice.empty());
Value *ReorderedOps[] = { Ops[1], Ops[0] };		Value *ReorderedOps[] = { Ops[1], Ops[0] };
R.buildTree(ReorderedOps, None);		R.buildTree(ReorderedOps, None);
}		}
		R.computeMaxRequiredIntegerTy();
int Cost = R.getTreeCost();		int Cost = R.getTreeCost();

if (Cost < -SLPCostThreshold) {		if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");		DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
Value *VectorizedRoot = R.vectorizeTree();		Value *VectorizedRoot = R.vectorizeTree();

// Reconstruct the build vector by extracting the vectorized root. This		// Reconstruct the build vector by extracting the vectorized root. This
// way we handle the case where some elements of the vector are undefined.		// way we handle the case where some elements of the vector are undefined.
▲ Show 20 Lines • Show All 250 Lines • ▼ Show 20 Lines	bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
IRBuilder<> Builder(ReductionRoot);		IRBuilder<> Builder(ReductionRoot);
FastMathFlags Unsafe;		FastMathFlags Unsafe;
Unsafe.setUnsafeAlgebra();		Unsafe.setUnsafeAlgebra();
Builder.SetFastMathFlags(Unsafe);		Builder.SetFastMathFlags(Unsafe);
unsigned i = 0;		unsigned i = 0;

for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {		for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);		V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
		V.computeMaxRequiredIntegerTy();

// Estimate cost.		// Estimate cost.
int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);		int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
if (Cost >= -SLPCostThreshold)		if (Cost >= -SLPCostThreshold)
break;		break;

DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost		DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
<< ". (HorRdx)\n");		<< ". (HorRdx)\n");
▲ Show 20 Lines • Show All 525 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll

	; RUN: opt -S -slp-vectorizer -dce -instcombine < %s \| FileCheck %s			; RUN: opt -S -slp-vectorizer -dce -instcombine < %s \| FileCheck %s --check-prefix=PROFITABLE
	jmolloy (unsubmitted inline comment, marked done): I'm not sure about this: why are tests being removed? Unless we're fundamentally regressing performance, I think new tests should be added instead of modifying existing tests to check the new behaviour.
	mssimpso (author, unsubmitted inline reply, marked not done): Sure, I'm happy to add the test back.
				; RUN: opt -S -slp-vectorizer -slp-threshold=-12 -dce -instcombine < %s \| FileCheck %s --check-prefix=UNPROFITABLE

	target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"			target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
	target triple = "aarch64--linux-gnu"			target triple = "aarch64--linux-gnu"

	; These tests check that we vectorize the index calculations in the			; These tests check that we vectorize the index calculations in the
	; gather-reduce pattern shown below. We check cases having i32 and i64			; gather-reduce pattern shown below. We check cases having i32 and i64
	; subtraction.			; subtraction.
	;			;
	; int gather_reduce_8x16(short a, short b, short *g, int n) {			; int gather_reduce_8x16(short a, short b, short *g, int n) {
	; int sum = 0;			; int sum = 0;
	; for (int i = 0; i < n ; ++i) {			; for (int i = 0; i < n ; ++i) {
	; sum += g[a++ - b++]; sum += g[a++ - b++];			; sum += g[a++ - b[0]]; sum += g[a++ - b[4]];
	; sum += g[a++ - b++]; sum += g[a++ - b++];			; sum += g[a++ - b[1]]; sum += g[a++ - b[5]];
	; sum += g[a++ - b++]; sum += g[a++ - b++];			; sum += g[a++ - b[2]]; sum += g[a++ - b[6]];
	; sum += g[a++ - b++]; sum += g[a++ - b++];			; sum += g[a++ - b[3]]; sum += g[a++ - b[7]];
	; }			; }
	; return sum;			; return sum;
	; }			; }

	; CHECK-LABEL: @gather_reduce_8x16_i32			; PROFITABLE-LABEL: @gather_reduce_8x16_i32
	;			;
	; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>			; PROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
	; CHECK: zext <8 x i16> [[L]] to <8 x i32>			; PROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
	; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>			; PROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
	; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]			; PROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
	; CHECK: sext i32 [[X]] to i64			; PROFITABLE: sext i32 [[X]] to i64
	;			;
	define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {			define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
	entry:			entry:
	%cmp.99 = icmp sgt i32 %n, 0			%cmp.99 = icmp sgt i32 %n, 0
	br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup			br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup

	for.body.preheader:			for.body.preheader:
	br label %for.body			br label %for.body

	for.cond.cleanup.loopexit:			for.cond.cleanup.loopexit:
	br label %for.cond.cleanup			br label %for.cond.cleanup

	for.cond.cleanup:			for.cond.cleanup:
	%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]			%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
	ret i32 %sum.0.lcssa			ret i32 %sum.0.lcssa

	for.body:			for.body:
	%i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]			%i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
	%sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]			%sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
	%a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]			%a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
	%b.addr.0100 = phi i16* [ %incdec.ptr60, %for.body ], [ %b, %for.body.preheader ]
	%incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1			%incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
	%0 = load i16, i16* %a.addr.0101, align 2			%0 = load i16, i16* %a.addr.0101, align 2
	%conv = zext i16 %0 to i32			%conv = zext i16 %0 to i32
	%incdec.ptr1 = getelementptr inbounds i16, i16* %b.addr.0100, i64 1			%incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
	%1 = load i16, i16* %b.addr.0100, align 2			%1 = load i16, i16* %b, align 2
	%conv2 = zext i16 %1 to i32			%conv2 = zext i16 %1 to i32
	%sub = sub nsw i32 %conv, %conv2			%sub = sub nsw i32 %conv, %conv2
	%arrayidx = getelementptr inbounds i16, i16* %g, i32 %sub			%arrayidx = getelementptr inbounds i16, i16* %g, i32 %sub
	%2 = load i16, i16* %arrayidx, align 2			%2 = load i16, i16* %arrayidx, align 2
	%conv3 = zext i16 %2 to i32			%conv3 = zext i16 %2 to i32
	%add = add nsw i32 %conv3, %sum.0102			%add = add nsw i32 %conv3, %sum.0102
	%incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2			%incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
	%3 = load i16, i16* %incdec.ptr, align 2			%3 = load i16, i16* %incdec.ptr, align 2
	%conv5 = zext i16 %3 to i32			%conv5 = zext i16 %3 to i32
	%incdec.ptr6 = getelementptr inbounds i16, i16* %b.addr.0100, i64 2			%incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
	%4 = load i16, i16* %incdec.ptr1, align 2			%4 = load i16, i16* %incdec.ptr1, align 2
	%conv7 = zext i16 %4 to i32			%conv7 = zext i16 %4 to i32
	%sub8 = sub nsw i32 %conv5, %conv7			%sub8 = sub nsw i32 %conv5, %conv7
	%arrayidx10 = getelementptr inbounds i16, i16* %g, i32 %sub8			%arrayidx10 = getelementptr inbounds i16, i16* %g, i32 %sub8
	%5 = load i16, i16* %arrayidx10, align 2			%5 = load i16, i16* %arrayidx10, align 2
	%conv11 = zext i16 %5 to i32			%conv11 = zext i16 %5 to i32
	%add12 = add nsw i32 %add, %conv11			%add12 = add nsw i32 %add, %conv11
	%incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3			%incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
	%6 = load i16, i16* %incdec.ptr4, align 2			%6 = load i16, i16* %incdec.ptr4, align 2
	%conv14 = zext i16 %6 to i32			%conv14 = zext i16 %6 to i32
	%incdec.ptr15 = getelementptr inbounds i16, i16* %b.addr.0100, i64 3			%incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
	%7 = load i16, i16* %incdec.ptr6, align 2			%7 = load i16, i16* %incdec.ptr6, align 2
	%conv16 = zext i16 %7 to i32			%conv16 = zext i16 %7 to i32
	%sub17 = sub nsw i32 %conv14, %conv16			%sub17 = sub nsw i32 %conv14, %conv16
	%arrayidx19 = getelementptr inbounds i16, i16* %g, i32 %sub17			%arrayidx19 = getelementptr inbounds i16, i16* %g, i32 %sub17
	%8 = load i16, i16* %arrayidx19, align 2			%8 = load i16, i16* %arrayidx19, align 2
	%conv20 = zext i16 %8 to i32			%conv20 = zext i16 %8 to i32
	%add21 = add nsw i32 %add12, %conv20			%add21 = add nsw i32 %add12, %conv20
	%incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4			%incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
	%9 = load i16, i16* %incdec.ptr13, align 2			%9 = load i16, i16* %incdec.ptr13, align 2
	%conv23 = zext i16 %9 to i32			%conv23 = zext i16 %9 to i32
	%incdec.ptr24 = getelementptr inbounds i16, i16* %b.addr.0100, i64 4			%incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
	%10 = load i16, i16* %incdec.ptr15, align 2			%10 = load i16, i16* %incdec.ptr15, align 2
	%conv25 = zext i16 %10 to i32			%conv25 = zext i16 %10 to i32
	%sub26 = sub nsw i32 %conv23, %conv25			%sub26 = sub nsw i32 %conv23, %conv25
	%arrayidx28 = getelementptr inbounds i16, i16* %g, i32 %sub26			%arrayidx28 = getelementptr inbounds i16, i16* %g, i32 %sub26
	%11 = load i16, i16* %arrayidx28, align 2			%11 = load i16, i16* %arrayidx28, align 2
	%conv29 = zext i16 %11 to i32			%conv29 = zext i16 %11 to i32
	%add30 = add nsw i32 %add21, %conv29			%add30 = add nsw i32 %add21, %conv29
	%incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5			%incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
	%12 = load i16, i16* %incdec.ptr22, align 2			%12 = load i16, i16* %incdec.ptr22, align 2
	%conv32 = zext i16 %12 to i32			%conv32 = zext i16 %12 to i32
	%incdec.ptr33 = getelementptr inbounds i16, i16* %b.addr.0100, i64 5			%incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
	%13 = load i16, i16* %incdec.ptr24, align 2			%13 = load i16, i16* %incdec.ptr24, align 2
	%conv34 = zext i16 %13 to i32			%conv34 = zext i16 %13 to i32
	%sub35 = sub nsw i32 %conv32, %conv34			%sub35 = sub nsw i32 %conv32, %conv34
	%arrayidx37 = getelementptr inbounds i16, i16* %g, i32 %sub35			%arrayidx37 = getelementptr inbounds i16, i16* %g, i32 %sub35
	%14 = load i16, i16* %arrayidx37, align 2			%14 = load i16, i16* %arrayidx37, align 2
	%conv38 = zext i16 %14 to i32			%conv38 = zext i16 %14 to i32
	%add39 = add nsw i32 %add30, %conv38			%add39 = add nsw i32 %add30, %conv38
	%incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6			%incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
	%15 = load i16, i16* %incdec.ptr31, align 2			%15 = load i16, i16* %incdec.ptr31, align 2
	%conv41 = zext i16 %15 to i32			%conv41 = zext i16 %15 to i32
	%incdec.ptr42 = getelementptr inbounds i16, i16* %b.addr.0100, i64 6			%incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
	%16 = load i16, i16* %incdec.ptr33, align 2			%16 = load i16, i16* %incdec.ptr33, align 2
	%conv43 = zext i16 %16 to i32			%conv43 = zext i16 %16 to i32
	%sub44 = sub nsw i32 %conv41, %conv43			%sub44 = sub nsw i32 %conv41, %conv43
	%arrayidx46 = getelementptr inbounds i16, i16* %g, i32 %sub44			%arrayidx46 = getelementptr inbounds i16, i16* %g, i32 %sub44
	%17 = load i16, i16* %arrayidx46, align 2			%17 = load i16, i16* %arrayidx46, align 2
	%conv47 = zext i16 %17 to i32			%conv47 = zext i16 %17 to i32
	%add48 = add nsw i32 %add39, %conv47			%add48 = add nsw i32 %add39, %conv47
	%incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7			%incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
	%18 = load i16, i16* %incdec.ptr40, align 2			%18 = load i16, i16* %incdec.ptr40, align 2
	%conv50 = zext i16 %18 to i32			%conv50 = zext i16 %18 to i32
	%incdec.ptr51 = getelementptr inbounds i16, i16* %b.addr.0100, i64 7			%incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
	%19 = load i16, i16* %incdec.ptr42, align 2			%19 = load i16, i16* %incdec.ptr42, align 2
	%conv52 = zext i16 %19 to i32			%conv52 = zext i16 %19 to i32
	%sub53 = sub nsw i32 %conv50, %conv52			%sub53 = sub nsw i32 %conv50, %conv52
	%arrayidx55 = getelementptr inbounds i16, i16* %g, i32 %sub53			%arrayidx55 = getelementptr inbounds i16, i16* %g, i32 %sub53
	%20 = load i16, i16* %arrayidx55, align 2			%20 = load i16, i16* %arrayidx55, align 2
	%conv56 = zext i16 %20 to i32			%conv56 = zext i16 %20 to i32
	%add57 = add nsw i32 %add48, %conv56			%add57 = add nsw i32 %add48, %conv56
	%incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8			%incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
	%21 = load i16, i16* %incdec.ptr49, align 2			%21 = load i16, i16* %incdec.ptr49, align 2
	%conv59 = zext i16 %21 to i32			%conv59 = zext i16 %21 to i32
	%incdec.ptr60 = getelementptr inbounds i16, i16* %b.addr.0100, i64 8
	%22 = load i16, i16* %incdec.ptr51, align 2			%22 = load i16, i16* %incdec.ptr51, align 2
	%conv61 = zext i16 %22 to i32			%conv61 = zext i16 %22 to i32
	%sub62 = sub nsw i32 %conv59, %conv61			%sub62 = sub nsw i32 %conv59, %conv61
	%arrayidx64 = getelementptr inbounds i16, i16* %g, i32 %sub62			%arrayidx64 = getelementptr inbounds i16, i16* %g, i32 %sub62
	%23 = load i16, i16* %arrayidx64, align 2			%23 = load i16, i16* %arrayidx64, align 2
	%conv65 = zext i16 %23 to i32			%conv65 = zext i16 %23 to i32
	%add66 = add nsw i32 %add57, %conv65			%add66 = add nsw i32 %add57, %conv65
	%inc = add nuw nsw i32 %i.0103, 1			%inc = add nuw nsw i32 %i.0103, 1
	%exitcond = icmp eq i32 %inc, %n			%exitcond = icmp eq i32 %inc, %n
	br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body			br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
	}			}

	; CHECK-LABEL: @gather_reduce_8x16_i64			; UNPROFITABLE-LABEL: @gather_reduce_8x16_i64
	;			;
	; CHECK-NOT: load <8 x i16>			; UNPROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
				; UNPROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
				; UNPROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
				; UNPROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
				; UNPROFITABLE: sext i32 [[X]] to i64
	;			;
	; FIXME: We are currently unable to vectorize the case with i64 subtraction			; TODO: Although we can now vectorize this case while converting the i64
	; because the zero extensions are too expensive. The solution here is to			; subtractions to i32, the cost model currently finds vectorization to be
	; convert the i64 subtractions to i32 subtractions during vectorization.			; unprofitable. The cost model is penalizing the sign and zero
	; This would then match the case above.			; extensions in the vectorized version, but they are actually free.
	;			;
	define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {			define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
	entry:			entry:
	%cmp.99 = icmp sgt i32 %n, 0			%cmp.99 = icmp sgt i32 %n, 0
	br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup			br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup

	for.body.preheader:			for.body.preheader:
	br label %for.body			br label %for.body

	for.cond.cleanup.loopexit:			for.cond.cleanup.loopexit:
	br label %for.cond.cleanup			br label %for.cond.cleanup

	for.cond.cleanup:			for.cond.cleanup:
	%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]			%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
	ret i32 %sum.0.lcssa			ret i32 %sum.0.lcssa

	for.body:			for.body:
	%i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]			%i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
	%sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]			%sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
	%a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]			%a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
	%b.addr.0100 = phi i16* [ %incdec.ptr60, %for.body ], [ %b, %for.body.preheader ]
	%incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1			%incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
	%0 = load i16, i16* %a.addr.0101, align 2			%0 = load i16, i16* %a.addr.0101, align 2
	%conv = zext i16 %0 to i64			%conv = zext i16 %0 to i64
	%incdec.ptr1 = getelementptr inbounds i16, i16* %b.addr.0100, i64 1			%incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
	%1 = load i16, i16* %b.addr.0100, align 2			%1 = load i16, i16* %b, align 2
	%conv2 = zext i16 %1 to i64			%conv2 = zext i16 %1 to i64
	%sub = sub nsw i64 %conv, %conv2			%sub = sub nsw i64 %conv, %conv2
	%arrayidx = getelementptr inbounds i16, i16* %g, i64 %sub			%arrayidx = getelementptr inbounds i16, i16* %g, i64 %sub
	%2 = load i16, i16* %arrayidx, align 2			%2 = load i16, i16* %arrayidx, align 2
	%conv3 = zext i16 %2 to i32			%conv3 = zext i16 %2 to i32
	%add = add nsw i32 %conv3, %sum.0102			%add = add nsw i32 %conv3, %sum.0102
	%incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2			%incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
	%3 = load i16, i16* %incdec.ptr, align 2			%3 = load i16, i16* %incdec.ptr, align 2
	%conv5 = zext i16 %3 to i64			%conv5 = zext i16 %3 to i64
	%incdec.ptr6 = getelementptr inbounds i16, i16* %b.addr.0100, i64 2			%incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
	%4 = load i16, i16* %incdec.ptr1, align 2			%4 = load i16, i16* %incdec.ptr1, align 2
	%conv7 = zext i16 %4 to i64			%conv7 = zext i16 %4 to i64
	%sub8 = sub nsw i64 %conv5, %conv7			%sub8 = sub nsw i64 %conv5, %conv7
	%arrayidx10 = getelementptr inbounds i16, i16* %g, i64 %sub8			%arrayidx10 = getelementptr inbounds i16, i16* %g, i64 %sub8
	%5 = load i16, i16* %arrayidx10, align 2			%5 = load i16, i16* %arrayidx10, align 2
	%conv11 = zext i16 %5 to i32			%conv11 = zext i16 %5 to i32
	%add12 = add nsw i32 %add, %conv11			%add12 = add nsw i32 %add, %conv11
	%incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3			%incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
	%6 = load i16, i16* %incdec.ptr4, align 2			%6 = load i16, i16* %incdec.ptr4, align 2
	%conv14 = zext i16 %6 to i64			%conv14 = zext i16 %6 to i64
	%incdec.ptr15 = getelementptr inbounds i16, i16* %b.addr.0100, i64 3			%incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
	%7 = load i16, i16* %incdec.ptr6, align 2			%7 = load i16, i16* %incdec.ptr6, align 2
	%conv16 = zext i16 %7 to i64			%conv16 = zext i16 %7 to i64
	%sub17 = sub nsw i64 %conv14, %conv16			%sub17 = sub nsw i64 %conv14, %conv16
	%arrayidx19 = getelementptr inbounds i16, i16* %g, i64 %sub17			%arrayidx19 = getelementptr inbounds i16, i16* %g, i64 %sub17
	%8 = load i16, i16* %arrayidx19, align 2			%8 = load i16, i16* %arrayidx19, align 2
	%conv20 = zext i16 %8 to i32			%conv20 = zext i16 %8 to i32
	%add21 = add nsw i32 %add12, %conv20			%add21 = add nsw i32 %add12, %conv20
	%incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4			%incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
	%9 = load i16, i16* %incdec.ptr13, align 2			%9 = load i16, i16* %incdec.ptr13, align 2
	%conv23 = zext i16 %9 to i64			%conv23 = zext i16 %9 to i64
	%incdec.ptr24 = getelementptr inbounds i16, i16* %b.addr.0100, i64 4			%incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
	%10 = load i16, i16* %incdec.ptr15, align 2			%10 = load i16, i16* %incdec.ptr15, align 2
	%conv25 = zext i16 %10 to i64			%conv25 = zext i16 %10 to i64
	%sub26 = sub nsw i64 %conv23, %conv25			%sub26 = sub nsw i64 %conv23, %conv25
	%arrayidx28 = getelementptr inbounds i16, i16* %g, i64 %sub26			%arrayidx28 = getelementptr inbounds i16, i16* %g, i64 %sub26
	%11 = load i16, i16* %arrayidx28, align 2			%11 = load i16, i16* %arrayidx28, align 2
	%conv29 = zext i16 %11 to i32			%conv29 = zext i16 %11 to i32
	%add30 = add nsw i32 %add21, %conv29			%add30 = add nsw i32 %add21, %conv29
	%incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5			%incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
	%12 = load i16, i16* %incdec.ptr22, align 2			%12 = load i16, i16* %incdec.ptr22, align 2
	%conv32 = zext i16 %12 to i64			%conv32 = zext i16 %12 to i64
	%incdec.ptr33 = getelementptr inbounds i16, i16* %b.addr.0100, i64 5			%incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
	%13 = load i16, i16* %incdec.ptr24, align 2			%13 = load i16, i16* %incdec.ptr24, align 2
	%conv34 = zext i16 %13 to i64			%conv34 = zext i16 %13 to i64
	%sub35 = sub nsw i64 %conv32, %conv34			%sub35 = sub nsw i64 %conv32, %conv34
	%arrayidx37 = getelementptr inbounds i16, i16* %g, i64 %sub35			%arrayidx37 = getelementptr inbounds i16, i16* %g, i64 %sub35
	%14 = load i16, i16* %arrayidx37, align 2			%14 = load i16, i16* %arrayidx37, align 2
	%conv38 = zext i16 %14 to i32			%conv38 = zext i16 %14 to i32
	%add39 = add nsw i32 %add30, %conv38			%add39 = add nsw i32 %add30, %conv38
	%incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6			%incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
	%15 = load i16, i16* %incdec.ptr31, align 2			%15 = load i16, i16* %incdec.ptr31, align 2
	%conv41 = zext i16 %15 to i64			%conv41 = zext i16 %15 to i64
	%incdec.ptr42 = getelementptr inbounds i16, i16* %b.addr.0100, i64 6			%incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
	%16 = load i16, i16* %incdec.ptr33, align 2			%16 = load i16, i16* %incdec.ptr33, align 2
	%conv43 = zext i16 %16 to i64			%conv43 = zext i16 %16 to i64
	%sub44 = sub nsw i64 %conv41, %conv43			%sub44 = sub nsw i64 %conv41, %conv43
	%arrayidx46 = getelementptr inbounds i16, i16* %g, i64 %sub44			%arrayidx46 = getelementptr inbounds i16, i16* %g, i64 %sub44
	%17 = load i16, i16* %arrayidx46, align 2			%17 = load i16, i16* %arrayidx46, align 2
	%conv47 = zext i16 %17 to i32			%conv47 = zext i16 %17 to i32
	%add48 = add nsw i32 %add39, %conv47			%add48 = add nsw i32 %add39, %conv47
	%incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7			%incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
	%18 = load i16, i16* %incdec.ptr40, align 2			%18 = load i16, i16* %incdec.ptr40, align 2
	%conv50 = zext i16 %18 to i64			%conv50 = zext i16 %18 to i64
	%incdec.ptr51 = getelementptr inbounds i16, i16* %b.addr.0100, i64 7			%incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
	%19 = load i16, i16* %incdec.ptr42, align 2			%19 = load i16, i16* %incdec.ptr42, align 2
	%conv52 = zext i16 %19 to i64			%conv52 = zext i16 %19 to i64
	%sub53 = sub nsw i64 %conv50, %conv52			%sub53 = sub nsw i64 %conv50, %conv52
	%arrayidx55 = getelementptr inbounds i16, i16* %g, i64 %sub53			%arrayidx55 = getelementptr inbounds i16, i16* %g, i64 %sub53
	%20 = load i16, i16* %arrayidx55, align 2			%20 = load i16, i16* %arrayidx55, align 2
	%conv56 = zext i16 %20 to i32			%conv56 = zext i16 %20 to i32
	%add57 = add nsw i32 %add48, %conv56			%add57 = add nsw i32 %add48, %conv56
	%incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8			%incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
	%21 = load i16, i16* %incdec.ptr49, align 2			%21 = load i16, i16* %incdec.ptr49, align 2
	%conv59 = zext i16 %21 to i64			%conv59 = zext i16 %21 to i64
	%incdec.ptr60 = getelementptr inbounds i16, i16* %b.addr.0100, i64 8
	%22 = load i16, i16* %incdec.ptr51, align 2			%22 = load i16, i16* %incdec.ptr51, align 2
	%conv61 = zext i16 %22 to i64			%conv61 = zext i16 %22 to i64
	%sub62 = sub nsw i64 %conv59, %conv61			%sub62 = sub nsw i64 %conv59, %conv61
	%arrayidx64 = getelementptr inbounds i16, i16* %g, i64 %sub62			%arrayidx64 = getelementptr inbounds i16, i16* %g, i64 %sub62
	%23 = load i16, i16* %arrayidx64, align 2			%23 = load i16, i16* %arrayidx64, align 2
	%conv65 = zext i16 %23 to i32			%conv65 = zext i16 %23 to i32
	%add66 = add nsw i32 %add57, %conv65			%add66 = add nsw i32 %add57, %conv65
	%inc = add nuw nsw i32 %i.0103, 1			%inc = add nuw nsw i32 %i.0103, 1
	%exitcond = icmp eq i32 %inc, %n			%exitcond = icmp eq i32 %inc, %n
	br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body			br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[SLP] Truncate expressions to minimum required bit width
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 44212

lib/Transforms/Vectorize/SLPVectorizer.cpp

test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SLP] Truncate expressions to minimum required bit width
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 44212

lib/Transforms/Vectorize/SLPVectorizer.cpp

test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll

[SLP] Truncate expressions to minimum required bit width
ClosedPublic