Diff 311613

llvm/include/llvm/Analysis/TargetTransformInfo.h

Show First 20 Lines • Show All 935 Lines • ▼ Show 20 Lines	public:
/// size of the widest element type.		/// size of the widest element type.
bool shouldMaximizeVectorBandwidth(bool OptSize) const;		bool shouldMaximizeVectorBandwidth(bool OptSize) const;

/// \return The minimum vectorization factor for types of given element		/// \return The minimum vectorization factor for types of given element
/// bit width, or 0 if there is no minimum VF. The returned value only		/// bit width, or 0 if there is no minimum VF. The returned value only
/// applies when shouldMaximizeVectorBandwidth returns true.		/// applies when shouldMaximizeVectorBandwidth returns true.
unsigned getMinimumVF(unsigned ElemWidth) const;		unsigned getMinimumVF(unsigned ElemWidth) const;

		/// \return The maximum vectorization factor for types of given element
		/// bit width and opcode, or 0 if there is no maximum VF.
		/// Currently only used by the SLP vectorizer.
		unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;

/// \return True if it should be considered for address type promotion.		/// \return True if it should be considered for address type promotion.
/// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is		/// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
/// profitable without finding other extensions fed by the same input.		/// profitable without finding other extensions fed by the same input.
bool shouldConsiderAddressTypePromotion(		bool shouldConsiderAddressTypePromotion(
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const;		const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const;

/// \return The size of a cache line in bytes.		/// \return The size of a cache line in bytes.
unsigned getCacheLineSize() const;		unsigned getCacheLineSize() const;
▲ Show 20 Lines • Show All 541 Lines • ▼ Show 20 Lines	public:
virtual unsigned getNumberOfRegisters(unsigned ClassID) const = 0;		virtual unsigned getNumberOfRegisters(unsigned ClassID) const = 0;
virtual unsigned getRegisterClassForType(bool Vector,		virtual unsigned getRegisterClassForType(bool Vector,
Type *Ty = nullptr) const = 0;		Type *Ty = nullptr) const = 0;
virtual const char *getRegisterClassName(unsigned ClassID) const = 0;		virtual const char *getRegisterClassName(unsigned ClassID) const = 0;
virtual unsigned getRegisterBitWidth(bool Vector) const = 0;		virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
virtual unsigned getMinVectorRegisterBitWidth() = 0;		virtual unsigned getMinVectorRegisterBitWidth() = 0;
virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;		virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;		virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
		virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0;
virtual bool shouldConsiderAddressTypePromotion(		virtual bool shouldConsiderAddressTypePromotion(
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;		const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
virtual unsigned getCacheLineSize() const = 0;		virtual unsigned getCacheLineSize() const = 0;
virtual Optional<unsigned> getCacheSize(CacheLevel Level) const = 0;		virtual Optional<unsigned> getCacheSize(CacheLevel Level) const = 0;
virtual Optional<unsigned> getCacheAssociativity(CacheLevel Level) const = 0;		virtual Optional<unsigned> getCacheAssociativity(CacheLevel Level) const = 0;

/// \return How much before a load we should place the prefetch		/// \return How much before a load we should place the prefetch
/// instruction. This is currently measured in number of		/// instruction. This is currently measured in number of
▲ Show 20 Lines • Show All 403 Lines • ▼ Show 20 Lines	unsigned getMinVectorRegisterBitWidth() override {
return Impl.getMinVectorRegisterBitWidth();		return Impl.getMinVectorRegisterBitWidth();
}		}
bool shouldMaximizeVectorBandwidth(bool OptSize) const override {		bool shouldMaximizeVectorBandwidth(bool OptSize) const override {
return Impl.shouldMaximizeVectorBandwidth(OptSize);		return Impl.shouldMaximizeVectorBandwidth(OptSize);
}		}
unsigned getMinimumVF(unsigned ElemWidth) const override {		unsigned getMinimumVF(unsigned ElemWidth) const override {
return Impl.getMinimumVF(ElemWidth);		return Impl.getMinimumVF(ElemWidth);
}		}
		unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override {
		return Impl.getMaximumVF(ElemWidth, Opcode);
		}
bool shouldConsiderAddressTypePromotion(		bool shouldConsiderAddressTypePromotion(
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {		const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
return Impl.shouldConsiderAddressTypePromotion(		return Impl.shouldConsiderAddressTypePromotion(
I, AllowPromotionWithoutCommonHeader);		I, AllowPromotionWithoutCommonHeader);
}		}
unsigned getCacheLineSize() const override { return Impl.getCacheLineSize(); }		unsigned getCacheLineSize() const override { return Impl.getCacheLineSize(); }
Optional<unsigned> getCacheSize(CacheLevel Level) const override {		Optional<unsigned> getCacheSize(CacheLevel Level) const override {
return Impl.getCacheSize(Level);		return Impl.getCacheSize(Level);
▲ Show 20 Lines • Show All 338 Lines • Show Last 20 Lines

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Show First 20 Lines • Show All 350 Lines • ▼ Show 20 Lines	public:
unsigned getRegisterBitWidth(bool Vector) const { return 32; }		unsigned getRegisterBitWidth(bool Vector) const { return 32; }

unsigned getMinVectorRegisterBitWidth() { return 128; }		unsigned getMinVectorRegisterBitWidth() { return 128; }

bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; }		bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; }

unsigned getMinimumVF(unsigned ElemWidth) const { return 0; }		unsigned getMinimumVF(unsigned ElemWidth) const { return 0; }

		unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { return 0; }

bool		bool
shouldConsiderAddressTypePromotion(const Instruction &I,		shouldConsiderAddressTypePromotion(const Instruction &I,
bool &AllowPromotionWithoutCommonHeader) {		bool &AllowPromotionWithoutCommonHeader) {
AllowPromotionWithoutCommonHeader = false;		AllowPromotionWithoutCommonHeader = false;
return false;		return false;
}		}

unsigned getCacheLineSize() const { return 0; }		unsigned getCacheLineSize() const { return 0; }
▲ Show 20 Lines • Show All 713 Lines • Show Last 20 Lines

llvm/lib/Analysis/TargetTransformInfo.cpp

	Show First 20 Lines • Show All 629 Lines • ▼ Show 20 Lines
	bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const {			bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const {
	return TTIImpl->shouldMaximizeVectorBandwidth(OptSize);			return TTIImpl->shouldMaximizeVectorBandwidth(OptSize);
	}			}

	unsigned TargetTransformInfo::getMinimumVF(unsigned ElemWidth) const {			unsigned TargetTransformInfo::getMinimumVF(unsigned ElemWidth) const {
	return TTIImpl->getMinimumVF(ElemWidth);			return TTIImpl->getMinimumVF(ElemWidth);
	}			}

				unsigned TargetTransformInfo::getMaximumVF(unsigned ElemWidth,
				unsigned Opcode) const {
				return TTIImpl->getMaximumVF(ElemWidth, Opcode);
				}

	bool TargetTransformInfo::shouldConsiderAddressTypePromotion(			bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
	const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {			const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
	return TTIImpl->shouldConsiderAddressTypePromotion(			return TTIImpl->shouldConsiderAddressTypePromotion(
	I, AllowPromotionWithoutCommonHeader);			I, AllowPromotionWithoutCommonHeader);
	}			}

	unsigned TargetTransformInfo::getCacheLineSize() const {			unsigned TargetTransformInfo::getCacheLineSize() const {
	return TTIImpl->getCacheLineSize();			return TTIImpl->getCacheLineSize();
	▲ Show 20 Lines • Show All 803 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Show First 20 Lines • Show All 164 Lines • ▼ Show 20 Lines	TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_FastHardware;		return TTI::PSK_FastHardware;
}		}

unsigned getHardwareNumberOfRegisters(bool Vector) const;		unsigned getHardwareNumberOfRegisters(bool Vector) const;
unsigned getNumberOfRegisters(bool Vector) const;		unsigned getNumberOfRegisters(bool Vector) const;
unsigned getNumberOfRegisters(unsigned RCID) const;		unsigned getNumberOfRegisters(unsigned RCID) const;
unsigned getRegisterBitWidth(bool Vector) const;		unsigned getRegisterBitWidth(bool Vector) const;
unsigned getMinVectorRegisterBitWidth() const;		unsigned getMinVectorRegisterBitWidth() const;
		unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,		unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,		unsigned ChainSizeInBytes,
VectorType *VecTy) const;		VectorType *VecTy) const;
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,		unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
unsigned ChainSizeInBytes,		unsigned ChainSizeInBytes,
VectorType *VecTy) const;		VectorType *VecTy) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;		unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;

▲ Show 20 Lines • Show All 131 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

	Show First 20 Lines • Show All 282 Lines • ▼ Show 20 Lines
	unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {			unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
	return 32;			return 32;
	}			}

	unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {			unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
	return 32;			return 32;
	}			}

				unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
				if (Opcode == Instruction::Load || Opcode == Instruction::Store)
				return 32 * 4 / ElemWidth;
				return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 : 1;
				}

	unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,			unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
	unsigned ChainSizeInBytes,			unsigned ChainSizeInBytes,
	VectorType *VecTy) const {			VectorType *VecTy) const {
	unsigned VecRegBitWidth = VF * LoadSize;			unsigned VecRegBitWidth = VF * LoadSize;
	if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)			if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
	// TODO: Support element-size less than 32bit?			// TODO: Support element-size less than 32bit?
	return 128 / LoadSize;			return 128 / LoadSize;

	▲ Show 20 Lines • Show All 930 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 120 Lines • ▼ Show 20 Lines	static cl::opt<bool> ShouldStartVectorizeHorAtStore(
"slp-vectorize-hor-store", cl::init(false), cl::Hidden,		"slp-vectorize-hor-store", cl::init(false), cl::Hidden,
cl::desc(		cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store"));		"Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int>		static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,		MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));		cl::desc("Attempt to vectorize for this register size in bits"));

		static cl::opt<unsigned>
		jonpaUnsubmitted Done Reply Inline Actions maybe include "slp" in the description as well, to be really clear..? jonpa: maybe include "slp" in the description as well, to be really clear..?
		MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
		cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

static cl::opt<int>		static cl::opt<int>
MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,		MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
cl::desc("Maximum depth of the lookup for consecutive stores."));		cl::desc("Maximum depth of the lookup for consecutive stores."));

/// Limits the size of scheduling regions in a block.		/// Limits the size of scheduling regions in a block.
/// It avoid long compile times for _very_ large blocks where vector		/// It avoid long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.		/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.		/// This limit is way higher than needed by real-world functions.
▲ Show 20 Lines • Show All 599 Lines • ▼ Show 20 Lines	unsigned getMaxVecRegSize() const {
return MaxVecRegSize;		return MaxVecRegSize;
}		}

// \returns minimum vector register size as set by cl::opt.		// \returns minimum vector register size as set by cl::opt.
unsigned getMinVecRegSize() const {		unsigned getMinVecRegSize() const {
return MinVecRegSize;		return MinVecRegSize;
}		}

		unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
		jonpaUnsubmitted Not Done Reply Inline Actions Is there a reason for having the default TTI value of 0 instead of UINT_MAX directly as you first suggested? jonpa: Is there a reason for having the default TTI value of 0 instead of UINT_MAX directly as you…
		rampitecAuthorUnsubmitted Done Reply Inline Actions Yes, there is getMinimumVF() with default 0. This is just for the uniformity. rampitec: Yes, there is getMinimumVF() with default 0. This is just for the uniformity.
		unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
		MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
		return MaxVF ? MaxVF : UINT_MAX;
		}

/// Check if homogeneous aggregate is isomorphic to some VectorType.		/// Check if homogeneous aggregate is isomorphic to some VectorType.
/// Accepts homogeneous multidimensional aggregate of scalars/vectors like		/// Accepts homogeneous multidimensional aggregate of scalars/vectors like
/// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },		/// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
/// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.		/// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
///		///
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.		/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
unsigned canMapToVector(Type *T, const DataLayout &DL) const;		unsigned canMapToVector(Type *T, const DataLayout &DL) const;

▲ Show 20 Lines • Show All 5,434 Lines • ▼ Show 20 Lines	if (!isValidElementType(Ty)) {
});		});
return false;		return false;
}		}
}		}

unsigned Sz = R.getVectorElementSize(I0);		unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);		unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);		unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
		MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
if (MaxVF < 2) {		if (MaxVF < 2) {
R.getORE()->emit([&]() {		R.getORE()->emit([&]() {
return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)		return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
<< "Cannot SLP vectorize list: vectorization factor "		<< "Cannot SLP vectorize list: vectorization factor "
<< "less than 2 is not supported";		<< "less than 2 is not supported";
});		});
return false;		return false;
}		}
▲ Show 20 Lines • Show All 1,426 Lines • ▼ Show 20 Lines	bool SLPVectorizerPass::vectorizeSimpleInstructions(
Instructions.clear();		Instructions.clear();
return OpsChanged;		return OpsChanged;
}		}

bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {		bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;		bool Changed = false;
SmallVector<Value *, 4> Incoming;		SmallVector<Value *, 4> Incoming;
SmallPtrSet<Value *, 16> VisitedInstrs;		SmallPtrSet<Value *, 16> VisitedInstrs;
unsigned MaxVecRegSize = R.getMaxVecRegSize();

bool HaveVectorizedPhiNodes = true;		bool HaveVectorizedPhiNodes = true;
while (HaveVectorizedPhiNodes) {		while (HaveVectorizedPhiNodes) {
HaveVectorizedPhiNodes = false;		HaveVectorizedPhiNodes = false;

// Collect the incoming values from the PHIs.		// Collect the incoming values from the PHIs.
Incoming.clear();		Incoming.clear();
for (Instruction &I : *BB) {		for (Instruction &I : *BB) {
Show All 10 Lines	while (HaveVectorizedPhiNodes) {

// Try to vectorize elements base on their type.		// Try to vectorize elements base on their type.
for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),		for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
E = Incoming.end();		E = Incoming.end();
IncIt != E;) {		IncIt != E;) {

// Look for the next elements with the same type.		// Look for the next elements with the same type.
SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;		SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
Type *EltTy = (*IncIt)->getType();

assert(EltTy->isSized() &&
"Instructions should all be sized at this point");
TypeSize EltTS = DL->getTypeSizeInBits(EltTy);
if (EltTS.isScalable()) {
// For now, just ignore vectorizing scalable types.
++IncIt;
continue;
}

unsigned EltSize = EltTS.getFixedSize();
unsigned MaxNumElts = MaxVecRegSize / EltSize;
if (MaxNumElts < 2) {
++IncIt;
continue;
}

while (SameTypeIt != E &&		while (SameTypeIt != E &&
(*SameTypeIt)->getType() == EltTy &&		(*SameTypeIt)->getType() == (*IncIt)->getType()) {
static_cast<unsigned>(SameTypeIt - IncIt) < MaxNumElts) {
VisitedInstrs.insert(*SameTypeIt);		VisitedInstrs.insert(*SameTypeIt);
++SameTypeIt;		++SameTypeIt;
}		}

// Try to vectorize them.		// Try to vectorize them.
unsigned NumElts = (SameTypeIt - IncIt);		unsigned NumElts = (SameTypeIt - IncIt);
LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("		LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
<< NumElts << ")\n");		<< NumElts << ")\n");
▲ Show 20 Lines • Show All 236 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll

Show First 20 Lines • Show All 117 Lines • ▼ Show 20 Lines	bb:
%arg1.1 = extractelement <2 x i16> %arg1, i64 1		%arg1.1 = extractelement <2 x i16> %arg1, i64 1
%add.0 = call i16 @llvm.ssub.sat.i16(i16 %arg0.0, i16 %arg1.0)		%add.0 = call i16 @llvm.ssub.sat.i16(i16 %arg0.0, i16 %arg1.0)
%add.1 = call i16 @llvm.ssub.sat.i16(i16 %arg0.1, i16 %arg1.1)		%add.1 = call i16 @llvm.ssub.sat.i16(i16 %arg0.1, i16 %arg1.1)
%ins.0 = insertelement <2 x i16> undef, i16 %add.0, i64 0		%ins.0 = insertelement <2 x i16> undef, i16 %add.0, i64 0
%ins.1 = insertelement <2 x i16> %ins.0, i16 %add.1, i64 1		%ins.1 = insertelement <2 x i16> %ins.0, i16 %add.1, i64 1
ret <2 x i16> %ins.1		ret <2 x i16> %ins.1
}		}

; FIXME: Should not vectorize
define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {		define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
; GCN-LABEL: @uadd_sat_v2i32(		; GCN-LABEL: @uadd_sat_v2i32(
; GCN-NEXT: bb:		; GCN-NEXT: bb:
; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])		; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
; GCN-NEXT: ret <2 x i32> [[TMP0]]		; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
		; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
		; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
		; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
		; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
		; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
		; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
		; GCN-NEXT: ret <2 x i32> [[INS_1]]
;		;
bb:		bb:
%arg0.0 = extractelement <2 x i32> %arg0, i64 0		%arg0.0 = extractelement <2 x i32> %arg0, i64 0
%arg0.1 = extractelement <2 x i32> %arg0, i64 1		%arg0.1 = extractelement <2 x i32> %arg0, i64 1
%arg1.0 = extractelement <2 x i32> %arg1, i64 0		%arg1.0 = extractelement <2 x i32> %arg1, i64 0
%arg1.1 = extractelement <2 x i32> %arg1, i64 1		%arg1.1 = extractelement <2 x i32> %arg1, i64 1
%add.0 = call i32 @llvm.uadd.sat.i32(i32 %arg0.0, i32 %arg1.0)		%add.0 = call i32 @llvm.uadd.sat.i32(i32 %arg0.0, i32 %arg1.0)
%add.1 = call i32 @llvm.uadd.sat.i32(i32 %arg0.1, i32 %arg1.1)		%add.1 = call i32 @llvm.uadd.sat.i32(i32 %arg0.1, i32 %arg1.1)
%ins.0 = insertelement <2 x i32> undef, i32 %add.0, i64 0		%ins.0 = insertelement <2 x i32> undef, i32 %add.0, i64 0
%ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1		%ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1
ret <2 x i32> %ins.1		ret <2 x i32> %ins.1
}		}

define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {		define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
; GCN-LABEL: @usub_sat_v2i32(		; GCN-LABEL: @usub_sat_v2i32(
; GCN-NEXT: bb:		; GCN-NEXT: bb:
; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])		; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
; GCN-NEXT: ret <2 x i32> [[TMP0]]		; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
		; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
		; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
		; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
		; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
		; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
		; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
		; GCN-NEXT: ret <2 x i32> [[INS_1]]
;		;
bb:		bb:
%arg0.0 = extractelement <2 x i32> %arg0, i64 0		%arg0.0 = extractelement <2 x i32> %arg0, i64 0
%arg0.1 = extractelement <2 x i32> %arg0, i64 1		%arg0.1 = extractelement <2 x i32> %arg0, i64 1
%arg1.0 = extractelement <2 x i32> %arg1, i64 0		%arg1.0 = extractelement <2 x i32> %arg1, i64 0
%arg1.1 = extractelement <2 x i32> %arg1, i64 1		%arg1.1 = extractelement <2 x i32> %arg1, i64 1
%add.0 = call i32 @llvm.usub.sat.i32(i32 %arg0.0, i32 %arg1.0)		%add.0 = call i32 @llvm.usub.sat.i32(i32 %arg0.0, i32 %arg1.0)
%add.1 = call i32 @llvm.usub.sat.i32(i32 %arg0.1, i32 %arg1.1)		%add.1 = call i32 @llvm.usub.sat.i32(i32 %arg0.1, i32 %arg1.1)
%ins.0 = insertelement <2 x i32> undef, i32 %add.0, i64 0		%ins.0 = insertelement <2 x i32> undef, i32 %add.0, i64 0
%ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1		%ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1
ret <2 x i32> %ins.1		ret <2 x i32> %ins.1
}		}

define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {		define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
; GCN-LABEL: @sadd_sat_v2i32(		; GCN-LABEL: @sadd_sat_v2i32(
; GCN-NEXT: bb:		; GCN-NEXT: bb:
; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])		; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
; GCN-NEXT: ret <2 x i32> [[TMP0]]		; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
		; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
		; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
		; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
		; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
		; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
		; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
		; GCN-NEXT: ret <2 x i32> [[INS_1]]
;		;
bb:		bb:
%arg0.0 = extractelement <2 x i32> %arg0, i64 0		%arg0.0 = extractelement <2 x i32> %arg0, i64 0
%arg0.1 = extractelement <2 x i32> %arg0, i64 1		%arg0.1 = extractelement <2 x i32> %arg0, i64 1
%arg1.0 = extractelement <2 x i32> %arg1, i64 0		%arg1.0 = extractelement <2 x i32> %arg1, i64 0
%arg1.1 = extractelement <2 x i32> %arg1, i64 1		%arg1.1 = extractelement <2 x i32> %arg1, i64 1
%add.0 = call i32 @llvm.sadd.sat.i32(i32 %arg0.0, i32 %arg1.0)		%add.0 = call i32 @llvm.sadd.sat.i32(i32 %arg0.0, i32 %arg1.0)
%add.1 = call i32 @llvm.sadd.sat.i32(i32 %arg0.1, i32 %arg1.1)		%add.1 = call i32 @llvm.sadd.sat.i32(i32 %arg0.1, i32 %arg1.1)
%ins.0 = insertelement <2 x i32> undef, i32 %add.0, i64 0		%ins.0 = insertelement <2 x i32> undef, i32 %add.0, i64 0
%ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1		%ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1
ret <2 x i32> %ins.1		ret <2 x i32> %ins.1
}		}

define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {		define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
; GCN-LABEL: @ssub_sat_v2i32(		; GCN-LABEL: @ssub_sat_v2i32(
; GCN-NEXT: bb:		; GCN-NEXT: bb:
; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])		; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
; GCN-NEXT: ret <2 x i32> [[TMP0]]		; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
		; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
		; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
		; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
		; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
		; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
		; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
		; GCN-NEXT: ret <2 x i32> [[INS_1]]
;		;
bb:		bb:
%arg0.0 = extractelement <2 x i32> %arg0, i64 0		%arg0.0 = extractelement <2 x i32> %arg0, i64 0
%arg0.1 = extractelement <2 x i32> %arg0, i64 1		%arg0.1 = extractelement <2 x i32> %arg0, i64 1
%arg1.0 = extractelement <2 x i32> %arg1, i64 0		%arg1.0 = extractelement <2 x i32> %arg1, i64 0
%arg1.1 = extractelement <2 x i32> %arg1, i64 1		%arg1.1 = extractelement <2 x i32> %arg1, i64 1
%add.0 = call i32 @llvm.ssub.sat.i32(i32 %arg0.0, i32 %arg1.0)		%add.0 = call i32 @llvm.ssub.sat.i32(i32 %arg0.0, i32 %arg1.0)
%add.1 = call i32 @llvm.ssub.sat.i32(i32 %arg0.1, i32 %arg1.1)		%add.1 = call i32 @llvm.ssub.sat.i32(i32 %arg0.1, i32 %arg1.1)
▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines
; GFX7-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> undef, i16 [[ADD_0]], i64 0		; GFX7-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> undef, i16 [[ADD_0]], i64 0
; GFX7-NEXT: [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1		; GFX7-NEXT: [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
; GFX7-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2		; GFX7-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
; GFX7-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3		; GFX7-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
; GFX7-NEXT: ret <4 x i16> [[INS_3]]		; GFX7-NEXT: ret <4 x i16> [[INS_3]]
;		;
; GFX8-LABEL: @uadd_sat_v4i16(		; GFX8-LABEL: @uadd_sat_v4i16(
; GFX8-NEXT: bb:		; GFX8-NEXT: bb:
; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]])		; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: ret <4 x i16> [[TMP0]]		; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
		; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
		; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
		; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
		; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
		; GFX8-NEXT: [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
		; GFX8-NEXT: ret <4 x i16> [[INS_3]]
;		;
bb:		bb:
%arg0.0 = extractelement <4 x i16> %arg0, i64 0		%arg0.0 = extractelement <4 x i16> %arg0, i64 0
%arg0.1 = extractelement <4 x i16> %arg0, i64 1		%arg0.1 = extractelement <4 x i16> %arg0, i64 1
%arg0.2 = extractelement <4 x i16> %arg0, i64 2		%arg0.2 = extractelement <4 x i16> %arg0, i64 2
%arg0.3 = extractelement <4 x i16> %arg0, i64 3		%arg0.3 = extractelement <4 x i16> %arg0, i64 3
%arg1.0 = extractelement <4 x i16> %arg1, i64 0		%arg1.0 = extractelement <4 x i16> %arg1, i64 0
%arg1.1 = extractelement <4 x i16> %arg1, i64 1		%arg1.1 = extractelement <4 x i16> %arg1, i64 1
Show All 24 Lines

llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll

Show All 12 Lines	bb:
%tmp1 = tail call half @llvm.round.half(half %tmp)		%tmp1 = tail call half @llvm.round.half(half %tmp)
%tmp2 = insertelement <2 x half> undef, half %tmp1, i64 0		%tmp2 = insertelement <2 x half> undef, half %tmp1, i64 0
%tmp3 = extractelement <2 x half> %arg, i64 1		%tmp3 = extractelement <2 x half> %arg, i64 1
%tmp4 = tail call half @llvm.round.half(half %tmp3)		%tmp4 = tail call half @llvm.round.half(half %tmp3)
%tmp5 = insertelement <2 x half> %tmp2, half %tmp4, i64 1		%tmp5 = insertelement <2 x half> %tmp2, half %tmp4, i64 1
ret <2 x half> %tmp5		ret <2 x half> %tmp5
}		}

; TODO: Should probably not really be vectorizing this
; GCN-LABEL: @round_v2f32(		; GCN-LABEL: @round_v2f32(
; GCN: call <2 x float> @llvm.round.v2f32		; GCN: call float @llvm.round.f32(
		; GCN: call float @llvm.round.f32(
define <2 x float> @round_v2f32(<2 x float> %arg) {		define <2 x float> @round_v2f32(<2 x float> %arg) {
bb:		bb:
%tmp = extractelement <2 x float> %arg, i64 0		%tmp = extractelement <2 x float> %arg, i64 0
%tmp1 = tail call float @llvm.round.f32(float %tmp)		%tmp1 = tail call float @llvm.round.f32(float %tmp)
%tmp2 = insertelement <2 x float> undef, float %tmp1, i64 0		%tmp2 = insertelement <2 x float> undef, float %tmp1, i64 0
%tmp3 = extractelement <2 x float> %arg, i64 1		%tmp3 = extractelement <2 x float> %arg, i64 1
%tmp4 = tail call float @llvm.round.f32(float %tmp3)		%tmp4 = tail call float @llvm.round.f32(float %tmp3)
%tmp5 = insertelement <2 x float> %tmp2, float %tmp4, i64 1		%tmp5 = insertelement <2 x float> %tmp2, float %tmp4, i64 1
ret <2 x float> %tmp5		ret <2 x float> %tmp5
}		}

declare half @llvm.round.half(half) #0		declare half @llvm.round.half(half) #0
declare float @llvm.round.f32(float) #0		declare float @llvm.round.f32(float) #0

attributes #0 = { nounwind readnone speculatable willreturn }		attributes #0 = { nounwind readnone speculatable willreturn }

llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S -slp-max-reg-size=32 < %s | FileCheck -check-prefix=MAX32 %s		; RUN: opt -slp-vectorizer -S -slp-max-vf=1 < %s | FileCheck -check-prefix=MAX32 %s
; RUN: opt -slp-vectorizer -S -slp-max-reg-size=256 < %s | FileCheck -check-prefix=MAX256 %s		; RUN: opt -slp-vectorizer -S -slp-max-vf=8 < %s | FileCheck -check-prefix=MAX256 %s
; RUN: opt -slp-vectorizer -S -slp-max-reg-size=1024 < %s | FileCheck -check-prefix=MAX1024 %s		; RUN: opt -slp-vectorizer -S -slp-max-vf=32 < %s | FileCheck -check-prefix=MAX1024 %s
		; RUN: opt -slp-vectorizer -S < %s | FileCheck -check-prefix=MAX1024 %s

		; Make sure we do not vectorize to create PHI wider than requested.
		; On AMDGPU target wider vectorization will result in a higher register pressure,
		; spilling, or even inability to allocate registers.

define void @phi_float32(half %hval, float %fval) {		define void @phi_float32(half %hval, float %fval) {
; MAX32-LABEL: @phi_float32(		; MAX32-LABEL: @phi_float32(
; MAX32-NEXT: bb:		; MAX32-NEXT: bb:
; MAX32-NEXT: br label [[BB1:%.*]]		; MAX32-NEXT: br label [[BB1:%.*]]
; MAX32: bb1:		; MAX32: bb1:
; MAX32-NEXT: [[I:%.*]] = fpext half [[HVAL:%.*]] to float		; MAX32-NEXT: [[I:%.*]] = fpext half [[HVAL:%.*]] to float
; MAX32-NEXT: [[I1:%.*]] = fmul float [[I]], [[FVAL:%.*]]		; MAX32-NEXT: [[I1:%.*]] = fmul float [[I]], [[FVAL:%.*]]
▲ Show 20 Lines • Show All 102 Lines • ▼ Show 20 Lines
; MAX32-NEXT: [[PHI25:%.*]] = phi float [ [[I53]], [[BB3]] ], [ [[I53]], [[BB4]] ], [ [[I53]], [[BB5]] ], [ [[FVAL]], [[BB1]] ]		; MAX32-NEXT: [[PHI25:%.*]] = phi float [ [[I53]], [[BB3]] ], [ [[I53]], [[BB4]] ], [ [[I53]], [[BB5]] ], [ [[FVAL]], [[BB1]] ]
; MAX32-NEXT: [[PHI26:%.*]] = phi float [ [[I55]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[FVAL]], [[BB1]] ]		; MAX32-NEXT: [[PHI26:%.*]] = phi float [ [[I55]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[FVAL]], [[BB1]] ]
; MAX32-NEXT: [[PHI27:%.*]] = phi float [ [[I57]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[I57]], [[BB5]] ], [ [[I57]], [[BB1]] ]		; MAX32-NEXT: [[PHI27:%.*]] = phi float [ [[I57]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[I57]], [[BB5]] ], [ [[I57]], [[BB1]] ]
; MAX32-NEXT: [[PHI28:%.*]] = phi float [ [[I59]], [[BB3]] ], [ [[I59]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[I59]], [[BB1]] ]		; MAX32-NEXT: [[PHI28:%.*]] = phi float [ [[I59]], [[BB3]] ], [ [[I59]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[I59]], [[BB1]] ]
; MAX32-NEXT: [[PHI29:%.*]] = phi float [ [[I61]], [[BB3]] ], [ [[I61]], [[BB4]] ], [ [[I61]], [[BB5]] ], [ [[FVAL]], [[BB1]] ]		; MAX32-NEXT: [[PHI29:%.*]] = phi float [ [[I61]], [[BB3]] ], [ [[I61]], [[BB4]] ], [ [[I61]], [[BB5]] ], [ [[FVAL]], [[BB1]] ]
; MAX32-NEXT: [[PHI30:%.*]] = phi float [ [[I63]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[FVAL]], [[BB1]] ]		; MAX32-NEXT: [[PHI30:%.*]] = phi float [ [[I63]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[FVAL]], [[BB1]] ]
; MAX32-NEXT: [[PHI31:%.*]] = phi float [ [[I65]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[I65]], [[BB5]] ], [ [[I65]], [[BB1]] ]		; MAX32-NEXT: [[PHI31:%.*]] = phi float [ [[I65]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[I65]], [[BB5]] ], [ [[I65]], [[BB1]] ]
; MAX32-NEXT: [[PHI32:%.*]] = phi float [ [[I67]], [[BB3]] ], [ [[I67]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[I67]], [[BB1]] ]		; MAX32-NEXT: [[PHI32:%.*]] = phi float [ [[I67]], [[BB3]] ], [ [[I67]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[I67]], [[BB1]] ]
		; MAX32-NEXT: store float [[PHI31]], float* undef, align 4
; MAX32-NEXT: ret void		; MAX32-NEXT: ret void
;		;
; MAX256-LABEL: @phi_float32(		; MAX256-LABEL: @phi_float32(
; MAX256-NEXT: bb:		; MAX256-NEXT: bb:
; MAX256-NEXT: br label [[BB1:%.*]]		; MAX256-NEXT: br label [[BB1:%.*]]
; MAX256: bb1:		; MAX256: bb1:
; MAX256-NEXT: [[TMP0:%.*]] = insertelement <4 x half> undef, half [[HVAL:%.*]], i32 0		; MAX256-NEXT: [[TMP0:%.*]] = insertelement <4 x half> undef, half [[HVAL:%.*]], i32 0
; MAX256-NEXT: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[HVAL]], i32 1		; MAX256-NEXT: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[HVAL]], i32 1
▲ Show 20 Lines • Show All 160 Lines • ▼ Show 20 Lines
; MAX256-NEXT: [[TMP151:%.*]] = insertelement <8 x float> [[TMP150]], float [[TMP71]], i32 6		; MAX256-NEXT: [[TMP151:%.*]] = insertelement <8 x float> [[TMP150]], float [[TMP71]], i32 6
; MAX256-NEXT: [[TMP152:%.*]] = insertelement <8 x float> [[TMP151]], float [[FVAL]], i32 7		; MAX256-NEXT: [[TMP152:%.*]] = insertelement <8 x float> [[TMP151]], float [[FVAL]], i32 7
; MAX256-NEXT: br label [[BB2]]		; MAX256-NEXT: br label [[BB2]]
; MAX256: bb2:		; MAX256: bb2:
; MAX256-NEXT: [[TMP153:%.*]] = phi <8 x float> [ [[TMP14]], [[BB3]] ], [ [[TMP83]], [[BB4]] ], [ [[TMP122]], [[BB5]] ], [ [[TMP44]], [[BB1]] ]		; MAX256-NEXT: [[TMP153:%.*]] = phi <8 x float> [ [[TMP14]], [[BB3]] ], [ [[TMP83]], [[BB4]] ], [ [[TMP122]], [[BB5]] ], [ [[TMP44]], [[BB1]] ]
; MAX256-NEXT: [[TMP154:%.*]] = phi <8 x float> [ [[TMP28]], [[BB3]] ], [ [[TMP93]], [[BB4]] ], [ [[TMP132]], [[BB5]] ], [ [[TMP54]], [[BB1]] ]		; MAX256-NEXT: [[TMP154:%.*]] = phi <8 x float> [ [[TMP28]], [[BB3]] ], [ [[TMP93]], [[BB4]] ], [ [[TMP132]], [[BB5]] ], [ [[TMP54]], [[BB1]] ]
; MAX256-NEXT: [[TMP155:%.*]] = phi <8 x float> [ [[TMP30]], [[BB3]] ], [ [[TMP103]], [[BB4]] ], [ [[TMP142]], [[BB5]] ], [ [[TMP64]], [[BB1]] ]		; MAX256-NEXT: [[TMP155:%.*]] = phi <8 x float> [ [[TMP30]], [[BB3]] ], [ [[TMP103]], [[BB4]] ], [ [[TMP142]], [[BB5]] ], [ [[TMP64]], [[BB1]] ]
; MAX256-NEXT: [[TMP156:%.*]] = phi <8 x float> [ [[TMP32]], [[BB3]] ], [ [[TMP113]], [[BB4]] ], [ [[TMP152]], [[BB5]] ], [ [[TMP74]], [[BB1]] ]		; MAX256-NEXT: [[TMP156:%.*]] = phi <8 x float> [ [[TMP32]], [[BB3]] ], [ [[TMP113]], [[BB4]] ], [ [[TMP152]], [[BB5]] ], [ [[TMP74]], [[BB1]] ]
		; MAX256-NEXT: [[TMP157:%.*]] = extractelement <8 x float> [[TMP156]], i32 6
		; MAX256-NEXT: store float [[TMP157]], float* undef, align 4
; MAX256-NEXT: ret void		; MAX256-NEXT: ret void
;		;
; MAX1024-LABEL: @phi_float32(		; MAX1024-LABEL: @phi_float32(
; MAX1024-NEXT: bb:		; MAX1024-NEXT: bb:
; MAX1024-NEXT: br label [[BB1:%.*]]		; MAX1024-NEXT: br label [[BB1:%.*]]
; MAX1024: bb1:		; MAX1024: bb1:
; MAX1024-NEXT: [[TMP0:%.*]] = insertelement <4 x half> undef, half [[HVAL:%.*]], i32 0		; MAX1024-NEXT: [[TMP0:%.*]] = insertelement <4 x half> undef, half [[HVAL:%.*]], i32 0
; MAX1024-NEXT: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[HVAL]], i32 1		; MAX1024-NEXT: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[HVAL]], i32 1
▲ Show 20 Lines • Show All 169 Lines • ▼ Show 20 Lines
; MAX1024-NEXT: [[TMP160:%.*]] = extractelement <32 x float> [[TMP38]], i32 28		; MAX1024-NEXT: [[TMP160:%.*]] = extractelement <32 x float> [[TMP38]], i32 28
; MAX1024-NEXT: [[TMP161:%.*]] = insertelement <32 x float> [[TMP159]], float [[TMP160]], i32 28		; MAX1024-NEXT: [[TMP161:%.*]] = insertelement <32 x float> [[TMP159]], float [[TMP160]], i32 28
; MAX1024-NEXT: [[TMP162:%.*]] = insertelement <32 x float> [[TMP161]], float [[FVAL]], i32 29		; MAX1024-NEXT: [[TMP162:%.*]] = insertelement <32 x float> [[TMP161]], float [[FVAL]], i32 29
; MAX1024-NEXT: [[TMP163:%.*]] = insertelement <32 x float> [[TMP162]], float [[TMP83]], i32 30		; MAX1024-NEXT: [[TMP163:%.*]] = insertelement <32 x float> [[TMP162]], float [[TMP83]], i32 30
; MAX1024-NEXT: [[TMP164:%.*]] = insertelement <32 x float> [[TMP163]], float [[FVAL]], i32 31		; MAX1024-NEXT: [[TMP164:%.*]] = insertelement <32 x float> [[TMP163]], float [[FVAL]], i32 31
; MAX1024-NEXT: br label [[BB2]]		; MAX1024-NEXT: br label [[BB2]]
; MAX1024: bb2:		; MAX1024: bb2:
; MAX1024-NEXT: [[TMP165:%.*]] = phi <32 x float> [ [[TMP38]], [[BB3]] ], [ [[TMP125]], [[BB4]] ], [ [[TMP164]], [[BB5]] ], [ [[TMP86]], [[BB1]] ]		; MAX1024-NEXT: [[TMP165:%.*]] = phi <32 x float> [ [[TMP38]], [[BB3]] ], [ [[TMP125]], [[BB4]] ], [ [[TMP164]], [[BB5]] ], [ [[TMP86]], [[BB1]] ]
		; MAX1024-NEXT: [[TMP166:%.*]] = extractelement <32 x float> [[TMP165]], i32 30
		; MAX1024-NEXT: store float [[TMP166]], float* undef, align 4
; MAX1024-NEXT: ret void		; MAX1024-NEXT: ret void
;		;
bb:		bb:
br label %bb1		br label %bb1

bb1:		bb1:
%i = fpext half %hval to float		%i = fpext half %hval to float
%i1 = fmul float %i, %fval		%i1 = fmul float %i, %fval
▲ Show 20 Lines • Show All 106 Lines • ▼ Show 20 Lines	bb2:
%phi25 = phi float [ %i53, %bb3 ], [ %i53, %bb4 ], [ %i53, %bb5 ], [ %fval, %bb1 ]		%phi25 = phi float [ %i53, %bb3 ], [ %i53, %bb4 ], [ %i53, %bb5 ], [ %fval, %bb1 ]
%phi26 = phi float [ %i55, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ]		%phi26 = phi float [ %i55, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ]
%phi27 = phi float [ %i57, %bb3 ], [ %fval, %bb4 ], [ %i57, %bb5 ], [ %i57, %bb1 ]		%phi27 = phi float [ %i57, %bb3 ], [ %fval, %bb4 ], [ %i57, %bb5 ], [ %i57, %bb1 ]
%phi28 = phi float [ %i59, %bb3 ], [ %i59, %bb4 ], [ %fval, %bb5 ], [ %i59, %bb1 ]		%phi28 = phi float [ %i59, %bb3 ], [ %i59, %bb4 ], [ %fval, %bb5 ], [ %i59, %bb1 ]
%phi29 = phi float [ %i61, %bb3 ], [ %i61, %bb4 ], [ %i61, %bb5 ], [ %fval, %bb1 ]		%phi29 = phi float [ %i61, %bb3 ], [ %i61, %bb4 ], [ %i61, %bb5 ], [ %fval, %bb1 ]
%phi30 = phi float [ %i63, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ]		%phi30 = phi float [ %i63, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ]
%phi31 = phi float [ %i65, %bb3 ], [ %fval, %bb4 ], [ %i65, %bb5 ], [ %i65, %bb1 ]		%phi31 = phi float [ %i65, %bb3 ], [ %fval, %bb4 ], [ %i65, %bb5 ], [ %i65, %bb1 ]
%phi32 = phi float [ %i67, %bb3 ], [ %i67, %bb4 ], [ %fval, %bb5 ], [ %i67, %bb1 ]		%phi32 = phi float [ %i67, %bb3 ], [ %i67, %bb4 ], [ %fval, %bb5 ], [ %i67, %bb1 ]
		store float %phi31, float* undef
ret void		ret void
}		}

This is an archive of the discontinued LLVM Phabricator instance.

[SLP] Control maximum vectorization factor from TTI
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 311613

llvm/include/llvm/Analysis/TargetTransformInfo.h

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

llvm/lib/Analysis/TargetTransformInfo.cpp

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll

llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll

llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SLP] Control maximum vectorization factor from TTI
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 311613

llvm/include/llvm/Analysis/TargetTransformInfo.h

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

llvm/lib/Analysis/TargetTransformInfo.cpp

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll

llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll

llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll

[SLP] Control maximum vectorization factor from TTI
ClosedPublic