Diff 221573

include/llvm/Transforms/Vectorize/SLPVectorizer.h

Show First 20 Lines • Show All 132 Lines • ▼ Show 20 Lines	private:
bool vectorizeSimpleInstructions(SmallVectorImpl<Instruction *> &Instructions,		bool vectorizeSimpleInstructions(SmallVectorImpl<Instruction *> &Instructions,
BasicBlock *BB, slpvectorizer::BoUpSLP &R);		BasicBlock *BB, slpvectorizer::BoUpSLP &R);

/// Scan the basic block and look for patterns that are likely to start		/// Scan the basic block and look for patterns that are likely to start
/// a vectorization chain.		/// a vectorization chain.
bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);		bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);

bool vectorizeStoreChain(ArrayRef<Value *> Chain, slpvectorizer::BoUpSLP &R,		bool vectorizeStoreChain(ArrayRef<Value *> Chain, slpvectorizer::BoUpSLP &R,
unsigned VecRegSize);		unsigned Idx);

bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);		bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);

/// The store instructions in a basic block organized by base pointer.		/// The store instructions in a basic block organized by base pointer.
StoreListMap Stores;		StoreListMap Stores;

/// The getelementptr instructions in a basic block organized by base pointer.		/// The getelementptr instructions in a basic block organized by base pointer.
GEPListMap GEPs;		GEPListMap GEPs;
};		};

} // end namespace llvm		} // end namespace llvm

#endif // LLVM_TRANSFORMS_VECTORIZE_SLPVECTORIZER_H		#endif // LLVM_TRANSFORMS_VECTORIZE_SLPVECTORIZER_H

lib/Transforms/Vectorize/SLPVectorizer.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,272 Lines • ▼ Show 20 Lines	if (Changed) {
R.optimizeGatherSequence();		R.optimizeGatherSequence();
LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");		LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
LLVM_DEBUG(verifyFunction(F));		LLVM_DEBUG(verifyFunction(F));
}		}
return Changed;		return Changed;
}		}

bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,		bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned VecRegSize) {		unsigned Idx) {
const unsigned ChainLen = Chain.size();		const unsigned ChainLen = Chain.size();
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen		LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
<< "\n");		<< "\n");
const unsigned Sz = R.getVectorElementSize(Chain[0]);		const unsigned Sz = R.getVectorElementSize(Chain[0]);
const unsigned VF = VecRegSize / Sz;		const unsigned MinVF = R.getMinVecRegSize() / Sz;
		unsigned VF = Chain.size();

if (!isPowerOf2_32(Sz) \|\| VF < 2)		if (!isPowerOf2_32(Sz) \|\| !isPowerOf2_32(VF) \|\| VF < 2 \|\| VF < MinVF)
return false;		return false;

bool Changed = false;		LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
// Look for profitable vectorizable trees at all offsets, starting at zero.
for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {

ArrayRef<Value *> Operands = Chain.slice(i, VF);
// Check that a previous iteration of this loop did not delete the Value.
if (llvm::any_of(Operands, [&R](Value *V) {
auto *I = dyn_cast<Instruction>(V);
return I && R.isDeleted(I);
}))
continue;

LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
<< "\n");		<< "\n");

R.buildTree(Operands);		R.buildTree(Chain);
if (R.isTreeTinyAndNotFullyVectorizable())		if (R.isTreeTinyAndNotFullyVectorizable())
continue;		return false;

R.computeMinimumValueSizes();		R.computeMinimumValueSizes();

int Cost = R.getTreeCost();		int Cost = R.getTreeCost();

LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF		LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
<< "\n");
if (Cost < -SLPCostThreshold) {		if (Cost < -SLPCostThreshold) {
LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");		LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");

using namespace ore;		using namespace ore;

R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",		R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
cast<StoreInst>(Chain[i]))		cast<StoreInst>(Chain[0]))
<< "Stores SLP vectorized with cost " << NV("Cost", Cost)		<< "Stores SLP vectorized with cost " << NV("Cost", Cost)
<< " and with tree size "		<< " and with tree size "
<< NV("TreeSize", R.getTreeSize()));		<< NV("TreeSize", R.getTreeSize()));

R.vectorizeTree();		R.vectorizeTree();
		return true;
// Move to the next bundle.
i += VF - 1;
Changed = true;
}
}		}

return Changed;		return false;
}		}

bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,		bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
BoUpSLP &R) {		BoUpSLP &R) {
SetVector<StoreInst *> Heads;
SmallDenseSet<StoreInst *> Tails;
SmallDenseMap<StoreInst , StoreInst > ConsecutiveChain;

// We may run into multiple chains that merge into a single chain. We mark the		// We may run into multiple chains that merge into a single chain. We mark the
// stores that we vectorized so that we don't visit the same store twice.		// stores that we vectorized so that we don't visit the same store twice.
BoUpSLP::ValueSet VectorizedStores;		BoUpSLP::ValueSet VectorizedStores;
bool Changed = false;		bool Changed = false;

auto &&FindConsecutiveAccess =		int E = Stores.size();
		lebedev.riUnsubmitted Not Done Reply Inline Actions The `std::max()` is not a typo? Might warrant a comment. lebedev.ri: The `std::max()` is not a typo? Might warrant a comment.
		ABataevAuthorUnsubmitted Not Done Reply Inline Actions Yes, it is a typo, must be `min`, thanks! ABataev: Yes, it is a typo, must be `min`, thanks!
		lebedev.riUnsubmitted Done Reply Inline Actions Then you probably want to add a test :) lebedev.ri: Then you //probably// want to add a test :)
[this, &Stores, &Heads, &Tails, &ConsecutiveChain] (int K, int Idx) {		SmallVector<bool, 16> Tails(E, false);
		RKSimonUnsubmitted Not Done Reply Inline Actions Better to use SmallBitVector or APInt ? RKSimon: Better to use SmallBitVector or APInt ?
		SmallVector<int, 16> ConsecutiveChain(E, E + 1);
		auto &&FindConsecutiveAccess = [this, &Stores, &Tails,
		&ConsecutiveChain](int K, int Idx) {
if (!isConsecutiveAccess(Stores[K], Stores[Idx], DL, SE))		if (!isConsecutiveAccess(Stores[K], Stores[Idx], DL, SE))
return false;		return false;

Tails.insert(Stores[Idx]);		Tails[Idx] = true;
Heads.insert(Stores[K]);		ConsecutiveChain[K] = Idx;
ConsecutiveChain[Stores[K]] = Stores[Idx];
return true;		return true;
};		};

// Do a quadratic search on all of the given stores in reverse order and find		// Do a quadratic search on all of the given stores in reverse order and find
// all of the pairs of stores that follow each other.		// all of the pairs of stores that follow each other.
int E = Stores.size();
for (int Idx = E - 1; Idx >= 0; --Idx) {		for (int Idx = E - 1; Idx >= 0; --Idx) {
// If a store has multiple consecutive store candidates, search according		// If a store has multiple consecutive store candidates, search according
// to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...		// to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
// This is because usually pairing with immediate succeeding or preceding		// This is because usually pairing with immediate succeeding or preceding
// candidate create the best chance to find slp vectorization opportunity.		// candidate create the best chance to find slp vectorization opportunity.
for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset)		const int MaxLookDepth = std::min(E - Idx, 16);
		for (int Offset = 1, F = std::max(MaxLookDepth, Idx + 1); Offset < F;
		++Offset)
if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) \|\|		if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) \|\|
(Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))		(Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
break;		break;
}		}

// For stores that start but don't end a link in the chain:		// For stores that start but don't end a link in the chain:
for (auto *SI : llvm::reverse(Heads)) {		for (int Cnt = E; Cnt > 0; --Cnt) {
if (Tails.count(SI))		int I = Cnt - 1;
		if (ConsecutiveChain[I] == E + 1 \|\| Tails[I])
continue;		continue;

// We found a store instr that starts a chain. Now follow the chain and try		// We found a store instr that starts a chain. Now follow the chain and try
// to vectorize it.		// to vectorize it.
BoUpSLP::ValueList Operands;		BoUpSLP::ValueList Operands;
StoreInst *I = SI;
// Collect the chain into a list.		// Collect the chain into a list.
while ((Tails.count(I) \|\| Heads.count(I)) && !VectorizedStores.count(I)) {		while (I != E + 1 && !VectorizedStores.count(Stores[I])) {
Operands.push_back(I);		Operands.push_back(Stores[I]);
// Move to the next value in the chain.		// Move to the next value in the chain.
I = ConsecutiveChain[I];		I = ConsecutiveChain[I];
}		}

// FIXME: Is division-by-2 the correct step? Should we assert that the		// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?		// register size is a power-of-2?
for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();		unsigned StartIdx = 0;
		for (unsigned Size = llvm::PowerOf2Ceil(Operands.size()); Size >= 2;
		RKSimonUnsubmitted Not Done Reply Inline Actions Would it be better to use: if ((MaxVecRegSize % EltSize) != 0) RKSimon: Would it be better to use: ``` if ((MaxVecRegSize % EltSize) != 0) ```
Size /= 2) {		Size /= 2) {
if (vectorizeStoreChain(Operands, R, Size)) {		for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
		ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
		if (!VectorizedStores.count(Slice.front()) &&
		!VectorizedStores.count(Slice.back()) &&
		vectorizeStoreChain(Slice, R, Cnt)) {
// Mark the vectorized stores so that we don't vectorize them again.		// Mark the vectorized stores so that we don't vectorize them again.
VectorizedStores.insert(Operands.begin(), Operands.end());		VectorizedStores.insert(Slice.begin(), Slice.end());
Changed = true;		Changed = true;
break;		// If we vectorized initial block, no need to try to vectorize it
		// again.
		if (Cnt == StartIdx)
		StartIdx += Size;
		Cnt += Size;
		continue;
}		}
		++Cnt;
		}
		// Check if the whole array was vectorized already - exit.
		if (StartIdx >= Operands.size())
		break;
}		}
}		}

return Changed;		return Changed;
}		}

void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {		void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
// Initialize the collections. We will make a single pass over the block.		// Initialize the collections. We will make a single pass over the block.
▲ Show 20 Lines • Show All 1,643 Lines • ▼ Show 20 Lines	bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;		for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
++it) {		++it) {
if (it->second.size() < 2)		if (it->second.size() < 2)
continue;		continue;

LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "		LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
<< it->second.size() << ".\n");		<< it->second.size() << ".\n");

// Process the stores in chunks of 16.		Changed \|= vectorizeStores(it->second, R);
// TODO: The limit of 16 inhibits greater vectorization factors.
// For example, AVX2 supports v32i8. Increasing this limit, however,
// may cause a significant compile-time increase.
for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
unsigned Len = std::min<unsigned>(CE - CI, 16);
Changed \|= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
}
}		}
return Changed;		return Changed;
}		}

char SLPVectorizer::ID = 0;		char SLPVectorizer::ID = 0;

static const char lv_name[] = "SLP Vectorizer";		static const char lv_name[] = "SLP Vectorizer";

Show All 11 Lines

test/Transforms/LoopVectorize/X86/metadata-enable.ll

	Show First 20 Lines • Show All 1,766 Lines • ▼ Show 20 Lines
	; O3-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48			; O3-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
	; O3-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0			; O3-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
	; O3: for.end:			; O3: for.end:
	; O3-NEXT: [[TMP1:%.]] = load i32, i32 [[A]], align 4			; O3-NEXT: [[TMP1:%.]] = load i32, i32 [[A]], align 4
	; O3-NEXT: ret i32 [[TMP1]]			; O3-NEXT: ret i32 [[TMP1]]
	;			;
	; O3DEFAULT-LABEL: @disabled(			; O3DEFAULT-LABEL: @disabled(
	; O3DEFAULT-NEXT: entry:			; O3DEFAULT-NEXT: entry:
	; O3DEFAULT-NEXT: [[TMP0:%.]] = bitcast i32 [[B:%.]] to <4 x i32>			; O3DEFAULT-NEXT: [[TMP0:%.]] = bitcast i32 [[B:%.]] to <32 x i32>
	; O3DEFAULT-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> [[TMP0]], align 4			; O3DEFAULT-NEXT: [[TMP1:%.]] = load <32 x i32>, <32 x i32> [[TMP0]], align 4
	; O3DEFAULT-NEXT: [[TMP2:%.]] = insertelement <4 x i32> undef, i32 [[N:%.]], i32 0			; O3DEFAULT-NEXT: [[TMP2:%.]] = insertelement <32 x i32> undef, i32 [[N:%.]], i32 0
	; O3DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer			; O3DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> zeroinitializer
	; O3DEFAULT-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP1]], [[TMP3]]			; O3DEFAULT-NEXT: [[TMP4:%.*]] = add nsw <32 x i32> [[TMP1]], [[TMP3]]
	; O3DEFAULT-NEXT: [[TMP5:%.]] = bitcast i32 [[A:%.]] to <4 x i32>			; O3DEFAULT-NEXT: [[TMP5:%.]] = bitcast i32 [[A:%.]] to <32 x i32>
	; O3DEFAULT-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4			; O3DEFAULT-NEXT: store <32 x i32> [[TMP4]], <32 x i32>* [[TMP5]], align 4
	; O3DEFAULT-NEXT: [[ARRAYIDX_4:%.]] = getelementptr inbounds i32, i32 [[B]], i64 4
	; O3DEFAULT-NEXT: [[ARRAYIDX2_4:%.]] = getelementptr inbounds i32, i32 [[A]], i64 4
	; O3DEFAULT-NEXT: [[TMP6:%.]] = bitcast i32 [[ARRAYIDX_4]] to <4 x i32>*
	; O3DEFAULT-NEXT: [[TMP7:%.]] = load <4 x i32>, <4 x i32> [[TMP6]], align 4
	; O3DEFAULT-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP3]]
	; O3DEFAULT-NEXT: [[TMP9:%.]] = bitcast i32 [[ARRAYIDX2_4]] to <4 x i32>*
	; O3DEFAULT-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4
	; O3DEFAULT-NEXT: [[ARRAYIDX_8:%.]] = getelementptr inbounds i32, i32 [[B]], i64 8
	; O3DEFAULT-NEXT: [[ARRAYIDX2_8:%.]] = getelementptr inbounds i32, i32 [[A]], i64 8
	; O3DEFAULT-NEXT: [[TMP10:%.]] = bitcast i32 [[ARRAYIDX_8]] to <4 x i32>*
	; O3DEFAULT-NEXT: [[TMP11:%.]] = load <4 x i32>, <4 x i32> [[TMP10]], align 4
	; O3DEFAULT-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP3]]
	; O3DEFAULT-NEXT: [[TMP13:%.]] = bitcast i32 [[ARRAYIDX2_8]] to <4 x i32>*
	; O3DEFAULT-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
	; O3DEFAULT-NEXT: [[ARRAYIDX_12:%.]] = getelementptr inbounds i32, i32 [[B]], i64 12
	; O3DEFAULT-NEXT: [[ARRAYIDX2_12:%.]] = getelementptr inbounds i32, i32 [[A]], i64 12
	; O3DEFAULT-NEXT: [[TMP14:%.]] = bitcast i32 [[ARRAYIDX_12]] to <4 x i32>*
	; O3DEFAULT-NEXT: [[TMP15:%.]] = load <4 x i32>, <4 x i32> [[TMP14]], align 4
	; O3DEFAULT-NEXT: [[TMP16:%.*]] = add nsw <4 x i32> [[TMP15]], [[TMP3]]
	; O3DEFAULT-NEXT: [[TMP17:%.]] = bitcast i32 [[ARRAYIDX2_12]] to <4 x i32>*
	; O3DEFAULT-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP17]], align 4
	; O3DEFAULT-NEXT: [[ARRAYIDX_16:%.]] = getelementptr inbounds i32, i32 [[B]], i64 16
	; O3DEFAULT-NEXT: [[ARRAYIDX2_16:%.]] = getelementptr inbounds i32, i32 [[A]], i64 16
	; O3DEFAULT-NEXT: [[TMP18:%.]] = bitcast i32 [[ARRAYIDX_16]] to <4 x i32>*
	; O3DEFAULT-NEXT: [[TMP19:%.]] = load <4 x i32>, <4 x i32> [[TMP18]], align 4
	; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[TMP19]], [[TMP3]]
	; O3DEFAULT-NEXT: [[TMP21:%.]] = bitcast i32 [[ARRAYIDX2_16]] to <4 x i32>*
	; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP21]], align 4
	; O3DEFAULT-NEXT: [[ARRAYIDX_20:%.]] = getelementptr inbounds i32, i32 [[B]], i64 20
	; O3DEFAULT-NEXT: [[ARRAYIDX2_20:%.]] = getelementptr inbounds i32, i32 [[A]], i64 20
	; O3DEFAULT-NEXT: [[TMP22:%.]] = bitcast i32 [[ARRAYIDX_20]] to <4 x i32>*
	; O3DEFAULT-NEXT: [[TMP23:%.]] = load <4 x i32>, <4 x i32> [[TMP22]], align 4
	; O3DEFAULT-NEXT: [[TMP24:%.*]] = add nsw <4 x i32> [[TMP23]], [[TMP3]]
	; O3DEFAULT-NEXT: [[TMP25:%.]] = bitcast i32 [[ARRAYIDX2_20]] to <4 x i32>*
	; O3DEFAULT-NEXT: store <4 x i32> [[TMP24]], <4 x i32>* [[TMP25]], align 4
	; O3DEFAULT-NEXT: [[ARRAYIDX_24:%.]] = getelementptr inbounds i32, i32 [[B]], i64 24
	; O3DEFAULT-NEXT: [[ARRAYIDX2_24:%.]] = getelementptr inbounds i32, i32 [[A]], i64 24
	; O3DEFAULT-NEXT: [[TMP26:%.]] = bitcast i32 [[ARRAYIDX_24]] to <4 x i32>*
	; O3DEFAULT-NEXT: [[TMP27:%.]] = load <4 x i32>, <4 x i32> [[TMP26]], align 4
	; O3DEFAULT-NEXT: [[TMP28:%.*]] = add nsw <4 x i32> [[TMP27]], [[TMP3]]
	; O3DEFAULT-NEXT: [[TMP29:%.]] = bitcast i32 [[ARRAYIDX2_24]] to <4 x i32>*
	; O3DEFAULT-NEXT: store <4 x i32> [[TMP28]], <4 x i32>* [[TMP29]], align 4
	; O3DEFAULT-NEXT: [[ARRAYIDX_28:%.]] = getelementptr inbounds i32, i32 [[B]], i64 28
	; O3DEFAULT-NEXT: [[ARRAYIDX2_28:%.]] = getelementptr inbounds i32, i32 [[A]], i64 28
	; O3DEFAULT-NEXT: [[TMP30:%.]] = bitcast i32 [[ARRAYIDX_28]] to <4 x i32>*
	; O3DEFAULT-NEXT: [[TMP31:%.]] = load <4 x i32>, <4 x i32> [[TMP30]], align 4
	; O3DEFAULT-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[TMP31]], [[TMP3]]
	; O3DEFAULT-NEXT: [[TMP33:%.]] = bitcast i32 [[ARRAYIDX2_28]] to <4 x i32>*
	; O3DEFAULT-NEXT: store <4 x i32> [[TMP32]], <4 x i32>* [[TMP33]], align 4
	; O3DEFAULT-NEXT: [[ARRAYIDX_32:%.]] = getelementptr inbounds i32, i32 [[B]], i64 32			; O3DEFAULT-NEXT: [[ARRAYIDX_32:%.]] = getelementptr inbounds i32, i32 [[B]], i64 32
	; O3DEFAULT-NEXT: [[ARRAYIDX2_32:%.]] = getelementptr inbounds i32, i32 [[A]], i64 32			; O3DEFAULT-NEXT: [[ARRAYIDX2_32:%.]] = getelementptr inbounds i32, i32 [[A]], i64 32
	; O3DEFAULT-NEXT: [[TMP34:%.]] = bitcast i32 [[ARRAYIDX_32]] to <4 x i32>*			; O3DEFAULT-NEXT: [[TMP6:%.]] = bitcast i32 [[ARRAYIDX_32]] to <16 x i32>*
	; O3DEFAULT-NEXT: [[TMP35:%.]] = load <4 x i32>, <4 x i32> [[TMP34]], align 4			; O3DEFAULT-NEXT: [[TMP7:%.]] = load <16 x i32>, <16 x i32> [[TMP6]], align 4
	; O3DEFAULT-NEXT: [[TMP36:%.*]] = add nsw <4 x i32> [[TMP35]], [[TMP3]]			; O3DEFAULT-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> undef, i32 [[N]], i32 0
	; O3DEFAULT-NEXT: [[TMP37:%.]] = bitcast i32 [[ARRAYIDX2_32]] to <4 x i32>*			; O3DEFAULT-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[TMP8]], <16 x i32> undef, <16 x i32> zeroinitializer
	; O3DEFAULT-NEXT: store <4 x i32> [[TMP36]], <4 x i32>* [[TMP37]], align 4			; O3DEFAULT-NEXT: [[TMP10:%.*]] = add nsw <16 x i32> [[TMP7]], [[TMP9]]
	; O3DEFAULT-NEXT: [[ARRAYIDX_36:%.]] = getelementptr inbounds i32, i32 [[B]], i64 36			; O3DEFAULT-NEXT: [[TMP11:%.]] = bitcast i32 [[ARRAYIDX2_32]] to <16 x i32>*
	; O3DEFAULT-NEXT: [[ARRAYIDX2_36:%.]] = getelementptr inbounds i32, i32 [[A]], i64 36			; O3DEFAULT-NEXT: store <16 x i32> [[TMP10]], <16 x i32>* [[TMP11]], align 4
	; O3DEFAULT-NEXT: [[TMP38:%.]] = bitcast i32 [[ARRAYIDX_36]] to <4 x i32>*			; O3DEFAULT-NEXT: [[TMP12:%.]] = load i32, i32 [[A]], align 4
	; O3DEFAULT-NEXT: [[TMP39:%.]] = load <4 x i32>, <4 x i32> [[TMP38]], align 4			; O3DEFAULT-NEXT: ret i32 [[TMP12]]
	; O3DEFAULT-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[TMP39]], [[TMP3]]
	; O3DEFAULT-NEXT: [[TMP41:%.]] = bitcast i32 [[ARRAYIDX2_36]] to <4 x i32>*
	; O3DEFAULT-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP41]], align 4
	; O3DEFAULT-NEXT: [[ARRAYIDX_40:%.]] = getelementptr inbounds i32, i32 [[B]], i64 40
	; O3DEFAULT-NEXT: [[ARRAYIDX2_40:%.]] = getelementptr inbounds i32, i32 [[A]], i64 40
	; O3DEFAULT-NEXT: [[TMP42:%.]] = bitcast i32 [[ARRAYIDX_40]] to <4 x i32>*
	; O3DEFAULT-NEXT: [[TMP43:%.]] = load <4 x i32>, <4 x i32> [[TMP42]], align 4
	; O3DEFAULT-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[TMP43]], [[TMP3]]
	; O3DEFAULT-NEXT: [[TMP45:%.]] = bitcast i32 [[ARRAYIDX2_40]] to <4 x i32>*
	; O3DEFAULT-NEXT: store <4 x i32> [[TMP44]], <4 x i32>* [[TMP45]], align 4
	; O3DEFAULT-NEXT: [[ARRAYIDX_44:%.]] = getelementptr inbounds i32, i32 [[B]], i64 44
	; O3DEFAULT-NEXT: [[ARRAYIDX2_44:%.]] = getelementptr inbounds i32, i32 [[A]], i64 44
	; O3DEFAULT-NEXT: [[TMP46:%.]] = bitcast i32 [[ARRAYIDX_44]] to <4 x i32>*
	; O3DEFAULT-NEXT: [[TMP47:%.]] = load <4 x i32>, <4 x i32> [[TMP46]], align 4
	; O3DEFAULT-NEXT: [[TMP48:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP3]]
	; O3DEFAULT-NEXT: [[TMP49:%.]] = bitcast i32 [[ARRAYIDX2_44]] to <4 x i32>*
	; O3DEFAULT-NEXT: store <4 x i32> [[TMP48]], <4 x i32>* [[TMP49]], align 4
	; O3DEFAULT-NEXT: [[TMP50:%.]] = load i32, i32 [[A]], align 4
	; O3DEFAULT-NEXT: ret i32 [[TMP50]]
	;			;
	; Os-LABEL: @disabled(			; Os-LABEL: @disabled(
	; Os-NEXT: entry:			; Os-NEXT: entry:
	; Os-NEXT: br label [[FOR_BODY:%.*]]			; Os-NEXT: br label [[FOR_BODY:%.*]]
	; Os: for.body:			; Os: for.body:
	; Os-NEXT: [[INDVARS_IV:%.]] = phi i64 [ 0, [[ENTRY:%.]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]			; Os-NEXT: [[INDVARS_IV:%.]] = phi i64 [ 0, [[ENTRY:%.]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
	; Os-NEXT: [[ARRAYIDX:%.]] = getelementptr inbounds i32, i32 [[B:%.*]], i64 [[INDVARS_IV]]			; Os-NEXT: [[ARRAYIDX:%.]] = getelementptr inbounds i32, i32 [[B:%.*]], i64 [[INDVARS_IV]]
	; Os-NEXT: [[TMP0:%.]] = load i32, i32 [[ARRAYIDX]], align 4			; Os-NEXT: [[TMP0:%.]] = load i32, i32 [[ARRAYIDX]], align 4
	▲ Show 20 Lines • Show All 101 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/AArch64/matmul.ll

	Show All 11 Lines
	; CHECK-LABEL: @wrap_mul4(			; CHECK-LABEL: @wrap_mul4(
	; CHECK-NEXT: [[ARRAYIDX1_I:%.]] = getelementptr inbounds [2 x double], [2 x double] [[A:%.*]], i64 0, i64 0			; CHECK-NEXT: [[ARRAYIDX1_I:%.]] = getelementptr inbounds [2 x double], [2 x double] [[A:%.*]], i64 0, i64 0
	; CHECK-NEXT: [[TEMP:%.]] = load double, double [[ARRAYIDX1_I]], align 8			; CHECK-NEXT: [[TEMP:%.]] = load double, double [[ARRAYIDX1_I]], align 8
	; CHECK-NEXT: [[ARRAYIDX3_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B:%.*]], i64 0, i64 0			; CHECK-NEXT: [[ARRAYIDX3_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B:%.*]], i64 0, i64 0
	; CHECK-NEXT: [[ARRAYIDX5_I:%.]] = getelementptr inbounds [2 x double], [2 x double] [[A]], i64 0, i64 1			; CHECK-NEXT: [[ARRAYIDX5_I:%.]] = getelementptr inbounds [2 x double], [2 x double] [[A]], i64 0, i64 1
	; CHECK-NEXT: [[TEMP2:%.]] = load double, double [[ARRAYIDX5_I]], align 8			; CHECK-NEXT: [[TEMP2:%.]] = load double, double [[ARRAYIDX5_I]], align 8
	; CHECK-NEXT: [[ARRAYIDX7_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B]], i64 1, i64 0			; CHECK-NEXT: [[ARRAYIDX7_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B]], i64 1, i64 0
	; CHECK-NEXT: [[ARRAYIDX13_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B]], i64 0, i64 1			; CHECK-NEXT: [[ARRAYIDX13_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B]], i64 0, i64 1
	; CHECK-NEXT: [[TMP1:%.]] = bitcast double [[ARRAYIDX3_I]] to <2 x double>*
	; CHECK-NEXT: [[TMP2:%.]] = load <2 x double>, <2 x double> [[TMP1]], align 8
	; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TEMP]], i32 0
	; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TEMP]], i32 1
	; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]]
	; CHECK-NEXT: [[ARRAYIDX18_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B]], i64 1, i64 1			; CHECK-NEXT: [[ARRAYIDX18_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B]], i64 1, i64 1
	; CHECK-NEXT: [[TMP6:%.]] = bitcast double [[ARRAYIDX7_I]] to <2 x double>*
	; CHECK-NEXT: [[TMP7:%.]] = load <2 x double>, <2 x double> [[TMP6]], align 8
	; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> undef, double [[TEMP2]], i32 0
	; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[TEMP2]], i32 1
	; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP7]]
	; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP5]], [[TMP10]]
	; CHECK-NEXT: [[ARRAYIDX25_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B]], i64 0, i64 2			; CHECK-NEXT: [[ARRAYIDX25_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B]], i64 0, i64 2
	; CHECK-NEXT: [[ARRAYIDX30_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B]], i64 1, i64 2			; CHECK-NEXT: [[ARRAYIDX30_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B]], i64 1, i64 2
	; CHECK-NEXT: [[ARRAYIDX37_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B]], i64 0, i64 3			; CHECK-NEXT: [[ARRAYIDX37_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B]], i64 0, i64 3
	; CHECK-NEXT: [[TMP12:%.]] = bitcast double [[ARRAYIDX25_I]] to <2 x double>*			; CHECK-NEXT: [[TMP1:%.]] = bitcast double [[ARRAYIDX3_I]] to <4 x double>*
	; CHECK-NEXT: [[TMP13:%.]] = load <2 x double>, <2 x double> [[TMP12]], align 8			; CHECK-NEXT: [[TMP2:%.]] = load <4 x double>, <4 x double> [[TMP1]], align 8
	; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP4]], [[TMP13]]			; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> undef, double [[TEMP]], i32 0
				; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[TEMP]], i32 1
				; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[TEMP]], i32 2
				; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[TEMP]], i32 3
				; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP2]]
	; CHECK-NEXT: [[ARRAYIDX42_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B]], i64 1, i64 3			; CHECK-NEXT: [[ARRAYIDX42_I:%.]] = getelementptr inbounds [4 x double], [4 x double] [[B]], i64 1, i64 3
	; CHECK-NEXT: [[TMP15:%.]] = bitcast double [[ARRAYIDX30_I]] to <2 x double>*			; CHECK-NEXT: [[TMP8:%.]] = bitcast double [[ARRAYIDX7_I]] to <4 x double>*
	; CHECK-NEXT: [[TMP16:%.]] = load <2 x double>, <2 x double> [[TMP15]], align 8			; CHECK-NEXT: [[TMP9:%.]] = load <4 x double>, <4 x double> [[TMP8]], align 8
	; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x double> [[TMP9]], [[TMP16]]			; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x double> undef, double [[TEMP2]], i32 0
	; CHECK-NEXT: [[TMP18:%.*]] = fadd <2 x double> [[TMP14]], [[TMP17]]			; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP10]], double [[TEMP2]], i32 1
				; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x double> [[TMP11]], double [[TEMP2]], i32 2
				; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> [[TMP12]], double [[TEMP2]], i32 3
				; CHECK-NEXT: [[TMP14:%.*]] = fmul <4 x double> [[TMP13]], [[TMP9]]
				; CHECK-NEXT: [[TMP15:%.*]] = fadd <4 x double> [[TMP7]], [[TMP14]]
	; CHECK-NEXT: [[ARRAYIDX47_I:%.]] = getelementptr inbounds [2 x double], [2 x double] [[A]], i64 1, i64 0			; CHECK-NEXT: [[ARRAYIDX47_I:%.]] = getelementptr inbounds [2 x double], [2 x double] [[A]], i64 1, i64 0
	; CHECK-NEXT: [[TEMP10:%.]] = load double, double [[ARRAYIDX47_I]], align 8			; CHECK-NEXT: [[TEMP10:%.]] = load double, double [[ARRAYIDX47_I]], align 8
	; CHECK-NEXT: [[ARRAYIDX52_I:%.]] = getelementptr inbounds [2 x double], [2 x double] [[A]], i64 1, i64 1			; CHECK-NEXT: [[ARRAYIDX52_I:%.]] = getelementptr inbounds [2 x double], [2 x double] [[A]], i64 1, i64 1
	; CHECK-NEXT: [[TEMP11:%.]] = load double, double [[ARRAYIDX52_I]], align 8			; CHECK-NEXT: [[TEMP11:%.]] = load double, double [[ARRAYIDX52_I]], align 8
	; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> undef, double [[TEMP10]], i32 0			; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x double> undef, double [[TEMP10]], i32 0
	; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TEMP10]], i32 1			; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> [[TMP16]], double [[TEMP10]], i32 1
	; CHECK-NEXT: [[TMP21:%.*]] = fmul <2 x double> [[TMP2]], [[TMP20]]			; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x double> [[TMP17]], double [[TEMP10]], i32 2
	; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> undef, double [[TEMP11]], i32 0			; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x double> [[TMP18]], double [[TEMP10]], i32 3
	; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x double> [[TMP22]], double [[TEMP11]], i32 1			; CHECK-NEXT: [[TMP20:%.*]] = fmul <4 x double> [[TMP2]], [[TMP19]]
	; CHECK-NEXT: [[TMP24:%.*]] = fmul <2 x double> [[TMP7]], [[TMP23]]			; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x double> undef, double [[TEMP11]], i32 0
	; CHECK-NEXT: [[TMP25:%.*]] = fadd <2 x double> [[TMP21]], [[TMP24]]			; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x double> [[TMP21]], double [[TEMP11]], i32 1
	; CHECK-NEXT: [[TMP26:%.*]] = fmul <2 x double> [[TMP13]], [[TMP20]]			; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x double> [[TMP22]], double [[TEMP11]], i32 2
	; CHECK-NEXT: [[TMP27:%.*]] = fmul <2 x double> [[TMP16]], [[TMP23]]			; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x double> [[TMP23]], double [[TEMP11]], i32 3
	; CHECK-NEXT: [[TMP28:%.*]] = fadd <2 x double> [[TMP26]], [[TMP27]]			; CHECK-NEXT: [[TMP25:%.*]] = fmul <4 x double> [[TMP9]], [[TMP24]]
				; CHECK-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[TMP20]], [[TMP25]]
	; CHECK-NEXT: [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.]] = getelementptr inbounds double, double [[OUT:%.*]], i64 1			; CHECK-NEXT: [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.]] = getelementptr inbounds double, double [[OUT:%.*]], i64 1
	; CHECK-NEXT: [[TMP29:%.]] = bitcast double [[OUT]] to <2 x double>*
	; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP29]], align 8
	; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.]] = getelementptr inbounds double, double [[OUT]], i64 2			; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.]] = getelementptr inbounds double, double [[OUT]], i64 2
	; CHECK-NEXT: [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6:%.]] = getelementptr inbounds double, double [[OUT]], i64 3			; CHECK-NEXT: [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6:%.]] = getelementptr inbounds double, double [[OUT]], i64 3
	; CHECK-NEXT: [[TMP30:%.]] = bitcast double [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]] to <2 x double>*			; CHECK-NEXT: [[TMP27:%.]] = bitcast double [[OUT]] to <4 x double>*
	; CHECK-NEXT: store <2 x double> [[TMP18]], <2 x double>* [[TMP30]], align 8			; CHECK-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[TMP27]], align 8
	; CHECK-NEXT: [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.]] = getelementptr inbounds double, double [[OUT]], i64 4			; CHECK-NEXT: [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.]] = getelementptr inbounds double, double [[OUT]], i64 4
	; CHECK-NEXT: [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10:%.]] = getelementptr inbounds double, double [[OUT]], i64 5			; CHECK-NEXT: [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10:%.]] = getelementptr inbounds double, double [[OUT]], i64 5
	; CHECK-NEXT: [[TMP31:%.]] = bitcast double [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]] to <2 x double>*
	; CHECK-NEXT: store <2 x double> [[TMP25]], <2 x double>* [[TMP31]], align 8
	; CHECK-NEXT: [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.]] = getelementptr inbounds double, double [[OUT]], i64 6			; CHECK-NEXT: [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.]] = getelementptr inbounds double, double [[OUT]], i64 6
	; CHECK-NEXT: [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14:%.]] = getelementptr inbounds double, double [[OUT]], i64 7			; CHECK-NEXT: [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14:%.]] = getelementptr inbounds double, double [[OUT]], i64 7
	; CHECK-NEXT: [[TMP32:%.]] = bitcast double [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]] to <2 x double>*			; CHECK-NEXT: [[TMP28:%.]] = bitcast double [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]] to <4 x double>*
	; CHECK-NEXT: store <2 x double> [[TMP28]], <2 x double>* [[TMP32]], align 8			; CHECK-NEXT: store <4 x double> [[TMP26]], <4 x double>* [[TMP28]], align 8
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	%arrayidx1.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 0, i64 0			%arrayidx1.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 0, i64 0
	%temp = load double, double* %arrayidx1.i, align 8			%temp = load double, double* %arrayidx1.i, align 8
	%arrayidx3.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 0, i64 0			%arrayidx3.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 0, i64 0
	%temp1 = load double, double* %arrayidx3.i, align 8			%temp1 = load double, double* %arrayidx3.i, align 8
	%mul.i = fmul double %temp, %temp1			%mul.i = fmul double %temp, %temp1
	%arrayidx5.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 0, i64 1			%arrayidx5.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 0, i64 1
	▲ Show 20 Lines • Show All 60 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll

Show First 20 Lines • Show All 56 Lines • ▼ Show 20 Lines
; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8		; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; SLM-LABEL: @add_v8i64(		; SLM-LABEL: @add_v8i64(
; SLM-NEXT: [[A0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8		; SLM-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; SLM-NEXT: [[A1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8		; SLM-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; SLM-NEXT: [[A2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8		; SLM-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; SLM-NEXT: [[A3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8		; SLM-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
		RKSimon (inline comment, Not Done): This change is surprising, given how slow v2i64 ADD/SUB are on SLM.
		ABataev (author, inline comment, Done): I believe this is a problem with the cost model. Most probably, it will be changed with D42981.
; SLM-NEXT: [[A4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
; SLM-NEXT: [[A5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
; SLM-NEXT: [[A6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
; SLM-NEXT: [[A7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
; SLM-NEXT: [[B0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
; SLM-NEXT: [[B1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
; SLM-NEXT: [[B2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
; SLM-NEXT: [[B3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
; SLM-NEXT: [[B4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
; SLM-NEXT: [[B5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
; SLM-NEXT: [[B6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
; SLM-NEXT: [[B7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]])
; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]])
; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]])
; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]])
; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]])
; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]])
; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]])
; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]])
; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; SLM-NEXT: ret void		; SLM-NEXT: ret void
;		;
; AVX1-LABEL: @add_v8i64(		; AVX1-LABEL: @add_v8i64(
; AVX1-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])		; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])		; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])		; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])		; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; AVX1-NEXT: ret void		; AVX1-NEXT: ret void
;		;
; AVX2-LABEL: @add_v8i64(		; AVX2-LABEL: @add_v8i64(
; AVX2-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; AVX2-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX2-NEXT: ret void		; AVX2-NEXT: ret void
;		;
; AVX512-LABEL: @add_v8i64(		; AVX512-LABEL: @add_v8i64(
; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])		; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8		; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; AVX256BW-LABEL: @add_v8i64(		; AVX256BW-LABEL: @add_v8i64(
; AVX256BW-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; AVX256BW-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX256BW-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX256BW-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX256BW-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; AVX256BW-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; AVX256BW-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX256BW-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX256BW-NEXT: ret void		; AVX256BW-NEXT: ret void
;		;
%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8		%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8		%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8		%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8		%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8		%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8		%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
Show All 22 Lines	;
store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
ret void		ret void
}		}

define void @add_v16i32() {		define void @add_v16i32() {
; SSE-LABEL: @add_v16i32(		; CHECK-LABEL: @add_v16i32(
; SSE-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP3:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
; SSE-NEXT: [[TMP4:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4		; CHECK-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP5:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP7:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP8:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: ret void
;
; SLM-LABEL: @add_v16i32(
; SLM-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP3:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP4:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP5:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP6:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP7:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP8:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: ret void
;
; AVX-LABEL: @add_v16i32(
; AVX-NEXT: [[TMP1:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP2:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP3:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP4:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: ret void
;
; AVX512-LABEL: @add_v16i32(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX512-NEXT: ret void
;		;
%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4		%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4		%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4		%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4		%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4		%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4		%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4		%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	;
store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4		store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4		store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4		store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4		store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
ret void		ret void
}		}

define void @add_v32i16() {		define void @add_v32i16() {
; SSE-LABEL: @add_v32i16(		; CHECK-LABEL: @add_v32i16(
; SSE-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP3:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
; SSE-NEXT: [[TMP4:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2		; CHECK-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP5:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP7:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP8:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; SLM-LABEL: @add_v32i16(
; SLM-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP3:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP4:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP5:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP6:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP7:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP8:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: ret void
;
; AVX-LABEL: @add_v32i16(
; AVX-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP3:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP4:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: ret void
;
; AVX512-LABEL: @add_v32i16(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
; AVX512-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: ret void
;		;
%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2		%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2		%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2		%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2		%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2		%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2		%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2		%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
▲ Show 20 Lines • Show All 118 Lines • ▼ Show 20 Lines	;
store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2		store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2		store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2		store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
ret void		ret void
}		}

define void @add_v64i8() {		define void @add_v64i8() {
; CHECK-LABEL: @add_v64i8(		; CHECK-LABEL: @add_v64i8(
; CHECK-NEXT: [[TMP1:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP3:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
; CHECK-NEXT: [[TMP4:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1		; CHECK-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1		%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1		%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1		%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1		%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1		%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1		%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
▲ Show 20 Lines • Show All 252 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/arith-add-usat.ll

Show First 20 Lines • Show All 56 Lines • ▼ Show 20 Lines
; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8		; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; SLM-LABEL: @add_v8i64(		; SLM-LABEL: @add_v8i64(
; SLM-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8		; SLM-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; SLM-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8		; SLM-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; SLM-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8		; SLM-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; SLM-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8		; SLM-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: ret void		; SLM-NEXT: ret void
;		;
; AVX-LABEL: @add_v8i64(		; AVX-LABEL: @add_v8i64(
; AVX-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; AVX-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; AVX-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; AVX-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX-NEXT: ret void		; AVX-NEXT: ret void
;		;
; AVX512-LABEL: @add_v8i64(		; AVX512-LABEL: @add_v8i64(
; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])		; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8		; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
Show All 29 Lines	;
store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
ret void		ret void
}		}

define void @add_v16i32() {		define void @add_v16i32() {
; SSE-LABEL: @add_v16i32(		; CHECK-LABEL: @add_v16i32(
; SSE-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP3:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
; SSE-NEXT: [[TMP4:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4		; CHECK-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP5:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: ret void
;
; SLM-LABEL: @add_v16i32(
; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: ret void
;
; AVX-LABEL: @add_v16i32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: ret void
;
; AVX512-LABEL: @add_v16i32(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX512-NEXT: ret void
;		;
%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4		%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4		%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4		%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4		%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4		%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4		%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4		%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	;
store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4		store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4		store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4		store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4		store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
ret void		ret void
}		}

define void @add_v32i16() {		define void @add_v32i16() {
; SSE-LABEL: @add_v32i16(		; CHECK-LABEL: @add_v32i16(
; SSE-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP3:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
; SSE-NEXT: [[TMP4:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2		; CHECK-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP5:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; SLM-LABEL: @add_v32i16(
; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: ret void
;
; AVX-LABEL: @add_v32i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: ret void
;
; AVX512-LABEL: @add_v32i16(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
; AVX512-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: ret void
;		;
%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2		%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2		%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2		%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2		%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2		%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2		%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2		%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
▲ Show 20 Lines • Show All 118 Lines • ▼ Show 20 Lines	;
store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2		store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2		store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2		store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
ret void		ret void
}		}

define void @add_v64i8() {		define void @add_v64i8() {
; CHECK-LABEL: @add_v64i8(		; CHECK-LABEL: @add_v64i8(
; CHECK-NEXT: [[TMP1:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP3:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
; CHECK-NEXT: [[TMP4:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1		; CHECK-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1		%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1		%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1		%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1		%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1		%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1		%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
▲ Show 20 Lines • Show All 252 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/arith-add.ll

Show All 17 Lines
@a16 = common global [32 x i16] zeroinitializer, align 64		@a16 = common global [32 x i16] zeroinitializer, align 64
@b16 = common global [32 x i16] zeroinitializer, align 64		@b16 = common global [32 x i16] zeroinitializer, align 64
@c16 = common global [32 x i16] zeroinitializer, align 64		@c16 = common global [32 x i16] zeroinitializer, align 64
@a8 = common global [64 x i8] zeroinitializer, align 64		@a8 = common global [64 x i8] zeroinitializer, align 64
@b8 = common global [64 x i8] zeroinitializer, align 64		@b8 = common global [64 x i8] zeroinitializer, align 64
@c8 = common global [64 x i8] zeroinitializer, align 64		@c8 = common global [64 x i8] zeroinitializer, align 64

define void @add_v8i64() {		define void @add_v8i64() {
; SSE-LABEL: @add_v8i64(		; CHECK-LABEL: @add_v8i64(
; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8		; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8		; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8		; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i64> [[TMP1]], [[TMP2]]
; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8		; CHECK-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
; SSE-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]]
; SSE-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]]
; SSE-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]]
; SSE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]]
; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; SSE-NEXT: ret void
;
; SLM-LABEL: @add_v8i64(
; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: ret void
;
; AVX-LABEL: @add_v8i64(
; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP2]], [[TMP4]]
; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX-NEXT: ret void
;
; AVX512-LABEL: @add_v8i64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i64> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void
;		;
%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8		%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8		%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8		%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8		%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8		%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8		%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
%a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8		%a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
Show All 21 Lines	;
store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
ret void		ret void
}		}

define void @add_v16i32() {		define void @add_v16i32() {
; SSE-LABEL: @add_v16i32(		; CHECK-LABEL: @add_v16i32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[TMP1]], [[TMP2]]
; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4		; CHECK-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP1]], [[TMP5]]
; SSE-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP2]], [[TMP6]]
; SSE-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP3]], [[TMP7]]
; SSE-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: ret void
;
; SLM-LABEL: @add_v16i32(
; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: ret void
;
; AVX-LABEL: @add_v16i32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP2]], [[TMP4]]
; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: ret void
;
; AVX512-LABEL: @add_v16i32(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP3:%.*]] = add <16 x i32> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX512-NEXT: ret void
;		;
%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4		%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4		%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4		%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4		%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4		%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4		%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4		%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	;
store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4		store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4		store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4		store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4		store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
ret void		ret void
}		}

define void @add_v32i16() {		define void @add_v32i16() {
; SSE-LABEL: @add_v32i16(		; CHECK-LABEL: @add_v32i16(
; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP3:%.*]] = add <32 x i16> [[TMP1]], [[TMP2]]
; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2		; CHECK-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP1]], [[TMP5]]
; SSE-NEXT: [[TMP10:%.*]] = add <8 x i16> [[TMP2]], [[TMP6]]
; SSE-NEXT: [[TMP11:%.*]] = add <8 x i16> [[TMP3]], [[TMP7]]
; SSE-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP4]], [[TMP8]]
; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; SLM-LABEL: @add_v32i16(
; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = add <8 x i16> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = add <8 x i16> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: ret void
;
; AVX-LABEL: @add_v32i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP5:%.*]] = add <16 x i16> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP2]], [[TMP4]]
; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: ret void
;
; AVX512-LABEL: @add_v32i16(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP5:%.*]] = add <16 x i16> [[TMP1]], [[TMP3]]
; AVX512-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP2]], [[TMP4]]
; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: ret void
;		;
%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2		%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2		%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2		%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2		%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2		%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2		%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2		%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
▲ Show 20 Lines • Show All 118 Lines • ▼ Show 20 Lines	;
store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2		store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2		store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2		store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
ret void		ret void
}		}

define void @add_v64i8() {		define void @add_v64i8() {
; CHECK-LABEL: @add_v64i8(		; CHECK-LABEL: @add_v64i8(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP3:%.*]] = add <64 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1		; CHECK-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP1]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i8> [[TMP2]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = add <16 x i8> [[TMP3]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP4]], [[TMP8]]
; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1		%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1		%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1		%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1		%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1		%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1		%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
▲ Show 20 Lines • Show All 252 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/arith-fix.ll

Show First 20 Lines • Show All 78 Lines • ▼ Show 20 Lines
; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)		; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)
; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; AVX1-NEXT: ret void		; AVX1-NEXT: ret void
;		;
; AVX2-LABEL: @smul_v8i64(		; AVX2-LABEL: @smul_v8i64(
; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], i32 3)
; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3)
; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3)
; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX2-NEXT: ret void		; AVX2-NEXT: ret void
;		;
; AVX512-LABEL: @smul_v8i64(		; AVX512-LABEL: @smul_v8i64(
; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], i32 3)		; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], i32 3)
; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8		; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; AVX256BW-LABEL: @smul_v8i64(		; AVX256BW-LABEL: @smul_v8i64(
; AVX256BW-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; AVX256BW-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX256BW-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX256BW-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX256BW-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; AVX256BW-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], i32 3)
; AVX256BW-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX256BW-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3)
; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3)
; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX256BW-NEXT: ret void		; AVX256BW-NEXT: ret void
;		;
%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8		%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8		%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8		%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8		%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8		%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8		%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
▲ Show 20 Lines • Show All 224 Lines • ▼ Show 20 Lines
; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4		; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4		; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4		; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4		; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4		; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
; AVX1-NEXT: ret void		; AVX1-NEXT: ret void
;		;
; AVX2-LABEL: @smul_v16i32(		; AVX2-LABEL: @smul_v16i32(
; AVX2-NEXT: [[TMP1:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4		; AVX2-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX2-NEXT: [[TMP2:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4		; AVX2-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX2-NEXT: [[TMP3:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4		; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], i32 3)
; AVX2-NEXT: [[TMP4:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4		; AVX2-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX2-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3)
; AVX2-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3)
; AVX2-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; AVX2-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX2-NEXT: ret void		; AVX2-NEXT: ret void
;		;
; AVX512-LABEL: @smul_v16i32(		; AVX512-LABEL: @smul_v16i32(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4		; AVX512-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4		; AVX512-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], i32 3)		; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], i32 3)
; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4		; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; AVX256BW-LABEL: @smul_v16i32(		; AVX256BW-LABEL: @smul_v16i32(
; AVX256BW-NEXT: [[TMP1:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4		; AVX256BW-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX256BW-NEXT: [[TMP2:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4		; AVX256BW-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX256BW-NEXT: [[TMP3:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4		; AVX256BW-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], i32 3)
; AVX256BW-NEXT: [[TMP4:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4		; AVX256BW-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX256BW-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3)
; AVX256BW-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3)
; AVX256BW-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; AVX256BW-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX256BW-NEXT: ret void		; AVX256BW-NEXT: ret void
;		;
%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4		%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4		%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4		%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4		%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4		%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4		%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	;
store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4		store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4		store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4		store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4		store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
ret void		ret void
}		}

define void @smul_v32i16() {		define void @smul_v32i16() {
; SSE-LABEL: @smul_v32i16(		; CHECK-LABEL: @smul_v32i16(
; SSE-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP3:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]], i32 3)
; SSE-NEXT: [[TMP4:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2		; CHECK-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP5:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP7:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP8:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3)
; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3)
; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3)
; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3)
; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; SLM-LABEL: @smul_v32i16(
; SLM-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP3:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP4:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP5:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP6:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP7:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP8:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3)
; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3)
; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3)
; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3)
; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: ret void
;
; AVX-LABEL: @smul_v32i16(
; AVX-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP3:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP4:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]], i32 3)
; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]], i32 3)
; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: ret void
;
; AVX512-LABEL: @smul_v32i16(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP4:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]], i32 3)
; AVX512-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]], i32 3)
; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: ret void
;		;
%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2		%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2		%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2		%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2		%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2		%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2		%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2		%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
▲ Show 20 Lines • Show All 118 Lines • ▼ Show 20 Lines	;
store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2		store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2		store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2		store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
ret void		ret void
}		}

define void @smul_v64i8() {		define void @smul_v64i8() {
; CHECK-LABEL: @smul_v64i8(		; CHECK-LABEL: @smul_v64i8(
; CHECK-NEXT: [[TMP1:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP3:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]], i32 3)
; CHECK-NEXT: [[TMP4:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1		; CHECK-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP5:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP6:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP7:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP8:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3)
; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3)
; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3)
; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3)
; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1		%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1		%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1		%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1		%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1		%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1		%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
▲ Show 20 Lines • Show All 309 Lines • ▼ Show 20 Lines
; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)		; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)
; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; AVX1-NEXT: ret void		; AVX1-NEXT: ret void
;		;
; AVX2-LABEL: @umul_v8i64(		; AVX2-LABEL: @umul_v8i64(
; AVX2-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], i32 3)
; AVX2-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3)
; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3)
; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX2-NEXT: ret void		; AVX2-NEXT: ret void
;		;
; AVX512-LABEL: @umul_v8i64(		; AVX512-LABEL: @umul_v8i64(
; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], i32 3)		; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], i32 3)
; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8		; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; AVX256BW-LABEL: @umul_v8i64(		; AVX256BW-LABEL: @umul_v8i64(
; AVX256BW-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; AVX256BW-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX256BW-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX256BW-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX256BW-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; AVX256BW-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], i32 3)
; AVX256BW-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX256BW-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3)
; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3)
; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX256BW-NEXT: ret void		; AVX256BW-NEXT: ret void
;		;
%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8		%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8		%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8		%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8		%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8		%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8		%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
▲ Show 20 Lines • Show All 224 Lines • ▼ Show 20 Lines
; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4		; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4		; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4		; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4		; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4		; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
; AVX1-NEXT: ret void		; AVX1-NEXT: ret void
;		;
; AVX2-LABEL: @umul_v16i32(		; AVX2-LABEL: @umul_v16i32(
; AVX2-NEXT: [[TMP1:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4		; AVX2-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX2-NEXT: [[TMP2:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4		; AVX2-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX2-NEXT: [[TMP3:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4		; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], i32 3)
; AVX2-NEXT: [[TMP4:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4		; AVX2-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX2-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3)
; AVX2-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3)
; AVX2-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; AVX2-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX2-NEXT: ret void		; AVX2-NEXT: ret void
;		;
; AVX512-LABEL: @umul_v16i32(		; AVX512-LABEL: @umul_v16i32(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4		; AVX512-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4		; AVX512-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], i32 3)		; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], i32 3)
; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4		; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; AVX256BW-LABEL: @umul_v16i32(		; AVX256BW-LABEL: @umul_v16i32(
; AVX256BW-NEXT: [[TMP1:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4		; AVX256BW-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX256BW-NEXT: [[TMP2:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4		; AVX256BW-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX256BW-NEXT: [[TMP3:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4		; AVX256BW-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], i32 3)
; AVX256BW-NEXT: [[TMP4:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4		; AVX256BW-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX256BW-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3)
; AVX256BW-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3)
; AVX256BW-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; AVX256BW-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX256BW-NEXT: ret void		; AVX256BW-NEXT: ret void
;		;
%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4		%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4		%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4		%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4		%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4		%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4		%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	;
store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4		store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4		store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4		store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4		store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
ret void		ret void
}		}

define void @umul_v32i16() {		define void @umul_v32i16() {
; SSE-LABEL: @umul_v32i16(		; CHECK-LABEL: @umul_v32i16(
; SSE-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP3:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]], i32 3)
; SSE-NEXT: [[TMP4:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2		; CHECK-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP5:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP7:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP8:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3)
; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3)
; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3)
; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3)
; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; SLM-LABEL: @umul_v32i16(
; SLM-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP3:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP4:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP5:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP6:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP7:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP8:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3)
; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3)
; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3)
; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3)
; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: ret void
;
; AVX-LABEL: @umul_v32i16(
; AVX-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP3:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP4:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]], i32 3)
; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]], i32 3)
; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: ret void
;
; AVX512-LABEL: @umul_v32i16(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP4:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]], i32 3)
; AVX512-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]], i32 3)
; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: ret void
;		;
%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2		%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2		%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2		%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2		%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2		%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2		%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2		%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
▲ Show 20 Lines • Show All 118 Lines • ▼ Show 20 Lines	;
store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2		store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2		store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2		store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
ret void		ret void
}		}

define void @umul_v64i8() {		define void @umul_v64i8() {
; CHECK-LABEL: @umul_v64i8(		; CHECK-LABEL: @umul_v64i8(
; CHECK-NEXT: [[TMP1:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP3:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]], i32 3)
; CHECK-NEXT: [[TMP4:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1		; CHECK-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP5:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP6:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP7:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP8:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3)
; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3)
; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3)
; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3)
; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1		%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1		%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1		%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1		%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1		%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1		%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
▲ Show 20 Lines • Show All 252 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/arith-mul.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=SSE		; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=SLM		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=SLM
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -mattr=-prefer-128-bit -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -mattr=-prefer-128-bit -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -mattr=+prefer-128-bit -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=SSE		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -mattr=+prefer-128-bit -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=-prefer-128-bit -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=-prefer-128-bit -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=+prefer-128-bit -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=SSE		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=+prefer-128-bit -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2		; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2

@a64 = common global [8 x i64] zeroinitializer, align 64		@a64 = common global [8 x i64] zeroinitializer, align 64
@b64 = common global [8 x i64] zeroinitializer, align 64		@b64 = common global [8 x i64] zeroinitializer, align 64
@c64 = common global [8 x i64] zeroinitializer, align 64		@c64 = common global [8 x i64] zeroinitializer, align 64
@a32 = common global [16 x i32] zeroinitializer, align 64		@a32 = common global [16 x i32] zeroinitializer, align 64
▲ Show 20 Lines • Show All 108 Lines • ▼ Show 20 Lines
; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8		; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; AVX1-NEXT: ret void		; AVX1-NEXT: ret void
;		;
; AVX2-LABEL: @mul_v8i64(		; AVX2-LABEL: @mul_v8i64(
; AVX2-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP3:%.*]] = mul <8 x i64> [[TMP1]], [[TMP2]]
; AVX2-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP1]], [[TMP3]]
; AVX2-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[TMP2]], [[TMP4]]
; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX2-NEXT: ret void		; AVX2-NEXT: ret void
;		;
; AVX512-LABEL: @mul_v8i64(		; AVX512-LABEL: @mul_v8i64(
; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP3:%.*]] = mul <8 x i64> [[TMP1]], [[TMP2]]		; AVX512-NEXT: [[TMP3:%.*]] = mul <8 x i64> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8		; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
Show All 29 Lines	;
store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
ret void		ret void
}		}

define void @mul_v16i32() {		define void @mul_v16i32() {
; SSE-LABEL: @mul_v16i32(		; CHECK-LABEL: @mul_v16i32(
; SSE-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP3:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP3:%.*]] = mul <16 x i32> [[TMP1]], [[TMP2]]
; SSE-NEXT: [[TMP4:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4		; CHECK-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP5:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP7:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP8:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]]
; SSE-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]]
; SSE-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]]
; SSE-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]]
; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: ret void
;
; SLM-LABEL: @mul_v16i32(
; SLM-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP3:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP4:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP5:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP6:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP7:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP8:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: ret void
;
; AVX-LABEL: @mul_v16i32(
; AVX-NEXT: [[TMP1:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP2:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP3:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP4:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP5:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP6:%.*]] = mul <8 x i32> [[TMP2]], [[TMP4]]
; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: ret void
;
; AVX512-LABEL: @mul_v16i32(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP3:%.*]] = mul <16 x i32> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX512-NEXT: ret void
;		;
%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4		%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4		%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4		%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4		%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4		%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4		%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4		%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	;
store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4		store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4		store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4		store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4		store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
ret void		ret void
}		}

define void @mul_v32i16() {		define void @mul_v32i16() {
; SSE-LABEL: @mul_v32i16(		; CHECK-LABEL: @mul_v32i16(
; SSE-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP3:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP3:%.*]] = mul <32 x i16> [[TMP1]], [[TMP2]]
; SSE-NEXT: [[TMP4:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2		; CHECK-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP5:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP7:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP8:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]]
; SSE-NEXT: [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]]
; SSE-NEXT: [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]]
; SSE-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]]
; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; SLM-LABEL: @mul_v32i16(
; SLM-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP3:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP4:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP5:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP6:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP7:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP8:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: ret void
;
; AVX-LABEL: @mul_v32i16(
; AVX-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP3:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP4:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP6:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]]
; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: ret void
;
; AVX512-LABEL: @mul_v32i16(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP4:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]]
; AVX512-NEXT: [[TMP6:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]]
; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: ret void
;		;
%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2		%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2		%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2		%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2		%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2		%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2		%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2		%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
▲ Show 20 Lines • Show All 118 Lines • ▼ Show 20 Lines	;
store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2		store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2		store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2		store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
ret void		ret void
}		}

define void @mul_v64i8() {		define void @mul_v64i8() {
; CHECK-LABEL: @mul_v64i8(		; CHECK-LABEL: @mul_v64i8(
; CHECK-NEXT: [[TMP1:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP3:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP3:%.*]] = mul <64 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1		; CHECK-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP5:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP6:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP7:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP8:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP1]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i8> [[TMP2]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i8> [[TMP3]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP4]], [[TMP8]]
; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1		%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1		%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1		%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1		%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1		%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1		%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
▲ Show 20 Lines • Show All 252 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll

Show First 20 Lines • Show All 56 Lines • ▼ Show 20 Lines
; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8		; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; SLM-LABEL: @sub_v8i64(		; SLM-LABEL: @sub_v8i64(
; SLM-NEXT: [[A0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8		; SLM-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; SLM-NEXT: [[A1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8		; SLM-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; SLM-NEXT: [[A2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8		; SLM-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; SLM-NEXT: [[A3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8		; SLM-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; SLM-NEXT: [[A4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
; SLM-NEXT: [[A5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
; SLM-NEXT: [[A6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
; SLM-NEXT: [[A7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
; SLM-NEXT: [[B0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
; SLM-NEXT: [[B1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
; SLM-NEXT: [[B2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
; SLM-NEXT: [[B3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
; SLM-NEXT: [[B4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
; SLM-NEXT: [[B5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
; SLM-NEXT: [[B6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
; SLM-NEXT: [[B7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]])
; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]])
; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]])
; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]])
; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]])
; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]])
; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]])
; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]])
; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; SLM-NEXT: ret void		; SLM-NEXT: ret void
;		;
; AVX1-LABEL: @sub_v8i64(		; AVX1-LABEL: @sub_v8i64(
; AVX1-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])		; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])		; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])		; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])		; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; AVX1-NEXT: ret void		; AVX1-NEXT: ret void
;		;
; AVX2-LABEL: @sub_v8i64(		; AVX2-LABEL: @sub_v8i64(
; AVX2-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; AVX2-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX2-NEXT: ret void		; AVX2-NEXT: ret void
;		;
; AVX512-LABEL: @sub_v8i64(		; AVX512-LABEL: @sub_v8i64(
; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])		; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8		; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; AVX256BW-LABEL: @sub_v8i64(		; AVX256BW-LABEL: @sub_v8i64(
; AVX256BW-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; AVX256BW-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX256BW-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX256BW-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX256BW-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; AVX256BW-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; AVX256BW-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX256BW-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX256BW-NEXT: ret void		; AVX256BW-NEXT: ret void
;		;
%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8		%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8		%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8		%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8		%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8		%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8		%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
Show All 22 Lines	;
store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
ret void		ret void
}		}

define void @sub_v16i32() {		define void @sub_v16i32() {
; SSE-LABEL: @sub_v16i32(		; CHECK-LABEL: @sub_v16i32(
; SSE-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP3:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
; SSE-NEXT: [[TMP4:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4		; CHECK-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP5:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP7:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP8:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: ret void
;
; SLM-LABEL: @sub_v16i32(
; SLM-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP3:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP4:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP5:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP6:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP7:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP8:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: ret void
;
; AVX-LABEL: @sub_v16i32(
; AVX-NEXT: [[TMP1:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP2:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP3:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP4:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: ret void
;
; AVX512-LABEL: @sub_v16i32(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX512-NEXT: ret void
;		;
%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4		%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4		%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4		%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4		%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4		%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4		%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4		%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	;
store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4		store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4		store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4		store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4		store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
ret void		ret void
}		}

define void @sub_v32i16() {		define void @sub_v32i16() {
; SSE-LABEL: @sub_v32i16(		; CHECK-LABEL: @sub_v32i16(
; SSE-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP3:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
; SSE-NEXT: [[TMP4:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2		; CHECK-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP5:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP7:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP8:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; SLM-LABEL: @sub_v32i16(
; SLM-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP3:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP4:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP5:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP6:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP7:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP8:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: ret void
;
; AVX-LABEL: @sub_v32i16(
; AVX-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP3:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP4:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: ret void
;
; AVX512-LABEL: @sub_v32i16(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP4:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
; AVX512-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: ret void
;		;
%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2		%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2		%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2		%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2		%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2		%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2		%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2		%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
▲ Show 20 Lines • Show All 118 Lines • ▼ Show 20 Lines	;
store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2		store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2		store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2		store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
ret void		ret void
}		}

define void @sub_v64i8() {		define void @sub_v64i8() {
; CHECK-LABEL: @sub_v64i8(		; CHECK-LABEL: @sub_v64i8(
; CHECK-NEXT: [[TMP1:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP3:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
; CHECK-NEXT: [[TMP4:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1		; CHECK-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP5:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP6:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP7:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP8:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1		%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1		%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1		%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1		%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1		%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1		%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
▲ Show 20 Lines • Show All 252 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll

Show First 20 Lines • Show All 56 Lines • ▼ Show 20 Lines
; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8		; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; SLM-LABEL: @sub_v8i64(		; SLM-LABEL: @sub_v8i64(
; SLM-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8		; SLM-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; SLM-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8		; SLM-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; SLM-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8		; SLM-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; SLM-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8		; SLM-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; SLM-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
; SLM-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: ret void		; SLM-NEXT: ret void
;		;
; AVX-LABEL: @sub_v8i64(		; AVX-LABEL: @sub_v8i64(
; AVX-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; AVX-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; AVX-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; AVX-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX-NEXT: ret void		; AVX-NEXT: ret void
;		;
; AVX512-LABEL: @sub_v8i64(		; AVX512-LABEL: @sub_v8i64(
; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])		; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8		; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
Show All 29 Lines	;
store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
ret void		ret void
}		}

define void @sub_v16i32() {		define void @sub_v16i32() {
; SSE-LABEL: @sub_v16i32(		; CHECK-LABEL: @sub_v16i32(
; SSE-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP3:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
; SSE-NEXT: [[TMP4:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4		; CHECK-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP5:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP7:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP8:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: ret void
;
; SLM-LABEL: @sub_v16i32(
; SLM-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP3:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP4:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP5:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP6:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP7:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP8:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: ret void
;
; AVX-LABEL: @sub_v16i32(
; AVX-NEXT: [[TMP1:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP2:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP3:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP4:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: ret void
;
; AVX512-LABEL: @sub_v16i32(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX512-NEXT: ret void
;		;
%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4		%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4		%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4		%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4		%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4		%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4		%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4		%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	;
store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4		store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4		store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4		store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4		store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
ret void		ret void
}		}

define void @sub_v32i16() {		define void @sub_v32i16() {
; SSE-LABEL: @sub_v32i16(		; CHECK-LABEL: @sub_v32i16(
; SSE-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP3:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
; SSE-NEXT: [[TMP4:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2		; CHECK-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP5:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP7:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP8:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; SLM-LABEL: @sub_v32i16(
; SLM-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP3:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP4:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP5:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP6:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP7:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP8:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: ret void
;
; AVX-LABEL: @sub_v32i16(
; AVX-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP3:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP4:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: ret void
;
; AVX512-LABEL: @sub_v32i16(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP4:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
; AVX512-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: ret void
;		;
%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2		%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2		%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2		%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2		%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2		%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2		%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2		%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
▲ Show 20 Lines • Show All 118 Lines • ▼ Show 20 Lines	;
store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2		store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2		store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2		store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
ret void		ret void
}		}

define void @sub_v64i8() {		define void @sub_v64i8() {
; CHECK-LABEL: @sub_v64i8(		; CHECK-LABEL: @sub_v64i8(
; CHECK-NEXT: [[TMP1:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP3:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
; CHECK-NEXT: [[TMP4:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1		; CHECK-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP5:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP6:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP7:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP8:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1		%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1		%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1		%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1		%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1		%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1		%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
▲ Show 20 Lines • Show All 252 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/arith-sub.ll

Show All 17 Lines
@a16 = common global [32 x i16] zeroinitializer, align 64		@a16 = common global [32 x i16] zeroinitializer, align 64
@b16 = common global [32 x i16] zeroinitializer, align 64		@b16 = common global [32 x i16] zeroinitializer, align 64
@c16 = common global [32 x i16] zeroinitializer, align 64		@c16 = common global [32 x i16] zeroinitializer, align 64
@a8 = common global [64 x i8] zeroinitializer, align 64		@a8 = common global [64 x i8] zeroinitializer, align 64
@b8 = common global [64 x i8] zeroinitializer, align 64		@b8 = common global [64 x i8] zeroinitializer, align 64
@c8 = common global [64 x i8] zeroinitializer, align 64		@c8 = common global [64 x i8] zeroinitializer, align 64

define void @sub_v8i64() {		define void @sub_v8i64() {
; SSE-LABEL: @sub_v8i64(		; CHECK-LABEL: @sub_v8i64(
; SSE-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8		; CHECK-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; SSE-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8		; CHECK-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; SSE-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8		; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i64> [[TMP1]], [[TMP2]]
; SSE-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8		; CHECK-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; SSE-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
; SSE-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
; SSE-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
; SSE-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]]
; SSE-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]]
; SSE-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]]
; SSE-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]]
; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; SSE-NEXT: ret void
;
; SLM-LABEL: @sub_v8i64(
; SLM-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
; SLM-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
; SLM-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: ret void
;
; AVX-LABEL: @sub_v8i64(
; AVX-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
; AVX-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
; AVX-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX-NEXT: [[TMP5:%.*]] = sub <4 x i64> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP6:%.*]] = sub <4 x i64> [[TMP2]], [[TMP4]]
; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX-NEXT: ret void
;
; AVX512-LABEL: @sub_v8i64(
; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP3:%.*]] = sub <8 x i64> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void
;		;
%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8		%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8		%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8		%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8		%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8		%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8		%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
%a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8		%a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
Show All 21 Lines	;
store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
ret void		ret void
}		}

define void @sub_v16i32() {		define void @sub_v16i32() {
; SSE-LABEL: @sub_v16i32(		; CHECK-LABEL: @sub_v16i32(
; SSE-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP3:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i32> [[TMP1]], [[TMP2]]
; SSE-NEXT: [[TMP4:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4		; CHECK-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP5:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP7:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP8:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP1]], [[TMP5]]
; SSE-NEXT: [[TMP10:%.*]] = sub <4 x i32> [[TMP2]], [[TMP6]]
; SSE-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP3]], [[TMP7]]
; SSE-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]]
; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: ret void
;
; SLM-LABEL: @sub_v16i32(
; SLM-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP3:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP4:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP5:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP6:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP7:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP8:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = sub <4 x i32> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: ret void
;
; AVX-LABEL: @sub_v16i32(
; AVX-NEXT: [[TMP1:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP2:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP3:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP4:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP5:%.*]] = sub <8 x i32> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP6:%.*]] = sub <8 x i32> [[TMP2]], [[TMP4]]
; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: ret void
;
; AVX512-LABEL: @sub_v16i32(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP3:%.*]] = sub <16 x i32> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX512-NEXT: ret void
;		;
%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4		%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4		%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4		%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4		%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4		%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4		%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4		%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	;
store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4		store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4		store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4		store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4		store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
ret void		ret void
}		}

define void @sub_v32i16() {		define void @sub_v32i16() {
; SSE-LABEL: @sub_v32i16(		; CHECK-LABEL: @sub_v32i16(
; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP3:%.*]] = sub <32 x i16> [[TMP1]], [[TMP2]]
; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2		; CHECK-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP1]], [[TMP5]]
; SSE-NEXT: [[TMP10:%.*]] = sub <8 x i16> [[TMP2]], [[TMP6]]
; SSE-NEXT: [[TMP11:%.*]] = sub <8 x i16> [[TMP3]], [[TMP7]]
; SSE-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP4]], [[TMP8]]
; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; SLM-LABEL: @sub_v32i16(
; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = sub <8 x i16> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = sub <8 x i16> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: ret void
;
; AVX-LABEL: @sub_v32i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: [[TMP5:%.*]] = sub <16 x i16> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP2]], [[TMP4]]
; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: ret void
;
; AVX512-LABEL: @sub_v32i16(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: [[TMP5:%.*]] = sub <16 x i16> [[TMP1]], [[TMP3]]
; AVX512-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP2]], [[TMP4]]
; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: ret void
;		;
%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2		%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2		%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2		%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2		%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2		%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2		%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2		%a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
▲ Show 20 Lines • Show All 118 Lines • ▼ Show 20 Lines	;
store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2		store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2		store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2		store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
ret void		ret void
}		}

define void @sub_v64i8() {		define void @sub_v64i8() {
; CHECK-LABEL: @sub_v64i8(		; CHECK-LABEL: @sub_v64i8(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP3:%.*]] = sub <64 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1		; CHECK-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP1]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = sub <16 x i8> [[TMP2]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = sub <16 x i8> [[TMP3]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP4]], [[TMP8]]
; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1		%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1		%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1		%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1		%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1		%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1		%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
▲ Show 20 Lines • Show All 252 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/bitreverse.ll

Show All 32 Lines	;
%bitreverse0 = call i64 @llvm.bitreverse.i64(i64 %ld0)		%bitreverse0 = call i64 @llvm.bitreverse.i64(i64 %ld0)
%bitreverse1 = call i64 @llvm.bitreverse.i64(i64 %ld1)		%bitreverse1 = call i64 @llvm.bitreverse.i64(i64 %ld1)
store i64 %bitreverse0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8		store i64 %bitreverse0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
store i64 %bitreverse1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8		store i64 %bitreverse1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
ret void		ret void
}		}

define void @bitreverse_4i64() #0 {		define void @bitreverse_4i64() #0 {
; SSE-LABEL: @bitreverse_4i64(		; CHECK-LABEL: @bitreverse_4i64(
; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4		; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> [[TMP1]])
; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]])		; CHECK-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP2]])		; CHECK-NEXT: ret void
; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4
; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @bitreverse_4i64(
; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> [[TMP1]])
; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
; AVX-NEXT: ret void
;
; XOP-LABEL: @bitreverse_4i64(
; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
; XOP-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> [[TMP1]])
; XOP-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
; XOP-NEXT: ret void
;		;
%ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4		%ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
%ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4		%ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
%ld2 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4		%ld2 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
%ld3 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4		%ld3 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
%bitreverse0 = call i64 @llvm.bitreverse.i64(i64 %ld0)		%bitreverse0 = call i64 @llvm.bitreverse.i64(i64 %ld0)
%bitreverse1 = call i64 @llvm.bitreverse.i64(i64 %ld1)		%bitreverse1 = call i64 @llvm.bitreverse.i64(i64 %ld1)
%bitreverse2 = call i64 @llvm.bitreverse.i64(i64 %ld2)		%bitreverse2 = call i64 @llvm.bitreverse.i64(i64 %ld2)
Show All 23 Lines	;
store i32 %bitreverse0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4		store i32 %bitreverse0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
store i32 %bitreverse1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4		store i32 %bitreverse1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
store i32 %bitreverse2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4		store i32 %bitreverse2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
store i32 %bitreverse3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4		store i32 %bitreverse3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
ret void		ret void
}		}

define void @bitreverse_8i32() #0 {		define void @bitreverse_8i32() #0 {
; SSE-LABEL: @bitreverse_8i32(		; CHECK-LABEL: @bitreverse_8i32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2		; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2		; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> [[TMP1]])
; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]])		; CHECK-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP2]])		; CHECK-NEXT: ret void
; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
; SSE-NEXT: ret void
;
; AVX-LABEL: @bitreverse_8i32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> [[TMP1]])
; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
; AVX-NEXT: ret void
;
; XOP-LABEL: @bitreverse_8i32(
; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
; XOP-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> [[TMP1]])
; XOP-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
; XOP-NEXT: ret void
;		;
%ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2		%ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
%ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2		%ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
%ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2		%ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
%ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2		%ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
%ld4 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2		%ld4 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
%ld5 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2		%ld5 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
%ld6 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2		%ld6 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	;
store i16 %bitreverse4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2		store i16 %bitreverse4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
store i16 %bitreverse5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2		store i16 %bitreverse5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
store i16 %bitreverse6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2		store i16 %bitreverse6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
store i16 %bitreverse7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2		store i16 %bitreverse7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
ret void		ret void
}		}

define void @bitreverse_16i16() #0 {		define void @bitreverse_16i16() #0 {
; SSE-LABEL: @bitreverse_16i16(		; CHECK-LABEL: @bitreverse_16i16(
; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> [[TMP1]])
; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]])		; CHECK-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP2]])		; CHECK-NEXT: ret void
; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; AVX-LABEL: @bitreverse_16i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> [[TMP1]])
; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; AVX-NEXT: ret void
;
; XOP-LABEL: @bitreverse_16i16(
; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; XOP-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> [[TMP1]])
; XOP-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; XOP-NEXT: ret void
;		;
%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2		%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2		%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2		%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2		%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2		%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2		%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2		%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines	;
store i8 %bitreverse13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1		store i8 %bitreverse13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
store i8 %bitreverse14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1		store i8 %bitreverse14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
store i8 %bitreverse15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1		store i8 %bitreverse15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
ret void		ret void
}		}

define void @bitreverse_32i8() #0 {		define void @bitreverse_32i8() #0 {
; CHECK-LABEL: @bitreverse_32i8(		; CHECK-LABEL: @bitreverse_32i8(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([32 x i8]* @src8 to <32 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.*]] = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])		; CHECK-NEXT: store <32 x i8> [[TMP2]], <32 x i8>* bitcast ([32 x i8]* @dst8 to <32 x i8>*), align 1
; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP2]])
; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1		%ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
%ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1		%ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
%ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1		%ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
%ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1		%ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
%ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1		%ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
%ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1		%ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
▲ Show 20 Lines • Show All 95 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/bswap.ll

Show First 20 Lines • Show All 95 Lines • ▼ Show 20 Lines	;
store i32 %bswap0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4		store i32 %bswap0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
store i32 %bswap1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4		store i32 %bswap1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
store i32 %bswap2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4		store i32 %bswap2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
store i32 %bswap3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4		store i32 %bswap3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
ret void		ret void
}		}

define void @bswap_8i32() #0 {		define void @bswap_8i32() #0 {
; SSE-LABEL: @bswap_8i32(		; CHECK-LABEL: @bswap_8i32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2		; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2		; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> [[TMP1]])
; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP1]])		; CHECK-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP2]])		; CHECK-NEXT: ret void
; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
; SSE-NEXT: ret void
;
; AVX-LABEL: @bswap_8i32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> [[TMP1]])
; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
; AVX-NEXT: ret void
;		;
%ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2		%ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
%ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2		%ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
%ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2		%ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
%ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2		%ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
%ld4 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2		%ld4 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
%ld5 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2		%ld5 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
%ld6 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2		%ld6 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	;
store i16 %bswap4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2		store i16 %bswap4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
store i16 %bswap5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2		store i16 %bswap5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
store i16 %bswap6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2		store i16 %bswap6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
store i16 %bswap7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2		store i16 %bswap7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
ret void		ret void
}		}

define void @bswap_16i16() #0 {		define void @bswap_16i16() #0 {
; SSE-LABEL: @bswap_16i16(		; CHECK-LABEL: @bswap_16i16(
; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> [[TMP1]])
; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP1]])		; CHECK-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP2]])		; CHECK-NEXT: ret void
; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; AVX-LABEL: @bswap_16i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> [[TMP1]])
; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; AVX-NEXT: ret void
;		;
%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2		%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2		%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2		%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2		%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2		%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2		%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2		%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/cast.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -basicaa -slp-vectorizer -dce -S \| FileCheck %s --check-prefixes=CHECK,SSE42		; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -basicaa -slp-vectorizer -dce -S \| FileCheck %s --check-prefixes=CHECK,SSE42
; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -basicaa -slp-vectorizer -dce -S \| FileCheck %s --check-prefixes=CHECK,AVX		; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -basicaa -slp-vectorizer -dce -S \| FileCheck %s --check-prefixes=CHECK,AVX

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"		target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

; int test_sext_4i8_to_4i32(int * restrict A, char * restrict B) {		; int test_sext_4i8_to_4i32(int * restrict A, char * restrict B) {
; A[0] = B[0];		; A[0] = B[0];
; A[1] = B[1];		; A[1] = B[1];
; A[2] = B[2];		; A[2] = B[2];
; A[3] = B[3];		; A[3] = B[3];
; }		; }

define i32 @test_sext_4i8_to_4i32(i32* noalias nocapture %A, i8* noalias nocapture %B) {		define i32 @test_sext_4i8_to_4i32(i32* noalias nocapture %A, i8* noalias nocapture %B) {
		; SSE41-LABEL: @test_sext_4i8_to_4i32(
		; SSE41-NEXT: entry:
		; SSE41-NEXT: [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to <4 x i8>*
		; SSE41-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
		; SSE41-NEXT: [[TMP2:%.*]] = sext <4 x i8> [[TMP1]] to <4 x i32>
		; SSE41-NEXT: [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
		; SSE41-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
		; SSE41-NEXT: ret i32 undef
		RKSimon (inline comment, unsubmitted, not done): Please check this - it looks superfluous (and is under an unused prefix)
		;
; CHECK-LABEL: @test_sext_4i8_to_4i32(		; CHECK-LABEL: @test_sext_4i8_to_4i32(
; CHECK-NEXT: entry:		; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to <4 x i8>*		; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to <4 x i8>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1		; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i8> [[TMP1]] to <4 x i32>		; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i8> [[TMP1]] to <4 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*		; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4		; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: ret i32 undef		; CHECK-NEXT: ret i32 undef
Show All 16 Lines	entry:
%3 = load i8, i8* %arrayidx8, align 1		%3 = load i8, i8* %arrayidx8, align 1
%conv9 = sext i8 %3 to i32		%conv9 = sext i8 %3 to i32
%arrayidx10 = getelementptr inbounds i32, i32* %A, i64 3		%arrayidx10 = getelementptr inbounds i32, i32* %A, i64 3
store i32 %conv9, i32* %arrayidx10, align 4		store i32 %conv9, i32* %arrayidx10, align 4
ret i32 undef		ret i32 undef
}		}

define i32 @test_zext_4i16_to_4i32(i32* noalias nocapture %A, i16* noalias nocapture %B) {		define i32 @test_zext_4i16_to_4i32(i32* noalias nocapture %A, i16* noalias nocapture %B) {
		; SSE41-LABEL: @test_zext_4i16_to_4i32(
		; SSE41-NEXT: entry:
		; SSE41-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <4 x i16>*
		; SSE41-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 1
		; SSE41-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
		; SSE41-NEXT: [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
		; SSE41-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
		; SSE41-NEXT: ret i32 undef
		;
; CHECK-LABEL: @test_zext_4i16_to_4i32(		; CHECK-LABEL: @test_zext_4i16_to_4i32(
; CHECK-NEXT: entry:		; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <4 x i16>*		; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <4 x i16>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 1		; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>		; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*		; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4		; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: ret i32 undef		; CHECK-NEXT: ret i32 undef
Show All 16 Lines	entry:
%3 = load i16, i16* %arrayidx8, align 1		%3 = load i16, i16* %arrayidx8, align 1
%conv9 = zext i16 %3 to i32		%conv9 = zext i16 %3 to i32
%arrayidx10 = getelementptr inbounds i32, i32* %A, i64 3		%arrayidx10 = getelementptr inbounds i32, i32* %A, i64 3
store i32 %conv9, i32* %arrayidx10, align 4		store i32 %conv9, i32* %arrayidx10, align 4
ret i32 undef		ret i32 undef
}		}

define i64 @test_sext_4i16_to_4i64(i64* noalias nocapture %A, i16* noalias nocapture %B) {		define i64 @test_sext_4i16_to_4i64(i64* noalias nocapture %A, i16* noalias nocapture %B) {
; SSE42-LABEL: @test_sext_4i16_to_4i64(		; CHECK-LABEL: @test_sext_4i16_to_4i64(
; SSE42-NEXT: entry:		; CHECK-NEXT: entry:
; SSE42-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <2 x i16>*		; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <4 x i16>*
; SSE42-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 1		; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 1
; SSE42-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i64>		; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i64>
; SSE42-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>*		; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <4 x i64>*
; SSE42-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 4		; CHECK-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 4
; SSE42-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2		; CHECK-NEXT: ret i64 undef
; SSE42-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2
; SSE42-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX5]] to <2 x i16>*
; SSE42-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 1
; SSE42-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP5]] to <2 x i64>
; SSE42-NEXT: [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX7]] to <2 x i64>*
; SSE42-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4
; SSE42-NEXT: ret i64 undef
;
; AVX-LABEL: @test_sext_4i16_to_4i64(
; AVX-NEXT: entry:
; AVX-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <4 x i16>*
; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 1
; AVX-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i64>
; AVX-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <4 x i64>*
; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 4
; AVX-NEXT: ret i64 undef
;		;
entry:		entry:
%0 = load i16, i16* %B, align 1		%0 = load i16, i16* %B, align 1
%conv = sext i16 %0 to i64		%conv = sext i16 %0 to i64
store i64 %conv, i64* %A, align 4		store i64 %conv, i64* %A, align 4
%arrayidx2 = getelementptr inbounds i16, i16* %B, i64 1		%arrayidx2 = getelementptr inbounds i16, i16* %B, i64 1
%1 = load i16, i16* %arrayidx2, align 1		%1 = load i16, i16* %arrayidx2, align 1
%conv3 = sext i16 %1 to i64		%conv3 = sext i16 %1 to i64
Show All 14 Lines

test/Transforms/SLPVectorizer/X86/ctlz.ll

Show First 20 Lines • Show All 222 Lines • ▼ Show 20 Lines	;
store i16 %ctlz4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2		store i16 %ctlz4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
store i16 %ctlz5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2		store i16 %ctlz5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
store i16 %ctlz6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2		store i16 %ctlz6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
store i16 %ctlz7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2		store i16 %ctlz7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
ret void		ret void
}		}

define void @ctlz_16i16() #0 {		define void @ctlz_16i16() #0 {
; SSE-LABEL: @ctlz_16i16(		; CHECK-LABEL: @ctlz_16i16(
; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> [[TMP1]], i1 false)
; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false)		; CHECK-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP2]], i1 false)		; CHECK-NEXT: ret void
; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; AVX-LABEL: @ctlz_16i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> [[TMP1]], i1 false)
; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; AVX-NEXT: ret void
;		;
%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2		%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2		%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2		%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2		%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2		%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2		%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2		%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines	;
store i8 %ctlz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1		store i8 %ctlz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
store i8 %ctlz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1		store i8 %ctlz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
store i8 %ctlz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1		store i8 %ctlz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
ret void		ret void
}		}

define void @ctlz_32i8() #0 {		define void @ctlz_32i8() #0 {
; CHECK-LABEL: @ctlz_32i8(		; CHECK-LABEL: @ctlz_32i8(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([32 x i8]* @src8 to <32 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.*]] = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> [[TMP1]], i1 false)
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false)		; CHECK-NEXT: store <32 x i8> [[TMP2]], <32 x i8>* bitcast ([32 x i8]* @dst8 to <32 x i8>*), align 1
; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 false)
; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1		%ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
%ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1		%ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
%ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1		%ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
%ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1		%ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
%ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1		%ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
%ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1		%ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
▲ Show 20 Lines • Show All 293 Lines • ▼ Show 20 Lines	;
store i16 %ctlz4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2		store i16 %ctlz4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
store i16 %ctlz5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2		store i16 %ctlz5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
store i16 %ctlz6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2		store i16 %ctlz6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
store i16 %ctlz7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2		store i16 %ctlz7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
ret void		ret void
}		}

define void @ctlz_undef_16i16() #0 {		define void @ctlz_undef_16i16() #0 {
; SSE-LABEL: @ctlz_undef_16i16(		; CHECK-LABEL: @ctlz_undef_16i16(
; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> [[TMP1]], i1 true)
; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true)		; CHECK-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP2]], i1 true)		; CHECK-NEXT: ret void
; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; AVX-LABEL: @ctlz_undef_16i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> [[TMP1]], i1 true)
; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; AVX-NEXT: ret void
;		;
%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2		%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2		%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2		%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2		%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2		%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2		%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2		%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines	;
store i8 %ctlz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1		store i8 %ctlz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
store i8 %ctlz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1		store i8 %ctlz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
store i8 %ctlz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1		store i8 %ctlz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
ret void		ret void
}		}

define void @ctlz_undef_32i8() #0 {		define void @ctlz_undef_32i8() #0 {
; CHECK-LABEL: @ctlz_undef_32i8(		; CHECK-LABEL: @ctlz_undef_32i8(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([32 x i8]* @src8 to <32 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.*]] = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> [[TMP1]], i1 true)
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true)		; CHECK-NEXT: store <32 x i8> [[TMP2]], <32 x i8>* bitcast ([32 x i8]* @dst8 to <32 x i8>*), align 1
; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 true)
; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1		%ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
%ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1		%ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
%ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1		%ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
%ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1		%ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
%ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1		%ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
%ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1		%ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
▲ Show 20 Lines • Show All 94 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/ctpop.ll

Show First 20 Lines • Show All 139 Lines • ▼ Show 20 Lines	;
store i32 %ctpop1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4		store i32 %ctpop1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
store i32 %ctpop2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4		store i32 %ctpop2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
store i32 %ctpop3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4		store i32 %ctpop3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
ret void		ret void
}		}

define void @ctpop_8i32() #0 {		define void @ctpop_8i32() #0 {
; SSE2-LABEL: @ctpop_8i32(		; SSE2-LABEL: @ctpop_8i32(
; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2		; SSE2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2		; SSE2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> [[TMP1]])
; SSE2-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]])		; SSE2-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
; SSE2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP2]])
; SSE2-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
; SSE2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
; SSE2-NEXT: ret void		; SSE2-NEXT: ret void
;		;
; SSE42-LABEL: @ctpop_8i32(		; SSE42-LABEL: @ctpop_8i32(
; SSE42-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2		; SSE42-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
; SSE42-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2		; SSE42-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
; SSE42-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2		; SSE42-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
; SSE42-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2		; SSE42-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
; SSE42-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2		; SSE42-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
▲ Show 20 Lines • Show All 108 Lines • ▼ Show 20 Lines	;
store i16 %ctpop4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2		store i16 %ctpop4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
store i16 %ctpop5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2		store i16 %ctpop5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
store i16 %ctpop6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2		store i16 %ctpop6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
store i16 %ctpop7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2		store i16 %ctpop7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
ret void		ret void
}		}

define void @ctpop_16i16() #0 {		define void @ctpop_16i16() #0 {
; SSE-LABEL: @ctpop_16i16(		; CHECK-LABEL: @ctpop_16i16(
; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> [[TMP1]])
; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP1]])		; CHECK-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP2]])		; CHECK-NEXT: ret void
; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; AVX-LABEL: @ctpop_16i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> [[TMP1]])
; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; AVX-NEXT: ret void
;		;
%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2		%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2		%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2		%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2		%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2		%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2		%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2		%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines	;
store i8 %ctpop13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1		store i8 %ctpop13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
store i8 %ctpop14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1		store i8 %ctpop14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
store i8 %ctpop15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1		store i8 %ctpop15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
ret void		ret void
}		}

define void @ctpop_32i8() #0 {		define void @ctpop_32i8() #0 {
; CHECK-LABEL: @ctpop_32i8(		; CHECK-LABEL: @ctpop_32i8(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([32 x i8]* @src8 to <32 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.*]] = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP1]])		; CHECK-NEXT: store <32 x i8> [[TMP2]], <32 x i8>* bitcast ([32 x i8]* @dst8 to <32 x i8>*), align 1
; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP2]])
; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1		%ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
%ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1		%ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
%ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1		%ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
%ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1		%ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
%ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1		%ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
%ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1		%ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
▲ Show 20 Lines • Show All 95 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/cttz.ll

Show First 20 Lines • Show All 222 Lines • ▼ Show 20 Lines	;
store i16 %cttz4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2		store i16 %cttz4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
store i16 %cttz5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2		store i16 %cttz5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
store i16 %cttz6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2		store i16 %cttz6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
store i16 %cttz7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2		store i16 %cttz7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
ret void		ret void
}		}

define void @cttz_16i16() #0 {		define void @cttz_16i16() #0 {
; SSE-LABEL: @cttz_16i16(		; CHECK-LABEL: @cttz_16i16(
; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> [[TMP1]], i1 false)
; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 false)		; CHECK-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP2]], i1 false)		; CHECK-NEXT: ret void
; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; AVX-LABEL: @cttz_16i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> [[TMP1]], i1 false)
; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; AVX-NEXT: ret void
;		;
%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2		%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2		%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2		%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2		%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2		%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2		%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2		%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines	;
store i8 %cttz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1		store i8 %cttz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
store i8 %cttz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1		store i8 %cttz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
store i8 %cttz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1		store i8 %cttz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
ret void		ret void
}		}

define void @cttz_32i8() #0 {		define void @cttz_32i8() #0 {
; CHECK-LABEL: @cttz_32i8(		; CHECK-LABEL: @cttz_32i8(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([32 x i8]* @src8 to <32 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.*]] = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> [[TMP1]], i1 false)
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 false)		; CHECK-NEXT: store <32 x i8> [[TMP2]], <32 x i8>* bitcast ([32 x i8]* @dst8 to <32 x i8>*), align 1
; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP2]], i1 false)
; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1		%ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
%ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1		%ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
%ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1		%ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
%ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1		%ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
%ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1		%ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
%ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1		%ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
▲ Show 20 Lines • Show All 293 Lines • ▼ Show 20 Lines	;
store i16 %cttz4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2		store i16 %cttz4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
store i16 %cttz5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2		store i16 %cttz5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
store i16 %cttz6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2		store i16 %cttz6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
store i16 %cttz7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2		store i16 %cttz7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
ret void		ret void
}		}

define void @cttz_undef_16i16() #0 {		define void @cttz_undef_16i16() #0 {
; SSE-LABEL: @cttz_undef_16i16(		; CHECK-LABEL: @cttz_undef_16i16(
; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2		; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> [[TMP1]], i1 true)
; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 true)		; CHECK-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP2]], i1 true)		; CHECK-NEXT: ret void
; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; AVX-LABEL: @cttz_undef_16i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> [[TMP1]], i1 true)
; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; AVX-NEXT: ret void
;		;
%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2		%ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2		%ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2		%ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2		%ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2		%ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2		%ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2		%ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines	;
store i8 %cttz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1		store i8 %cttz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
store i8 %cttz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1		store i8 %cttz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
store i8 %cttz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1		store i8 %cttz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
ret void		ret void
}		}

define void @cttz_undef_32i8() #0 {		define void @cttz_undef_32i8() #0 {
; CHECK-LABEL: @cttz_undef_32i8(		; CHECK-LABEL: @cttz_undef_32i8(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([32 x i8]* @src8 to <32 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.*]] = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> [[TMP1]], i1 true)
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 true)		; CHECK-NEXT: store <32 x i8> [[TMP2]], <32 x i8>* bitcast ([32 x i8]* @dst8 to <32 x i8>*), align 1
; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP2]], i1 true)
; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1		%ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
%ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1		%ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
%ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1		%ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
%ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1		%ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
%ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1		%ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
%ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1		%ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
▲ Show 20 Lines • Show All 94 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/fabs.ll

Show All 31 Lines	;
%fabs0 = call double @llvm.fabs.f64(double %a0)		%fabs0 = call double @llvm.fabs.f64(double %a0)
%fabs1 = call double @llvm.fabs.f64(double %a1)		%fabs1 = call double @llvm.fabs.f64(double %a1)
store double %fabs0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8		store double %fabs0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
store double %fabs1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8		store double %fabs1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
ret void		ret void
}		}

define void @fabs_4f64() #0 {		define void @fabs_4f64() #0 {
; SSE-LABEL: @fabs_4f64(		; CHECK-LABEL: @fabs_4f64(
; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8		; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8		; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP1]])
; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]])		; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP2]])		; CHECK-NEXT: ret void
; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
; SSE-NEXT: ret void
;
; AVX-LABEL: @fabs_4f64(
; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP1]])
; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; AVX-NEXT: ret void
;		;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8		%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8		%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8		%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8		%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
%fabs0 = call double @llvm.fabs.f64(double %a0)		%fabs0 = call double @llvm.fabs.f64(double %a0)
%fabs1 = call double @llvm.fabs.f64(double %a1)		%fabs1 = call double @llvm.fabs.f64(double %a1)
%fabs2 = call double @llvm.fabs.f64(double %a2)		%fabs2 = call double @llvm.fabs.f64(double %a2)
%fabs3 = call double @llvm.fabs.f64(double %a3)		%fabs3 = call double @llvm.fabs.f64(double %a3)
store double %fabs0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8		store double %fabs0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
store double %fabs1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8		store double %fabs1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
store double %fabs2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8		store double %fabs2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
store double %fabs3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8		store double %fabs3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
ret void		ret void
}		}

define void @fabs_8f64() #0 {		define void @fabs_8f64() #0 {
; SSE-LABEL: @fabs_8f64(		; CHECK-LABEL: @fabs_8f64(
; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 4		; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 4
; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.fabs.v8f64(<8 x double> [[TMP1]])
; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4		; CHECK-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 4
; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]])
; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP2]])
; SSE-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP3]])
; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP4]])
; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
; SSE-NEXT: ret void
;
; AVX256-LABEL: @fabs_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 4
; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4
; AVX256-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP1]])
; AVX256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP2]])
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @fabs_8f64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.fabs.v8f64(<8 x double> [[TMP1]])
; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 4
; AVX512-NEXT: ret void
;		;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 4		%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 4
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 4		%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 4
%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 4		%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 4
%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 4		%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 4
%a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 4		%a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 4
%a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 4		%a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 4
%a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 4		%a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 4
Show All 35 Lines	;
store float %fabs0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4		store float %fabs0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
store float %fabs1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4		store float %fabs1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
store float %fabs2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4		store float %fabs2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
store float %fabs3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4		store float %fabs3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
ret void		ret void
}		}

define void @fabs_8f32() #0 {		define void @fabs_8f32() #0 {
; SSE-LABEL: @fabs_8f32(		; CHECK-LABEL: @fabs_8f32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4		; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP1]])
; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]])		; CHECK-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])		; CHECK-NEXT: ret void
; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @fabs_8f32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP1]])
; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
; AVX-NEXT: ret void
;		;
%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4		%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4		%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4		%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4		%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4		%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4		%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4		%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
Show All 13 Lines	;
store float %fabs4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4		store float %fabs4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
store float %fabs5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4		store float %fabs5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
store float %fabs6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4		store float %fabs6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
store float %fabs7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4		store float %fabs7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
ret void		ret void
}		}

define void @fabs_16f32() #0 {		define void @fabs_16f32() #0 {
; SSE-LABEL: @fabs_16f32(		; CHECK-LABEL: @fabs_16f32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4		; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4
; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.fabs.v16f32(<16 x float> [[TMP1]])
; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4		; CHECK-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]])
; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
; SSE-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP3]])
; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP4]])
; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
; SSE-NEXT: ret void
;
; AVX256-LABEL: @fabs_16f32(
; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
; AVX256-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP1]])
; AVX256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP2]])
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @fabs_16f32(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.fabs.v16f32(<16 x float> [[TMP1]])
; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
; AVX512-NEXT: ret void
;		;
%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4		%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4		%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4		%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4		%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4		%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4		%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4		%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/fcopysign.ll

Show All 38 Lines	;
%fcopysign0 = call double @llvm.copysign.f64(double %a0, double %b0)		%fcopysign0 = call double @llvm.copysign.f64(double %a0, double %b0)
%fcopysign1 = call double @llvm.copysign.f64(double %a1, double %b1)		%fcopysign1 = call double @llvm.copysign.f64(double %a1, double %b1)
store double %fcopysign0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8		store double %fcopysign0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
store double %fcopysign1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8		store double %fcopysign1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
ret void		ret void
}		}

define void @fcopysign_4f64() #0 {		define void @fcopysign_4f64() #0 {
; SSE-LABEL: @fcopysign_4f64(		; CHECK-LABEL: @fcopysign_4f64(
; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8		; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 8
; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8		; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 8
; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8		; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]])
; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8		; CHECK-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP3]])		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]])
; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
; SSE-NEXT: ret void
;
; AVX-LABEL: @fcopysign_4f64(
; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 8
; AVX-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 8
; AVX-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]])
; AVX-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; AVX-NEXT: ret void
;		;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8		%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8		%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8
%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 8		%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 8
%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 8		%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 8
%b0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8		%b0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8
%b1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8		%b1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8
%b2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 8		%b2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 8
%b3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 8		%b3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 8
%fcopysign0 = call double @llvm.copysign.f64(double %a0, double %b0)		%fcopysign0 = call double @llvm.copysign.f64(double %a0, double %b0)
%fcopysign1 = call double @llvm.copysign.f64(double %a1, double %b1)		%fcopysign1 = call double @llvm.copysign.f64(double %a1, double %b1)
%fcopysign2 = call double @llvm.copysign.f64(double %a2, double %b2)		%fcopysign2 = call double @llvm.copysign.f64(double %a2, double %b2)
%fcopysign3 = call double @llvm.copysign.f64(double %a3, double %b3)		%fcopysign3 = call double @llvm.copysign.f64(double %a3, double %b3)
store double %fcopysign0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8		store double %fcopysign0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
store double %fcopysign1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8		store double %fcopysign1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
store double %fcopysign2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8		store double %fcopysign2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
store double %fcopysign3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8		store double %fcopysign3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
ret void		ret void
}		}

define void @fcopysign_8f64() #0 {		define void @fcopysign_8f64() #0 {
; SSE-LABEL: @fcopysign_8f64(		; CHECK-LABEL: @fcopysign_8f64(
; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4		; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcA64 to <8 x double>*), align 4
; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcB64 to <8 x double>*), align 4
; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4		; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.copysign.v8f64(<8 x double> [[TMP1]], <8 x double> [[TMP2]])
; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4		; CHECK-NEXT: store <8 x double> [[TMP3]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 4
; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4
; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4
; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4
; SSE-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP5]])
; SSE-NEXT: [[TMP10:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP6]])
; SSE-NEXT: [[TMP11:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP3]], <2 x double> [[TMP7]])
; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP8]])
; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
; SSE-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
; SSE-NEXT: store <2 x double> [[TMP11]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
; SSE-NEXT: store <2 x double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
; SSE-NEXT: ret void
;
; AVX256-LABEL: @fcopysign_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4
; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
; AVX256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
; AVX256-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]])
; AVX256-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]])
; AVX256-NEXT: store <4 x double> [[TMP5]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
; AVX256-NEXT: store <4 x double> [[TMP6]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @fcopysign_8f64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcA64 to <8 x double>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcB64 to <8 x double>*), align 4
; AVX512-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.copysign.v8f64(<8 x double> [[TMP1]], <8 x double> [[TMP2]])
; AVX512-NEXT: store <8 x double> [[TMP3]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 4
; AVX512-NEXT: ret void
;		;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4		%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4		%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4
%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 4		%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 4
%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 4		%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 4
%a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4), align 4		%a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4), align 4
%a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 5), align 4		%a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 5), align 4
%a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6), align 4		%a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6), align 4
▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines	;
store float %fcopysign0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4		store float %fcopysign0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
store float %fcopysign1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4		store float %fcopysign1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
store float %fcopysign2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4		store float %fcopysign2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
store float %fcopysign3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4		store float %fcopysign3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
ret void		ret void
}		}

define void @fcopysign_8f32() #0 {		define void @fcopysign_8f32() #0 {
; SSE-LABEL: @fcopysign_8f32(		; CHECK-LABEL: @fcopysign_8f32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4		; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4		; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]])
; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4		; CHECK-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]])		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP4]])
; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @fcopysign_8f32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
; AVX-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]])
; AVX-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
; AVX-NEXT: ret void
;		;
%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4		%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4		%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4		%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4		%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4		%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4
%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4		%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4
%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4		%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4
Show All 21 Lines	;
store float %fcopysign4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4		store float %fcopysign4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
store float %fcopysign5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4		store float %fcopysign5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
store float %fcopysign6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4		store float %fcopysign6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
store float %fcopysign7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4		store float %fcopysign7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
ret void		ret void
}		}

define void @fcopysign_16f32() #0 {		define void @fcopysign_16f32() #0 {
; SSE-LABEL: @fcopysign_16f32(		; CHECK-LABEL: @fcopysign_16f32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4		; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcA32 to <16 x float>*), align 4
; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcB32 to <16 x float>*), align 4
; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4		; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.copysign.v16f32(<16 x float> [[TMP1]], <16 x float> [[TMP2]])
; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4		; CHECK-NEXT: store <16 x float> [[TMP3]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4
; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4
; SSE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP5]])
; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP6]])
; SSE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP7]])
; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP8]])
; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; SSE-NEXT: store <4 x float> [[TMP10]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
; SSE-NEXT: store <4 x float> [[TMP11]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
; SSE-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
; SSE-NEXT: ret void
;
; AVX256-LABEL: @fcopysign_16f32(
; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
; AVX256-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]])
; AVX256-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]])
; AVX256-NEXT: store <8 x float> [[TMP5]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
; AVX256-NEXT: store <8 x float> [[TMP6]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @fcopysign_16f32(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcA32 to <16 x float>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcB32 to <16 x float>*), align 4
; AVX512-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.copysign.v16f32(<16 x float> [[TMP1]], <16 x float> [[TMP2]])
; AVX512-NEXT: store <16 x float> [[TMP3]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
; AVX512-NEXT: ret void
;		;
%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4		%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4		%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4		%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4		%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4		%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4
%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4		%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4
%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4		%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4
▲ Show 20 Lines • Show All 61 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/fma.ll

	Show First 20 Lines • Show All 152 Lines • ▼ Show 20 Lines
	; NO-FMA-NEXT: store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 4			; NO-FMA-NEXT: store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 4
	; NO-FMA-NEXT: store double [[FMA3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 4			; NO-FMA-NEXT: store double [[FMA3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 4
	; NO-FMA-NEXT: store double [[FMA4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 4			; NO-FMA-NEXT: store double [[FMA4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 4
	; NO-FMA-NEXT: store double [[FMA5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 4			; NO-FMA-NEXT: store double [[FMA5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 4
	; NO-FMA-NEXT: store double [[FMA6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 4			; NO-FMA-NEXT: store double [[FMA6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 4
	; NO-FMA-NEXT: store double [[FMA7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 4			; NO-FMA-NEXT: store double [[FMA7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 4
	; NO-FMA-NEXT: ret void			; NO-FMA-NEXT: ret void
	;			;
	; FMA256-LABEL: @fma_8f64(			; FMA-LABEL: @fma_8f64(
	; FMA256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4			; FMA-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcA64 to <8 x double>*), align 4
	; FMA256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4			; FMA-NEXT: [[TMP2:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcB64 to <8 x double>*), align 4
	; FMA256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4			; FMA-NEXT: [[TMP3:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcC64 to <8 x double>*), align 4
	; FMA256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4			; FMA-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.fma.v8f64(<8 x double> [[TMP1]], <8 x double> [[TMP2]], <8 x double> [[TMP3]])
	; FMA256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4			; FMA-NEXT: store <8 x double> [[TMP4]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 4
	; FMA256-NEXT: [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4			; FMA-NEXT: ret void
	; FMA256-NEXT: [[TMP7:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x double> [[TMP5]])
	; FMA256-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x double> [[TMP6]])
	; FMA256-NEXT: store <4 x double> [[TMP7]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
	; FMA256-NEXT: store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
	; FMA256-NEXT: ret void
	;
	; FMA512-LABEL: @fma_8f64(
	; FMA512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcA64 to <8 x double>*), align 4
	; FMA512-NEXT: [[TMP2:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcB64 to <8 x double>*), align 4
	; FMA512-NEXT: [[TMP3:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcC64 to <8 x double>*), align 4
	; FMA512-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.fma.v8f64(<8 x double> [[TMP1]], <8 x double> [[TMP2]], <8 x double> [[TMP3]])
	; FMA512-NEXT: store <8 x double> [[TMP4]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 4
	; FMA512-NEXT: ret void
	;			;
	%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4			%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4
	%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4			%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4
	%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 4			%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 4
	%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 4			%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 4
	%a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4), align 4			%a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4), align 4
	%a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 5), align 4			%a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 5), align 4
	%a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6), align 4			%a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6), align 4
	▲ Show 20 Lines • Show All 262 Lines • ▼ Show 20 Lines
	; NO-FMA-NEXT: store float [[FMA10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4			; NO-FMA-NEXT: store float [[FMA10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
	; NO-FMA-NEXT: store float [[FMA11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4			; NO-FMA-NEXT: store float [[FMA11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
	; NO-FMA-NEXT: store float [[FMA12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4			; NO-FMA-NEXT: store float [[FMA12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
	; NO-FMA-NEXT: store float [[FMA13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4			; NO-FMA-NEXT: store float [[FMA13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
	; NO-FMA-NEXT: store float [[FMA14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4			; NO-FMA-NEXT: store float [[FMA14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
	; NO-FMA-NEXT: store float [[FMA15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4			; NO-FMA-NEXT: store float [[FMA15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
	; NO-FMA-NEXT: ret void			; NO-FMA-NEXT: ret void
	;			;
	; FMA256-LABEL: @fma_16f32(			; FMA-LABEL: @fma_16f32(
	; FMA256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4			; FMA-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcA32 to <16 x float>*), align 4
	; FMA256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4			; FMA-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcB32 to <16 x float>*), align 4
	; FMA256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4			; FMA-NEXT: [[TMP3:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcC32 to <16 x float>*), align 4
	; FMA256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4			; FMA-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.fma.v16f32(<16 x float> [[TMP1]], <16 x float> [[TMP2]], <16 x float> [[TMP3]])
	; FMA256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4			; FMA-NEXT: store <16 x float> [[TMP4]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; FMA256-NEXT: [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4			; FMA-NEXT: ret void
	; FMA256-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]], <8 x float> [[TMP5]])
	; FMA256-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]], <8 x float> [[TMP6]])
	; FMA256-NEXT: store <8 x float> [[TMP7]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; FMA256-NEXT: store <8 x float> [[TMP8]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
	; FMA256-NEXT: ret void
	;
	; FMA512-LABEL: @fma_16f32(
	; FMA512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcA32 to <16 x float>*), align 4
	; FMA512-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcB32 to <16 x float>*), align 4
	; FMA512-NEXT: [[TMP3:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcC32 to <16 x float>*), align 4
	; FMA512-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.fma.v16f32(<16 x float> [[TMP1]], <16 x float> [[TMP2]], <16 x float> [[TMP3]])
	; FMA512-NEXT: store <16 x float> [[TMP4]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; FMA512-NEXT: ret void
	;			;
	%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4			%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
	%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4			%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
	%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4			%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
	%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4			%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
	%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4			%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4
	%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4			%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4
	%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4			%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4
	▲ Show 20 Lines • Show All 77 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/fptosi.ll

Show First 20 Lines • Show All 75 Lines • ▼ Show 20 Lines
;		;
; AVX512-LABEL: @fptosi_8f64_8i64(		; AVX512-LABEL: @fptosi_8f64_8i64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8		; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
; AVX512-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i64>		; AVX512-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i64>
; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8		; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; AVX256DQ-LABEL: @fptosi_8f64_8i64(		; AVX256DQ-LABEL: @fptosi_8f64_8i64(
; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8		; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8		; AVX256DQ-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i64>
; AVX256DQ-NEXT: [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64>		; AVX256DQ-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i64>
; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX256DQ-NEXT: ret void		; AVX256DQ-NEXT: ret void
;		;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8		%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8		%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8		%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8		%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
%a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8		%a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
%a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8		%a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
Show All 14 Lines	;
store i64 %cvt4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8		store i64 %cvt4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
store i64 %cvt5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8		store i64 %cvt5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
store i64 %cvt6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8		store i64 %cvt6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
store i64 %cvt7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8		store i64 %cvt7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
ret void		ret void
}		}

define void @fptosi_8f64_8i32() #0 {		define void @fptosi_8f64_8i32() #0 {
; SSE-LABEL: @fptosi_8f64_8i32(		; CHECK-LABEL: @fptosi_8f64_8i32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8		; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
; SSE-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8		; CHECK-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i32>
; SSE-NEXT: [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32>		; CHECK-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i32>		; CHECK-NEXT: ret void
; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @fptosi_8f64_8i32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
; AVX-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i32>
; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
; AVX-NEXT: ret void
;		;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8		%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8		%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8		%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8		%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
%a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8		%a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
%a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8		%a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
%a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8		%a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
▲ Show 20 Lines • Show All 167 Lines • ▼ Show 20 Lines
;		;
; AVX512-LABEL: @fptosi_8f32_8i64(		; AVX512-LABEL: @fptosi_8f32_8i64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4		; AVX512-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i64>		; AVX512-NEXT: [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i64>
; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8		; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; AVX256DQ-LABEL: @fptosi_8f32_8i64(		; AVX256DQ-LABEL: @fptosi_8f32_8i64(
; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4		; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4		; AVX256DQ-NEXT: [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i64>
; AVX256DQ-NEXT: [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64>		; AVX256DQ-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i64>
; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX256DQ-NEXT: ret void		; AVX256DQ-NEXT: ret void
;		;
%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4		%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4		%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4		%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4		%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4		%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4		%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
Show All 14 Lines	;
store i64 %cvt4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8		store i64 %cvt4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
store i64 %cvt5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8		store i64 %cvt5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
store i64 %cvt6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8		store i64 %cvt6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
store i64 %cvt7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8		store i64 %cvt7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
ret void		ret void
}		}

define void @fptosi_8f32_8i32() #0 {		define void @fptosi_8f32_8i32() #0 {
; SSE-LABEL: @fptosi_8f32_8i32(		; CHECK-LABEL: @fptosi_8f32_8i32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4		; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i32>
; SSE-NEXT: [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>		; CHECK-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i32>		; CHECK-NEXT: ret void
; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @fptosi_8f32_8i32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i32>
; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
; AVX-NEXT: ret void
;		;
%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4		%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4		%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4		%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4		%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4		%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4		%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4		%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
▲ Show 20 Lines • Show All 160 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/fptoui.ll

	Show First 20 Lines • Show All 75 Lines • ▼ Show 20 Lines
	;			;
	; AVX512-LABEL: @fptoui_8f64_8i64(			; AVX512-LABEL: @fptoui_8f64_8i64(
	; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8			; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i64>			; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i64>
	; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8			; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
	; AVX512-NEXT: ret void			; AVX512-NEXT: ret void
	;			;
	; AVX256DQ-LABEL: @fptoui_8f64_8i64(			; AVX256DQ-LABEL: @fptoui_8f64_8i64(
	; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8			; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8			; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i64>
	; AVX256DQ-NEXT: [[TMP3:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i64>			; AVX256DQ-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
	; AVX256DQ-NEXT: [[TMP4:%.*]] = fptoui <4 x double> [[TMP2]] to <4 x i64>
	; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
	; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX256DQ-NEXT: ret void			; AVX256DQ-NEXT: ret void
	;			;
	%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8			%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
	%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8			%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
	%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8			%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
	%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8			%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
	%a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8			%a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
	%a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8			%a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
	▲ Show 20 Lines • Show All 289 Lines • ▼ Show 20 Lines
	;			;
	; AVX512-LABEL: @fptoui_8f32_8i64(			; AVX512-LABEL: @fptoui_8f32_8i64(
	; AVX512-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4			; AVX512-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i64>			; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i64>
	; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8			; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
	; AVX512-NEXT: ret void			; AVX512-NEXT: ret void
	;			;
	; AVX256DQ-LABEL: @fptoui_8f32_8i64(			; AVX256DQ-LABEL: @fptoui_8f32_8i64(
	; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4			; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4			; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i64>
	; AVX256DQ-NEXT: [[TMP3:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i64>			; AVX256DQ-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
	; AVX256DQ-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i64>
	; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
	; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
	; AVX256DQ-NEXT: ret void			; AVX256DQ-NEXT: ret void
	;			;
	%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4			%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
	%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4			%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
	%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4			%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
	%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4			%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
	%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4			%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
	%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4			%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
	▲ Show 20 Lines • Show All 232 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/fround.ll

	Show First 20 Lines • Show All 67 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]])			; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]])
	; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8			; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
	; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8			; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
	; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8			; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
	; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8			; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @ceil_4f64(			; SSE41-LABEL: @ceil_4f64(
	; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8			; SSE41-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8			; SSE41-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]])			; SSE41-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]])
	; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
	; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
	; SSE41-NEXT: ret void			; SSE41-NEXT: ret void
	;			;
	; AVX-LABEL: @ceil_4f64(			; AVX-LABEL: @ceil_4f64(
	; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8			; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])			; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])
	; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8			; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; AVX-NEXT: ret void			; AVX-NEXT: ret void
	;			;
	Show All 36 Lines
	; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8			; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
	; SSE2-NEXT: store double [[CEIL4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8			; SSE2-NEXT: store double [[CEIL4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
	; SSE2-NEXT: store double [[CEIL5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8			; SSE2-NEXT: store double [[CEIL5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
	; SSE2-NEXT: store double [[CEIL6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8			; SSE2-NEXT: store double [[CEIL6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
	; SSE2-NEXT: store double [[CEIL7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8			; SSE2-NEXT: store double [[CEIL7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @ceil_8f64(			; SSE41-LABEL: @ceil_8f64(
	; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8			; SSE41-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8			; SSE41-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.ceil.v8f64(<8 x double> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8			; SSE41-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
	; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8			; SSE41-NEXT: ret void
	; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]])			;
	; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]])			; AVX-LABEL: @ceil_8f64(
	; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP3]])			; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP4]])			; AVX-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.ceil.v8f64(<8 x double> [[TMP1]])
	; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8			; AVX-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
	; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8			; AVX-NEXT: ret void
	; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
	; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
	; SSE41-NEXT: ret void
	;
	; AVX1-LABEL: @ceil_8f64(
	; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])
	; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP2]])
	; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX1-NEXT: ret void
	;
	; AVX2-LABEL: @ceil_8f64(
	; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])
	; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP2]])
	; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX2-NEXT: ret void
	;
	; AVX512-LABEL: @ceil_8f64(
	; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.ceil.v8f64(<8 x double> [[TMP1]])
	; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
	; AVX512-NEXT: ret void
	;			;
	%ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8			%ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
	%ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8			%ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
	%ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8			%ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
	%ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8			%ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
	%ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8			%ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
	%ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8			%ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
	%ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8			%ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
	▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]])			; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]])
	; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8			; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
	; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8			; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
	; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8			; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
	; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8			; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @floor_4f64(			; SSE41-LABEL: @floor_4f64(
	; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8			; SSE41-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8			; SSE41-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]])			; SSE41-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]])
	; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
	; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
	; SSE41-NEXT: ret void			; SSE41-NEXT: ret void
	;			;
	; AVX-LABEL: @floor_4f64(			; AVX-LABEL: @floor_4f64(
	; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8			; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])			; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])
	; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8			; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; AVX-NEXT: ret void			; AVX-NEXT: ret void
	;			;
	Show All 36 Lines
	; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8			; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
	; SSE2-NEXT: store double [[FLOOR4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8			; SSE2-NEXT: store double [[FLOOR4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
	; SSE2-NEXT: store double [[FLOOR5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8			; SSE2-NEXT: store double [[FLOOR5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
	; SSE2-NEXT: store double [[FLOOR6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8			; SSE2-NEXT: store double [[FLOOR6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
	; SSE2-NEXT: store double [[FLOOR7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8			; SSE2-NEXT: store double [[FLOOR7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @floor_8f64(			; SSE41-LABEL: @floor_8f64(
	; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8			; SSE41-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8			; SSE41-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.floor.v8f64(<8 x double> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8			; SSE41-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
	; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8			; SSE41-NEXT: ret void
	; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]])			;
	; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]])			; AVX-LABEL: @floor_8f64(
	; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP3]])			; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP4]])			; AVX-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.floor.v8f64(<8 x double> [[TMP1]])
	; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8			; AVX-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
	; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8			; AVX-NEXT: ret void
	; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
	; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
	; SSE41-NEXT: ret void
	;
	; AVX1-LABEL: @floor_8f64(
	; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])
	; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP2]])
	; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX1-NEXT: ret void
	;
	; AVX2-LABEL: @floor_8f64(
	; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])
	; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP2]])
	; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX2-NEXT: ret void
	;
	; AVX512-LABEL: @floor_8f64(
	; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.floor.v8f64(<8 x double> [[TMP1]])
	; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
	; AVX512-NEXT: ret void
	;			;
	%ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8			%ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
	%ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8			%ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
	%ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8			%ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
	%ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8			%ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
	%ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8			%ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
	%ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8			%ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
	%ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8			%ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
	▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]])			; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]])
	; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8			; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
	; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8			; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
	; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8			; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
	; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8			; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @nearbyint_4f64(			; SSE41-LABEL: @nearbyint_4f64(
	; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8			; SSE41-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8			; SSE41-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]])			; SSE41-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]])
	; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
	; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
	; SSE41-NEXT: ret void			; SSE41-NEXT: ret void
	;			;
	; AVX-LABEL: @nearbyint_4f64(			; AVX-LABEL: @nearbyint_4f64(
	; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8			; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])			; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])
	; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8			; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; AVX-NEXT: ret void			; AVX-NEXT: ret void
	;			;
	Show All 36 Lines
	; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8			; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
	; SSE2-NEXT: store double [[NEARBYINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8			; SSE2-NEXT: store double [[NEARBYINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
	; SSE2-NEXT: store double [[NEARBYINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8			; SSE2-NEXT: store double [[NEARBYINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
	; SSE2-NEXT: store double [[NEARBYINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8			; SSE2-NEXT: store double [[NEARBYINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
	; SSE2-NEXT: store double [[NEARBYINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8			; SSE2-NEXT: store double [[NEARBYINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @nearbyint_8f64(			; SSE41-LABEL: @nearbyint_8f64(
	; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8			; SSE41-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8			; SSE41-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8			; SSE41-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
	; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8			; SSE41-NEXT: ret void
	; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]])			;
	; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]])			; AVX-LABEL: @nearbyint_8f64(
	; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP3]])			; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP4]])			; AVX-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> [[TMP1]])
	; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8			; AVX-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
	; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8			; AVX-NEXT: ret void
	; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
	; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
	; SSE41-NEXT: ret void
	;
	; AVX1-LABEL: @nearbyint_8f64(
	; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])
	; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP2]])
	; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX1-NEXT: ret void
	;
	; AVX2-LABEL: @nearbyint_8f64(
	; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])
	; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP2]])
	; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX2-NEXT: ret void
	;
	; AVX512-LABEL: @nearbyint_8f64(
	; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> [[TMP1]])
	; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
	; AVX512-NEXT: ret void
	;			;
	%ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8			%ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
	%ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8			%ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
	%ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8			%ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
	%ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8			%ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
	%ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8			%ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
	%ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8			%ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
	%ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8			%ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
	▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]])			; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]])
	; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8			; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
	; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8			; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
	; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8			; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
	; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8			; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @rint_4f64(			; SSE41-LABEL: @rint_4f64(
	; SSE41-NEXT: [[TMP1:%.]] = load <2 x double>, <2 x double> bitcast ([8 x double]* @src64 to <2 x double>*), align 8			; SSE41-NEXT: [[TMP1:%.]] = load <4 x double>, <4 x double> bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; SSE41-NEXT: [[TMP2:%.]] = load <2 x double>, <2 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8			; SSE41-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]])			; SSE41-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]])
	; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
	; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
	; SSE41-NEXT: ret void			; SSE41-NEXT: ret void
	;			;
	; AVX-LABEL: @rint_4f64(			; AVX-LABEL: @rint_4f64(
	; AVX-NEXT: [[TMP1:%.]] = load <4 x double>, <4 x double> bitcast ([8 x double]* @src64 to <4 x double>*), align 8			; AVX-NEXT: [[TMP1:%.]] = load <4 x double>, <4 x double> bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])			; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
	; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8			; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; AVX-NEXT: ret void			; AVX-NEXT: ret void
	;			;
	Show All 36 Lines
	; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8			; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
	; SSE2-NEXT: store double [[RINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8			; SSE2-NEXT: store double [[RINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
	; SSE2-NEXT: store double [[RINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8			; SSE2-NEXT: store double [[RINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
	; SSE2-NEXT: store double [[RINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8			; SSE2-NEXT: store double [[RINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
	; SSE2-NEXT: store double [[RINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8			; SSE2-NEXT: store double [[RINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @rint_8f64(			; SSE41-LABEL: @rint_8f64(
	; SSE41-NEXT: [[TMP1:%.]] = load <2 x double>, <2 x double> bitcast ([8 x double]* @src64 to <2 x double>*), align 8			; SSE41-NEXT: [[TMP1:%.]] = load <8 x double>, <8 x double> bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; SSE41-NEXT: [[TMP2:%.]] = load <2 x double>, <2 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8			; SSE41-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.rint.v8f64(<8 x double> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.]] = load <2 x double>, <2 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8			; SSE41-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
	; SSE41-NEXT: [[TMP4:%.]] = load <2 x double>, <2 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8			; SSE41-NEXT: ret void
	; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]])			;
	; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]])			; AVX-LABEL: @rint_8f64(
	; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]])			; AVX-NEXT: [[TMP1:%.]] = load <8 x double>, <8 x double> bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP4]])			; AVX-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.rint.v8f64(<8 x double> [[TMP1]])
	; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8			; AVX-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
	; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8			; AVX-NEXT: ret void
	; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
	; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
	; SSE41-NEXT: ret void
	;
	; AVX1-LABEL: @rint_8f64(
	; AVX1-NEXT: [[TMP1:%.]] = load <4 x double>, <4 x double> bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; AVX1-NEXT: [[TMP2:%.]] = load <4 x double>, <4 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
	; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP2]])
	; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX1-NEXT: ret void
	;
	; AVX2-LABEL: @rint_8f64(
	; AVX2-NEXT: [[TMP1:%.]] = load <4 x double>, <4 x double> bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; AVX2-NEXT: [[TMP2:%.]] = load <4 x double>, <4 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
	; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP2]])
	; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX2-NEXT: ret void
	;
	; AVX512-LABEL: @rint_8f64(
	; AVX512-NEXT: [[TMP1:%.]] = load <8 x double>, <8 x double> bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.rint.v8f64(<8 x double> [[TMP1]])
	; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
	; AVX512-NEXT: ret void
	;			;
	%ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8			%ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
	%ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8			%ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
	%ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8			%ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
	%ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8			%ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
	%ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8			%ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
	%ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8			%ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
	%ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8			%ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
	▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]])			; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]])
	; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8			; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
	; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8			; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
	; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8			; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
	; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8			; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @trunc_4f64(			; SSE41-LABEL: @trunc_4f64(
	; SSE41-NEXT: [[TMP1:%.]] = load <2 x double>, <2 x double> bitcast ([8 x double]* @src64 to <2 x double>*), align 8			; SSE41-NEXT: [[TMP1:%.]] = load <4 x double>, <4 x double> bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; SSE41-NEXT: [[TMP2:%.]] = load <2 x double>, <2 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8			; SSE41-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]])			; SSE41-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]])
	; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
	; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
	; SSE41-NEXT: ret void			; SSE41-NEXT: ret void
	;			;
	; AVX-LABEL: @trunc_4f64(			; AVX-LABEL: @trunc_4f64(
	; AVX-NEXT: [[TMP1:%.]] = load <4 x double>, <4 x double> bitcast ([8 x double]* @src64 to <4 x double>*), align 8			; AVX-NEXT: [[TMP1:%.]] = load <4 x double>, <4 x double> bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]])			; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]])
	; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8			; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; AVX-NEXT: ret void			; AVX-NEXT: ret void
	;			;
	Show All 36 Lines
	; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8			; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
	; SSE2-NEXT: store double [[TRUNC4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8			; SSE2-NEXT: store double [[TRUNC4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
	; SSE2-NEXT: store double [[TRUNC5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8			; SSE2-NEXT: store double [[TRUNC5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
	; SSE2-NEXT: store double [[TRUNC6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8			; SSE2-NEXT: store double [[TRUNC6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
	; SSE2-NEXT: store double [[TRUNC7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8			; SSE2-NEXT: store double [[TRUNC7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @trunc_8f64(			; SSE41-LABEL: @trunc_8f64(
	; SSE41-NEXT: [[TMP1:%.]] = load <2 x double>, <2 x double> bitcast ([8 x double]* @src64 to <2 x double>*), align 8			; SSE41-NEXT: [[TMP1:%.]] = load <8 x double>, <8 x double> bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; SSE41-NEXT: [[TMP2:%.]] = load <2 x double>, <2 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8			; SSE41-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.trunc.v8f64(<8 x double> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.]] = load <2 x double>, <2 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8			; SSE41-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
	; SSE41-NEXT: [[TMP4:%.]] = load <2 x double>, <2 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8			; SSE41-NEXT: ret void
	; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]])			;
	; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]])			; AVX-LABEL: @trunc_8f64(
	; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP3]])			; AVX-NEXT: [[TMP1:%.]] = load <8 x double>, <8 x double> bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP4]])			; AVX-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.trunc.v8f64(<8 x double> [[TMP1]])
	; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8			; AVX-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
	; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8			; AVX-NEXT: ret void
	; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
	; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
	; SSE41-NEXT: ret void
	;
	; AVX1-LABEL: @trunc_8f64(
	; AVX1-NEXT: [[TMP1:%.]] = load <4 x double>, <4 x double> bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; AVX1-NEXT: [[TMP2:%.]] = load <4 x double>, <4 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]])
	; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP2]])
	; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX1-NEXT: ret void
	;
	; AVX2-LABEL: @trunc_8f64(
	; AVX2-NEXT: [[TMP1:%.]] = load <4 x double>, <4 x double> bitcast ([8 x double]* @src64 to <4 x double>*), align 8
	; AVX2-NEXT: [[TMP2:%.]] = load <4 x double>, <4 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]])
	; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP2]])
	; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
	; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
	; AVX2-NEXT: ret void
	;
	; AVX512-LABEL: @trunc_8f64(
	; AVX512-NEXT: [[TMP1:%.]] = load <8 x double>, <8 x double> bitcast ([8 x double]* @src64 to <8 x double>*), align 8
	; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.trunc.v8f64(<8 x double> [[TMP1]])
	; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
	; AVX512-NEXT: ret void
	;			;
	%ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8			%ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
	%ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8			%ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
	%ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8			%ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
	%ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8			%ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
	%ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8			%ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
	%ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8			%ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
	%ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8			%ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
	▲ Show 20 Lines • Show All 84 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4			; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
	; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4			; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
	; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4			; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
	; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4			; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
	; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4			; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @ceil_8f32(			; SSE41-LABEL: @ceil_8f32(
	; SSE41-NEXT: [[TMP1:%.]] = load <4 x float>, <4 x float> bitcast ([16 x float]* @src32 to <4 x float>*), align 4			; SSE41-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; SSE41-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4			; SSE41-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]])			; SSE41-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]])
	; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
	; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
	; SSE41-NEXT: ret void			; SSE41-NEXT: ret void
	;			;
	; AVX-LABEL: @ceil_8f32(			; AVX-LABEL: @ceil_8f32(
	; AVX-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4			; AVX-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])			; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])
	; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4			; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; AVX-NEXT: ret void			; AVX-NEXT: ret void
	;			;
	▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: store float [[CEIL11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4			; SSE2-NEXT: store float [[CEIL11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
	; SSE2-NEXT: store float [[CEIL12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4			; SSE2-NEXT: store float [[CEIL12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
	; SSE2-NEXT: store float [[CEIL13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4			; SSE2-NEXT: store float [[CEIL13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
	; SSE2-NEXT: store float [[CEIL14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4			; SSE2-NEXT: store float [[CEIL14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
	; SSE2-NEXT: store float [[CEIL15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4			; SSE2-NEXT: store float [[CEIL15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @ceil_16f32(			; SSE41-LABEL: @ceil_16f32(
	; SSE41-NEXT: [[TMP1:%.]] = load <4 x float>, <4 x float> bitcast ([16 x float]* @src32 to <4 x float>*), align 4			; SSE41-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
	; SSE41-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4			; SSE41-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.ceil.v16f32(<16 x float> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4			; SSE41-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; SSE41-NEXT: [[TMP4:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4			; SSE41-NEXT: ret void
	; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]])			;
	; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]])			; AVX-LABEL: @ceil_16f32(
	; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]])			; AVX-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
	; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP4]])			; AVX-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.ceil.v16f32(<16 x float> [[TMP1]])
	; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4			; AVX-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4			; AVX-NEXT: ret void
	; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
	; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
	; SSE41-NEXT: ret void
	;
	; AVX1-LABEL: @ceil_16f32(
	; AVX1-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX1-NEXT: [[TMP2:%.]] = load <8 x float>, <8 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])
	; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP2]])
	; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX1-NEXT: ret void
	;
	; AVX2-LABEL: @ceil_16f32(
	; AVX2-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX2-NEXT: [[TMP2:%.]] = load <8 x float>, <8 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])
	; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP2]])
	; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX2-NEXT: ret void
	;
	; AVX512-LABEL: @ceil_16f32(
	; AVX512-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
	; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.ceil.v16f32(<16 x float> [[TMP1]])
	; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; AVX512-NEXT: ret void
	;			;
	%ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4			%ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
	%ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4			%ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
	%ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4			%ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4
	%ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4			%ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4
	%ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4			%ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4
	%ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4			%ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4
	%ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4			%ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4
	▲ Show 20 Lines • Show All 108 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4			; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
	; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4			; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
	; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4			; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
	; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4			; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
	; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4			; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @floor_8f32(			; SSE41-LABEL: @floor_8f32(
	; SSE41-NEXT: [[TMP1:%.]] = load <4 x float>, <4 x float> bitcast ([16 x float]* @src32 to <4 x float>*), align 4			; SSE41-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; SSE41-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4			; SSE41-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]])			; SSE41-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]])
	; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
	; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
	; SSE41-NEXT: ret void			; SSE41-NEXT: ret void
	;			;
	; AVX-LABEL: @floor_8f32(			; AVX-LABEL: @floor_8f32(
	; AVX-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4			; AVX-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])			; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])
	; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4			; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; AVX-NEXT: ret void			; AVX-NEXT: ret void
	;			;
	▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: store float [[FLOOR11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4			; SSE2-NEXT: store float [[FLOOR11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
	; SSE2-NEXT: store float [[FLOOR12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4			; SSE2-NEXT: store float [[FLOOR12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
	; SSE2-NEXT: store float [[FLOOR13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4			; SSE2-NEXT: store float [[FLOOR13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
	; SSE2-NEXT: store float [[FLOOR14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4			; SSE2-NEXT: store float [[FLOOR14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
	; SSE2-NEXT: store float [[FLOOR15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4			; SSE2-NEXT: store float [[FLOOR15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @floor_16f32(			; SSE41-LABEL: @floor_16f32(
	; SSE41-NEXT: [[TMP1:%.]] = load <4 x float>, <4 x float> bitcast ([16 x float]* @src32 to <4 x float>*), align 4			; SSE41-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
	; SSE41-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4			; SSE41-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.floor.v16f32(<16 x float> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4			; SSE41-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; SSE41-NEXT: [[TMP4:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4			; SSE41-NEXT: ret void
	; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]])			;
	; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]])			; AVX-LABEL: @floor_16f32(
	; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP3]])			; AVX-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
	; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP4]])			; AVX-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.floor.v16f32(<16 x float> [[TMP1]])
	; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4			; AVX-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4			; AVX-NEXT: ret void
	; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
	; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
	; SSE41-NEXT: ret void
	;
	; AVX1-LABEL: @floor_16f32(
	; AVX1-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX1-NEXT: [[TMP2:%.]] = load <8 x float>, <8 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])
	; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP2]])
	; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX1-NEXT: ret void
	;
	; AVX2-LABEL: @floor_16f32(
	; AVX2-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX2-NEXT: [[TMP2:%.]] = load <8 x float>, <8 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])
	; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP2]])
	; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX2-NEXT: ret void
	;
	; AVX512-LABEL: @floor_16f32(
	; AVX512-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
	; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.floor.v16f32(<16 x float> [[TMP1]])
	; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; AVX512-NEXT: ret void
	;			;
	%ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4			%ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
	%ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4			%ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
	%ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4			%ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4
	%ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4			%ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4
	%ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4			%ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4
	%ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4			%ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4
	%ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4			%ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4
	▲ Show 20 Lines • Show All 108 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4			; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
	; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4			; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
	; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4			; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
	; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4			; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
	; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4			; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @nearbyint_8f32(			; SSE41-LABEL: @nearbyint_8f32(
	; SSE41-NEXT: [[TMP1:%.]] = load <4 x float>, <4 x float> bitcast ([16 x float]* @src32 to <4 x float>*), align 4			; SSE41-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; SSE41-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4			; SSE41-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]])			; SSE41-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]])
	; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
	; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
	; SSE41-NEXT: ret void			; SSE41-NEXT: ret void
	;			;
	; AVX-LABEL: @nearbyint_8f32(			; AVX-LABEL: @nearbyint_8f32(
	; AVX-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4			; AVX-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]])			; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]])
	; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4			; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; AVX-NEXT: ret void			; AVX-NEXT: ret void
	;			;
	▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: store float [[NEARBYINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4			; SSE2-NEXT: store float [[NEARBYINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
	; SSE2-NEXT: store float [[NEARBYINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4			; SSE2-NEXT: store float [[NEARBYINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
	; SSE2-NEXT: store float [[NEARBYINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4			; SSE2-NEXT: store float [[NEARBYINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
	; SSE2-NEXT: store float [[NEARBYINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4			; SSE2-NEXT: store float [[NEARBYINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
	; SSE2-NEXT: store float [[NEARBYINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4			; SSE2-NEXT: store float [[NEARBYINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @nearbyint_16f32(			; SSE41-LABEL: @nearbyint_16f32(
	; SSE41-NEXT: [[TMP1:%.]] = load <4 x float>, <4 x float> bitcast ([16 x float]* @src32 to <4 x float>*), align 4			; SSE41-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
	; SSE41-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4			; SSE41-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4			; SSE41-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; SSE41-NEXT: [[TMP4:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4			; SSE41-NEXT: ret void
	; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]])			;
	; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]])			; AVX-LABEL: @nearbyint_16f32(
	; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP3]])			; AVX-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
	; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP4]])			; AVX-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> [[TMP1]])
	; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4			; AVX-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4			; AVX-NEXT: ret void
	; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
	; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
	; SSE41-NEXT: ret void
	;
	; AVX1-LABEL: @nearbyint_16f32(
	; AVX1-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX1-NEXT: [[TMP2:%.]] = load <8 x float>, <8 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]])
	; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP2]])
	; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX1-NEXT: ret void
	;
	; AVX2-LABEL: @nearbyint_16f32(
	; AVX2-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX2-NEXT: [[TMP2:%.]] = load <8 x float>, <8 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]])
	; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP2]])
	; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX2-NEXT: ret void
	;
	; AVX512-LABEL: @nearbyint_16f32(
	; AVX512-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
	; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> [[TMP1]])
	; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; AVX512-NEXT: ret void
	;			;
	%ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4			%ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
	%ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4			%ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
	%ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4			%ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4
	%ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4			%ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4
	%ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4			%ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4
	%ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4			%ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4
	%ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4			%ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4
	▲ Show 20 Lines • Show All 108 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4			; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
	; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4			; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
	; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4			; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
	; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4			; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
	; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4			; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @rint_8f32(			; SSE41-LABEL: @rint_8f32(
	; SSE41-NEXT: [[TMP1:%.]] = load <4 x float>, <4 x float> bitcast ([16 x float]* @src32 to <4 x float>*), align 4			; SSE41-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; SSE41-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4			; SSE41-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]])			; SSE41-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]])
	; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
	; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
	; SSE41-NEXT: ret void			; SSE41-NEXT: ret void
	;			;
	; AVX-LABEL: @rint_8f32(			; AVX-LABEL: @rint_8f32(
	; AVX-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4			; AVX-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]])			; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]])
	; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4			; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; AVX-NEXT: ret void			; AVX-NEXT: ret void
	;			;
	▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: store float [[RINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4			; SSE2-NEXT: store float [[RINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
	; SSE2-NEXT: store float [[RINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4			; SSE2-NEXT: store float [[RINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
	; SSE2-NEXT: store float [[RINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4			; SSE2-NEXT: store float [[RINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
	; SSE2-NEXT: store float [[RINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4			; SSE2-NEXT: store float [[RINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
	; SSE2-NEXT: store float [[RINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4			; SSE2-NEXT: store float [[RINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @rint_16f32(			; SSE41-LABEL: @rint_16f32(
	; SSE41-NEXT: [[TMP1:%.]] = load <4 x float>, <4 x float> bitcast ([16 x float]* @src32 to <4 x float>*), align 4			; SSE41-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
	; SSE41-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4			; SSE41-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.rint.v16f32(<16 x float> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4			; SSE41-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; SSE41-NEXT: [[TMP4:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4			; SSE41-NEXT: ret void
	; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]])			;
	; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]])			; AVX-LABEL: @rint_16f32(
	; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]])			; AVX-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
	; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP4]])			; AVX-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.rint.v16f32(<16 x float> [[TMP1]])
	; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4			; AVX-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4			; AVX-NEXT: ret void
	; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
	; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
	; SSE41-NEXT: ret void
	;
	; AVX1-LABEL: @rint_16f32(
	; AVX1-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX1-NEXT: [[TMP2:%.]] = load <8 x float>, <8 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]])
	; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP2]])
	; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX1-NEXT: ret void
	;
	; AVX2-LABEL: @rint_16f32(
	; AVX2-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX2-NEXT: [[TMP2:%.]] = load <8 x float>, <8 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]])
	; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP2]])
	; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX2-NEXT: ret void
	;
	; AVX512-LABEL: @rint_16f32(
	; AVX512-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
	; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.rint.v16f32(<16 x float> [[TMP1]])
	; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; AVX512-NEXT: ret void
	;			;
	%ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4			%ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
	%ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4			%ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
	%ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4			%ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4
	%ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4			%ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4
	%ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4			%ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4
	%ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4			%ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4
	%ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4			%ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4
	▲ Show 20 Lines • Show All 108 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4			; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
	; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4			; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
	; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4			; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
	; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4			; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
	; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4			; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @trunc_8f32(			; SSE41-LABEL: @trunc_8f32(
	; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4			; SSE41-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4			; SSE41-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]])			; SSE41-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]])
	; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
	; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
	; SSE41-NEXT: ret void			; SSE41-NEXT: ret void
	;			;
	; AVX-LABEL: @trunc_8f32(			; AVX-LABEL: @trunc_8f32(
	; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4			; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]])			; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]])
	; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4			; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; AVX-NEXT: ret void			; AVX-NEXT: ret void
	;			;
	▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: store float [[TRUNC11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4			; SSE2-NEXT: store float [[TRUNC11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
	; SSE2-NEXT: store float [[TRUNC12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4			; SSE2-NEXT: store float [[TRUNC12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
	; SSE2-NEXT: store float [[TRUNC13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4			; SSE2-NEXT: store float [[TRUNC13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
	; SSE2-NEXT: store float [[TRUNC14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4			; SSE2-NEXT: store float [[TRUNC14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
	; SSE2-NEXT: store float [[TRUNC15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4			; SSE2-NEXT: store float [[TRUNC15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
	; SSE2-NEXT: ret void			; SSE2-NEXT: ret void
	;			;
	; SSE41-LABEL: @trunc_16f32(			; SSE41-LABEL: @trunc_16f32(
	; SSE41-NEXT: [[TMP1:%.]] = load <4 x float>, <4 x float> bitcast ([16 x float]* @src32 to <4 x float>*), align 4			; SSE41-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
	; SSE41-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4			; SSE41-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.trunc.v16f32(<16 x float> [[TMP1]])
	; SSE41-NEXT: [[TMP3:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4			; SSE41-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; SSE41-NEXT: [[TMP4:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4			; SSE41-NEXT: ret void
	; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]])			;
	; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]])			; AVX-LABEL: @trunc_16f32(
	; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP3]])			; AVX-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
	; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP4]])			; AVX-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.trunc.v16f32(<16 x float> [[TMP1]])
	; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4			; AVX-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4			; AVX-NEXT: ret void
	; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
	; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
	; SSE41-NEXT: ret void
	;
	; AVX1-LABEL: @trunc_16f32(
	; AVX1-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX1-NEXT: [[TMP2:%.]] = load <8 x float>, <8 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]])
	; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP2]])
	; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX1-NEXT: ret void
	;
	; AVX2-LABEL: @trunc_16f32(
	; AVX2-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
	; AVX2-NEXT: [[TMP2:%.]] = load <8 x float>, <8 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]])
	; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP2]])
	; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
	; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
	; AVX2-NEXT: ret void
	;
	; AVX512-LABEL: @trunc_16f32(
	; AVX512-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
	; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.trunc.v16f32(<16 x float> [[TMP1]])
	; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
	; AVX512-NEXT: ret void
	;			;
	%ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4			%ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
	%ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4			%ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
	%ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4			%ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4
	%ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4			%ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4
	%ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4			%ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4
	%ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4			%ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4
	%ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4			%ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4
	▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll

	Show First 20 Lines • Show All 406 Lines • ▼ Show 20 Lines
	}			}

	@ib = local_unnamed_addr global [64 x i32] [i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0], align 16			@ib = local_unnamed_addr global [64 x i32] [i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0], align 16
	@ia = common local_unnamed_addr global [64 x i32] zeroinitializer, align 16			@ia = common local_unnamed_addr global [64 x i32] zeroinitializer, align 16

	define i32 @foo1() local_unnamed_addr #0 {			define i32 @foo1() local_unnamed_addr #0 {
	; CHECK-LABEL: @foo1(			; CHECK-LABEL: @foo1(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.]] = load <4 x i32>, <4 x i32> bitcast ([64 x i32]* @ib to <4 x i32>*), align 16			; CHECK-NEXT: [[TMP0:%.]] = load <64 x i32>, <64 x i32> bitcast ([64 x i32]* @ib to <64 x i32>*), align 16
	; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i32> [[TMP0]], <i32 -1, i32 -1, i32 -1, i32 -1>			; CHECK-NEXT: [[TMP1:%.*]] = xor <64 x i32> [[TMP0]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* bitcast ([64 x i32]* @ia to <4 x i32>*), align 16			; CHECK-NEXT: store <64 x i32> [[TMP1]], <64 x i32>* bitcast ([64 x i32]* @ia to <64 x i32>*), align 16
	; CHECK-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 4) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> [[TMP2]], <i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 4) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP4:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 8) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], <i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 8) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP6:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 12) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP6]], <i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 12) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP8:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 16) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], <i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 16) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP10:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 20) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i32> [[TMP10]], <i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 20) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP12:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 24) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i32> [[TMP12]], <i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 24) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP14:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 28) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP14]], <i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 28) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP16:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 32) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP17:%.*]] = xor <4 x i32> [[TMP16]], <i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 32) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP18:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 36) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP19:%.*]] = xor <4 x i32> [[TMP18]], <i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP19]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 36) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP20:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 40) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP21:%.*]] = xor <4 x i32> [[TMP20]], <i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP21]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 40) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP22:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 44) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP23:%.*]] = xor <4 x i32> [[TMP22]], <i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP23]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 44) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP24:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 48) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP25:%.*]] = xor <4 x i32> [[TMP24]], <i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 48) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP26:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 52) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP27:%.*]] = xor <4 x i32> [[TMP26]], <i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP27]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 52) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP28:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 56) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP29:%.*]] = xor <4 x i32> [[TMP28]], <i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 56) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP30:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 60) to <4 x i32>*), align 16
	; CHECK-NEXT: [[TMP31:%.*]] = xor <4 x i32> [[TMP30]], <i32 -1, i32 -1, i32 -1, i32 -1>
	; CHECK-NEXT: store <4 x i32> [[TMP31]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 60) to <4 x i32>*), align 16
	; CHECK-NEXT: br label [[FOR_BODY5:%.*]]			; CHECK-NEXT: br label [[FOR_BODY5:%.*]]
	; CHECK: for.cond3:			; CHECK: for.cond3:
	; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV:%.*]], 1			; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV:%.*]], 1
	; CHECK-NEXT: [[CMP4:%.*]] = icmp ult i64 [[INDVARS_IV]], 63			; CHECK-NEXT: [[CMP4:%.*]] = icmp ult i64 [[INDVARS_IV]], 63
	; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY5]], label [[FOR_END14:%.*]]			; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY5]], label [[FOR_END14:%.*]]
	; CHECK: for.body5:			; CHECK: for.body5:
	; CHECK-NEXT: [[INDVARS_IV]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT]], [[FOR_COND3:%.*]] ]			; CHECK-NEXT: [[INDVARS_IV]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT]], [[FOR_COND3:%.*]] ]
	; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [64 x i32], [64 x i32]* @ia, i64 0, i64 [[INDVARS_IV]]			; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [64 x i32], [64 x i32]* @ia, i64 0, i64 [[INDVARS_IV]]
	; CHECK-NEXT: [[TMP32:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4			; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4
	; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [64 x i32], [64 x i32]* @ib, i64 0, i64 [[INDVARS_IV]]			; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [64 x i32], [64 x i32]* @ib, i64 0, i64 [[INDVARS_IV]]
	; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4			; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
	; CHECK-NEXT: [[NEG10:%.*]] = xor i32 [[TMP33]], -1			; CHECK-NEXT: [[NEG10:%.*]] = xor i32 [[TMP3]], -1
	; CHECK-NEXT: [[CMP11:%.*]] = icmp eq i32 [[TMP32]], [[NEG10]]			; CHECK-NEXT: [[CMP11:%.*]] = icmp eq i32 [[TMP2]], [[NEG10]]
	; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_COND3]], label [[IF_THEN:%.*]]			; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_COND3]], label [[IF_THEN:%.*]]
	; CHECK: if.then:			; CHECK: if.then:
	; CHECK-NEXT: tail call void @abort()			; CHECK-NEXT: tail call void @abort()
	; CHECK-NEXT: unreachable			; CHECK-NEXT: unreachable
	; CHECK: for.end14:			; CHECK: for.end14:
	; CHECK-NEXT: ret i32 0			; CHECK-NEXT: ret i32 0
	;			;
	entry:			entry:
	▲ Show 20 Lines • Show All 218 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/pr19657.ll

	Show All 14 Lines
	; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x double> [[TMP5]], [[TMP5]]			; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x double> [[TMP5]], [[TMP5]]
	; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x double> [[TMP6]], [[TMP5]]			; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x double> [[TMP6]], [[TMP5]]
	; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[X]] to <4 x double>*			; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[X]] to <4 x double>*
	; CHECK-NEXT: store <4 x double> [[TMP7]], <4 x double>* [[TMP8]], align 8			; CHECK-NEXT: store <4 x double> [[TMP7]], <4 x double>* [[TMP8]], align 8
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	; V128-LABEL: @foo(			; V128-LABEL: @foo(
	; V128-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 1			; V128-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 1
	; V128-NEXT: [[TMP2:%.*]] = bitcast double* [[X]] to <2 x double>*			; V128-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[X]], i64 2
	; V128-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8			; V128-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[X]], i64 3
	; V128-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP3]]			; V128-NEXT: [[TMP4:%.*]] = bitcast double* [[X]] to <4 x double>*
	; V128-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], [[TMP3]]			; V128-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* [[TMP4]], align 8
	; V128-NEXT: [[TMP6:%.*]] = bitcast double* [[X]] to <2 x double>*			; V128-NEXT: [[TMP6:%.*]] = fadd <4 x double> [[TMP5]], [[TMP5]]
	; V128-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8			; V128-NEXT: [[TMP7:%.*]] = fadd <4 x double> [[TMP6]], [[TMP5]]
	; V128-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[X]], i64 2			; V128-NEXT: [[TMP8:%.*]] = bitcast double* [[X]] to <4 x double>*
	; V128-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[X]], i64 3			; V128-NEXT: store <4 x double> [[TMP7]], <4 x double>* [[TMP8]], align 8
	; V128-NEXT: [[TMP9:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
	; V128-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 8
	; V128-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], [[TMP10]]
	; V128-NEXT: [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], [[TMP10]]
	; V128-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
	; V128-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8
	; V128-NEXT: ret void			; V128-NEXT: ret void
	;			;
	%1 = load double, double* %x, align 8			%1 = load double, double* %x, align 8
	%2 = fadd double %1, %1			%2 = fadd double %1, %1
	%3 = fadd double %2, %1			%3 = fadd double %2, %1
	store double %3, double* %x, align 8			store double %3, double* %x, align 8
	%4 = getelementptr inbounds double, double* %x, i64 1			%4 = getelementptr inbounds double, double* %x, i64 1
	%5 = load double, double* %4, align 8			%5 = load double, double* %4, align 8
	Show All 16 Lines

test/Transforms/SLPVectorizer/X86/pr35497.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s			; RUN: opt -slp-vectorizer -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
				RKSimon (inline comment; unsubmitted, not done): Why the duplicated -slp-vectorizer?
				ABataev (author, inline comment; unsubmitted, done): Need to run the pass twice to get the code in `pr35497` fully vectorized.

	%class.1 = type { %class.2 }			%class.1 = type { %class.2 }
	%class.2 = type { %"class.3" }			%class.2 = type { %"class.3" }
	%"class.3" = type { %"struct.1", i64 }			%"class.3" = type { %"struct.1", i64 }
	%"struct.1" = type { [8 x i64] }			%"struct.1" = type { [8 x i64] }

	$_ZN1C10SwitchModeEv = comdat any			$_ZN1C10SwitchModeEv = comdat any

	▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1			; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
	; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> undef, i64 [[TMP5]], i32 0			; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> undef, i64 [[TMP5]], i32 0
	; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1			; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
	; CHECK-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>			; CHECK-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
	; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>			; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
	; CHECK-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0			; CHECK-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
	; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*			; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
	; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1			; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
	; CHECK-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>			; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
	; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]]			; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> undef, i64 [[TMP11]], i32 0
	; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*			; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP5]], i32 1
	; CHECK-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1			; CHECK-NEXT: [[TMP14:%.*]] = lshr <2 x i64> [[TMP13]], <i64 6, i64 6>
				; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP14]]
				; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
				; CHECK-NEXT: store <2 x i64> [[TMP15]], <2 x i64>* [[TMP16]], align 1
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	entry:			entry:
	%0 = load i64, i64* undef, align 1			%0 = load i64, i64* undef, align 1
	%and = shl i64 %0, 2			%and = shl i64 %0, 2
	%shl = and i64 %and, 20			%shl = and i64 %and, 20
	%add = add i64 undef, undef			%add = add i64 undef, undef
	store i64 %add, i64* undef, align 1			store i64 %add, i64* undef, align 1
	Show All 24 Lines

test/Transforms/SLPVectorizer/X86/shift-ashr.ll

Show First 20 Lines • Show All 86 Lines • ▼ Show 20 Lines
; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8		; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; AVX1-NEXT: ret void		; AVX1-NEXT: ret void
;		;
; AVX2-LABEL: @ashr_v8i64(		; AVX2-LABEL: @ashr_v8i64(
; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP3:%.*]] = ashr <8 x i64> [[TMP1]], [[TMP2]]
; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]]
; AVX2-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]]
; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX2-NEXT: ret void		; AVX2-NEXT: ret void
;		;
; AVX512-LABEL: @ashr_v8i64(		; AVX512-LABEL: @ashr_v8i64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP3:%.*]] = ashr <8 x i64> [[TMP1]], [[TMP2]]		; AVX512-NEXT: [[TMP3:%.*]] = ashr <8 x i64> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8		; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; XOP-LABEL: @ashr_v8i64(		; XOP-LABEL: @ashr_v8i64(
; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; XOP-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; XOP-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; XOP-NEXT: [[TMP3:%.*]] = ashr <8 x i64> [[TMP1]], [[TMP2]]
; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; XOP-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; XOP-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]]
; XOP-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]]
; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; XOP-NEXT: ret void		; XOP-NEXT: ret void
;		;
%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8		%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8		%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8		%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8		%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8		%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8		%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
▲ Show 20 Lines • Show All 109 Lines • ▼ Show 20 Lines
; AVX1-NEXT: [[TMP12:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP8]]		; AVX1-NEXT: [[TMP12:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP8]]
; AVX1-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4		; AVX1-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; AVX1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4		; AVX1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; AVX1-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4		; AVX1-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4		; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; AVX1-NEXT: ret void		; AVX1-NEXT: ret void
;		;
; AVX2-LABEL: @ashr_v16i32(		; AVX2-LABEL: @ashr_v16i32(
; AVX2-NEXT: [[TMP1:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4		; AVX2-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX2-NEXT: [[TMP2:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4		; AVX2-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX2-NEXT: [[TMP3:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4		; AVX2-NEXT: [[TMP3:%.*]] = ashr <16 x i32> [[TMP1]], [[TMP2]]
; AVX2-NEXT: [[TMP4:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4		; AVX2-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX2-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]]
; AVX2-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]]
; AVX2-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; AVX2-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX2-NEXT: ret void		; AVX2-NEXT: ret void
;		;
; AVX512-LABEL: @ashr_v16i32(		; AVX512-LABEL: @ashr_v16i32(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4		; AVX512-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4		; AVX512-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP3:%.*]] = ashr <16 x i32> [[TMP1]], [[TMP2]]		; AVX512-NEXT: [[TMP3:%.*]] = ashr <16 x i32> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4		; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; XOP-LABEL: @ashr_v16i32(		; XOP-LABEL: @ashr_v16i32(
; XOP-NEXT: [[TMP1:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4		; XOP-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; XOP-NEXT: [[TMP2:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4		; XOP-NEXT: [[TMP2:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; XOP-NEXT: [[TMP3:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4		; XOP-NEXT: [[TMP3:%.*]] = ashr <16 x i32> [[TMP1]], [[TMP2]]
; XOP-NEXT: [[TMP4:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4		; XOP-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; XOP-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]]
; XOP-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]]
; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; XOP-NEXT: ret void		; XOP-NEXT: ret void
;		;
%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4		%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4		%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4		%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4		%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4		%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4		%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
▲ Show 20 Lines • Show All 186 Lines • ▼ Show 20 Lines
; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2		; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2		; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2		; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2		; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2		; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; AVX-LABEL: @ashr_v32i16(		; AVX-LABEL: @ashr_v32i16(
; AVX-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2		; AVX-NEXT: [[TMP1:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2		; AVX-NEXT: [[TMP2:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; AVX-NEXT: [[TMP3:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2		; AVX-NEXT: [[TMP3:%.*]] = ashr <32 x i16> [[TMP1]], [[TMP2]]
; AVX-NEXT: [[TMP4:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2		; AVX-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; AVX-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: ret void		; AVX-NEXT: ret void
;		;
; AVX512-LABEL: @ashr_v32i16(		; AVX512-LABEL: @ashr_v32i16(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2		; AVX512-NEXT: [[TMP1:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2		; AVX512-NEXT: [[TMP2:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2		; AVX512-NEXT: [[TMP3:%.*]] = ashr <32 x i16> [[TMP1]], [[TMP2]]
; AVX512-NEXT: [[TMP4:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2		; AVX512-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; AVX512-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
; AVX512-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; XOP-LABEL: @ashr_v32i16(		; XOP-LABEL: @ashr_v32i16(
; XOP-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2		; XOP-NEXT: [[TMP1:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; XOP-NEXT: [[TMP2:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2		; XOP-NEXT: [[TMP2:%.]] = load <32 x i16>, <32 x i16> bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; XOP-NEXT: [[TMP3:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2		; XOP-NEXT: [[TMP3:%.*]] = ashr <32 x i16> [[TMP1]], [[TMP2]]
; XOP-NEXT: [[TMP4:%.]] = load <16 x i16>, <16 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2		; XOP-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; XOP-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
; XOP-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; XOP-NEXT: ret void		; XOP-NEXT: ret void
;		;
%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2		%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2		%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2		%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2		%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2		%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2		%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
▲ Show 20 Lines • Show All 119 Lines • ▼ Show 20 Lines	;
store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2		store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2		store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2		store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
ret void		ret void
}		}

define void @ashr_v64i8() {		define void @ashr_v64i8() {
; CHECK-LABEL: @ashr_v64i8(		; CHECK-LABEL: @ashr_v64i8(
; CHECK-NEXT: [[TMP1:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.]] = load <64 x i8>, <64 x i8> bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP3:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP3:%.*]] = ashr <64 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1		; CHECK-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP5:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP6:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP7:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP8:%.]] = load <16 x i8>, <16 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]]
; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1		%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1		%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1		%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1		%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1		%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1		%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
▲ Show 20 Lines • Show All 252 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/shift-lshr.ll

Show All 16 Lines
@b16 = common global [32 x i16] zeroinitializer, align 64		@b16 = common global [32 x i16] zeroinitializer, align 64
@c16 = common global [32 x i16] zeroinitializer, align 64		@c16 = common global [32 x i16] zeroinitializer, align 64
@a8 = common global [64 x i8] zeroinitializer, align 64		@a8 = common global [64 x i8] zeroinitializer, align 64
@b8 = common global [64 x i8] zeroinitializer, align 64		@b8 = common global [64 x i8] zeroinitializer, align 64
@c8 = common global [64 x i8] zeroinitializer, align 64		@c8 = common global [64 x i8] zeroinitializer, align 64

define void @lshr_v8i64() {		define void @lshr_v8i64() {
; SSE-LABEL: @lshr_v8i64(		; SSE-LABEL: @lshr_v8i64(
; SSE-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8		; SSE-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; SSE-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8		; SSE-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; SSE-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8		; SSE-NEXT: [[TMP3:%.*]] = lshr <8 x i64> [[TMP1]], [[TMP2]]
; SSE-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8		; SSE-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; SSE-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
; SSE-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
; SSE-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
; SSE-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
; SSE-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]]
; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]]
; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]]
; SSE-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]]
; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; AVX1-LABEL: @lshr_v8i64(		; AVX1-LABEL: @lshr_v8i64(
; AVX1-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP1:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP2:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP4:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP5:%.]] = load <2 x i64>, <2 x i64> bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP6:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP7:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP8:%.]] = load <2 x i64>, <2 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]]		; AVX1-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]]
; AVX1-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]]		; AVX1-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]]
; AVX1-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]]		; AVX1-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]]
; AVX1-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]]		; AVX1-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]]
; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; AVX1-NEXT: ret void		; AVX1-NEXT: ret void
;		;
; AVX2-LABEL: @lshr_v8i64(		; AVX2-LABEL: @lshr_v8i64(
; AVX2-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP3:%.*]] = lshr <8 x i64> [[TMP1]], [[TMP2]]
; AVX2-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]]
; AVX2-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]]
; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX2-NEXT: ret void		; AVX2-NEXT: ret void
;		;
; AVX512-LABEL: @lshr_v8i64(		; AVX512-LABEL: @lshr_v8i64(
; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP3:%.*]] = lshr <8 x i64> [[TMP1]], [[TMP2]]		; AVX512-NEXT: [[TMP3:%.*]] = lshr <8 x i64> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8		; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; XOP-LABEL: @lshr_v8i64(		; XOP-LABEL: @lshr_v8i64(
; XOP-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; XOP-NEXT: [[TMP1:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; XOP-NEXT: [[TMP2:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; XOP-NEXT: [[TMP2:%.]] = load <8 x i64>, <8 x i64> bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; XOP-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; XOP-NEXT: [[TMP3:%.*]] = lshr <8 x i64> [[TMP1]], [[TMP2]]
; XOP-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; XOP-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; XOP-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]]
; XOP-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]]
; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; XOP-NEXT: ret void		; XOP-NEXT: ret void
;		;
%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8		%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8		%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8		%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8		%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8		%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8		%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
▲ Show 20 Lines • Show All 90 Lines • ▼ Show 20 Lines
; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4		; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4		; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4		; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4		; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4		; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; AVX-LABEL: @lshr_v16i32(		; AVX-LABEL: @lshr_v16i32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4		; AVX-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4		; AVX-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4		; AVX-NEXT: [[TMP3:%.*]] = lshr <16 x i32> [[TMP1]], [[TMP2]]
; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4		; AVX-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]]
; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: ret void		; AVX-NEXT: ret void
;		;
; AVX512-LABEL: @lshr_v16i32(		; AVX512-LABEL: @lshr_v16i32(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4		; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4		; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP3:%.*]] = lshr <16 x i32> [[TMP1]], [[TMP2]]		; AVX512-NEXT: [[TMP3:%.*]] = lshr <16 x i32> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4		; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; XOP-LABEL: @lshr_v16i32(		; XOP-LABEL: @lshr_v16i32(
; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4		; XOP-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4		; XOP-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4		; XOP-NEXT: [[TMP3:%.*]] = lshr <16 x i32> [[TMP1]], [[TMP2]]
; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4		; XOP-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; XOP-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]]
; XOP-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]]
; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; XOP-NEXT: ret void		; XOP-NEXT: ret void
;		;
%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4		%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4		%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4		%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4		%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4		%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4		%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
▲ Show 20 Lines • Show All 186 Lines • ▼ Show 20 Lines
; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2		; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2		; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2		; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2		; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2		; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; AVX-LABEL: @lshr_v32i16(		; AVX-LABEL: @lshr_v32i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2		; AVX-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2		; AVX-NEXT: [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2		; AVX-NEXT: [[TMP3:%.*]] = lshr <32 x i16> [[TMP1]], [[TMP2]]
; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2		; AVX-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; AVX-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]]
; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: ret void		; AVX-NEXT: ret void
;		;
; AVX512-LABEL: @lshr_v32i16(		; AVX512-LABEL: @lshr_v32i16(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2		; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2		; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2		; AVX512-NEXT: [[TMP3:%.*]] = lshr <32 x i16> [[TMP1]], [[TMP2]]
; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2		; AVX512-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; AVX512-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]]
; AVX512-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]]
; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; XOP-LABEL: @lshr_v32i16(		; XOP-LABEL: @lshr_v32i16(
; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2		; XOP-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2		; XOP-NEXT: [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2		; XOP-NEXT: [[TMP3:%.*]] = lshr <32 x i16> [[TMP1]], [[TMP2]]
; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2		; XOP-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; XOP-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]]
; XOP-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]]
; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; XOP-NEXT: ret void		; XOP-NEXT: ret void
;		;
%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2		%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2		%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2		%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2		%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2		%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2		%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
▲ Show 20 Lines • Show All 119 Lines • ▼ Show 20 Lines	;
store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2		store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2		store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2		store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
ret void		ret void
}		}

define void @lshr_v64i8() {		define void @lshr_v64i8() {
; CHECK-LABEL: @lshr_v64i8(		; CHECK-LABEL: @lshr_v64i8(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP3:%.*]] = lshr <64 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1		; CHECK-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP9:%.*]] = lshr <16 x i8> [[TMP1]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = lshr <16 x i8> [[TMP2]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = lshr <16 x i8> [[TMP3]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = lshr <16 x i8> [[TMP4]], [[TMP8]]
; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1		%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1		%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1		%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1		%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1		%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1		%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
▲ Show 20 Lines • Show All 252 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/shift-shl.ll

Show All 16 Lines
@b16 = common global [32 x i16] zeroinitializer, align 64		@b16 = common global [32 x i16] zeroinitializer, align 64
@c16 = common global [32 x i16] zeroinitializer, align 64		@c16 = common global [32 x i16] zeroinitializer, align 64
@a8 = common global [64 x i8] zeroinitializer, align 64		@a8 = common global [64 x i8] zeroinitializer, align 64
@b8 = common global [64 x i8] zeroinitializer, align 64		@b8 = common global [64 x i8] zeroinitializer, align 64
@c8 = common global [64 x i8] zeroinitializer, align 64		@c8 = common global [64 x i8] zeroinitializer, align 64

define void @shl_v8i64() {		define void @shl_v8i64() {
; SSE-LABEL: @shl_v8i64(		; SSE-LABEL: @shl_v8i64(
; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8		; SSE-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8		; SSE-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8		; SSE-NEXT: [[TMP3:%.*]] = shl <8 x i64> [[TMP1]], [[TMP2]]
; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8		; SSE-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]]
; SSE-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]]
; SSE-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]]
; SSE-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]]
; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; AVX1-LABEL: @shl_v8i64(		; AVX1-LABEL: @shl_v8i64(
; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8		; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
; AVX1-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]]		; AVX1-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]]
; AVX1-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]]		; AVX1-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]]
; AVX1-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]]		; AVX1-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]]
; AVX1-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]]		; AVX1-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]]
; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8		; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; AVX1-NEXT: ret void		; AVX1-NEXT: ret void
;		;
; AVX2-LABEL: @shl_v8i64(		; AVX2-LABEL: @shl_v8i64(
; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; AVX2-NEXT: [[TMP3:%.*]] = shl <8 x i64> [[TMP1]], [[TMP2]]
; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; AVX2-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX2-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
; AVX2-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]]
; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; AVX2-NEXT: ret void		; AVX2-NEXT: ret void
;		;
; AVX512-LABEL: @shl_v8i64(		; AVX512-LABEL: @shl_v8i64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8		; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; AVX512-NEXT: [[TMP3:%.*]] = shl <8 x i64> [[TMP1]], [[TMP2]]		; AVX512-NEXT: [[TMP3:%.*]] = shl <8 x i64> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8		; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; XOP-LABEL: @shl_v8i64(		; XOP-LABEL: @shl_v8i64(
; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8		; XOP-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8		; XOP-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8		; XOP-NEXT: [[TMP3:%.*]] = shl <8 x i64> [[TMP1]], [[TMP2]]
; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8		; XOP-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
; XOP-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
; XOP-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]]
; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
; XOP-NEXT: ret void		; XOP-NEXT: ret void
;		;
%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8		%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8		%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8		%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8		%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8		%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8		%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
Show All 22 Lines	;
store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
ret void		ret void
}		}

define void @shl_v16i32() {		define void @shl_v16i32() {
; SSE-LABEL: @shl_v16i32(		; CHECK-LABEL: @shl_v16i32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4		; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i32> [[TMP1]], [[TMP2]]
; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4		; CHECK-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP1]], [[TMP5]]
; SSE-NEXT: [[TMP10:%.*]] = shl <4 x i32> [[TMP2]], [[TMP6]]
; SSE-NEXT: [[TMP11:%.*]] = shl <4 x i32> [[TMP3]], [[TMP7]]
; SSE-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP4]], [[TMP8]]
; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @shl_v16i32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]]
; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; AVX-NEXT: ret void
;
; AVX512-LABEL: @shl_v16i32(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
; AVX512-NEXT: [[TMP3:%.*]] = shl <16 x i32> [[TMP1]], [[TMP2]]
; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
; AVX512-NEXT: ret void
;
; XOP-LABEL: @shl_v16i32(
; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
; XOP-NEXT: [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]]
; XOP-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]]
; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
; XOP-NEXT: ret void
;		;
%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4		%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4		%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4		%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4		%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4		%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4		%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4		%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
▲ Show 20 Lines • Show All 185 Lines • ▼ Show 20 Lines
; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2		; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2		; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2		; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2		; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2		; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; AVX-LABEL: @shl_v32i16(		; AVX-LABEL: @shl_v32i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2		; AVX-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2		; AVX-NEXT: [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2		; AVX-NEXT: [[TMP3:%.*]] = shl <32 x i16> [[TMP1]], [[TMP2]]
; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2		; AVX-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; AVX-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]]
; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX-NEXT: ret void		; AVX-NEXT: ret void
;		;
; AVX512-LABEL: @shl_v32i16(		; AVX512-LABEL: @shl_v32i16(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2		; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2		; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2		; AVX512-NEXT: [[TMP3:%.*]] = shl <32 x i16> [[TMP1]], [[TMP2]]
; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2		; AVX512-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; AVX512-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]]
; AVX512-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]]
; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; XOP-LABEL: @shl_v32i16(		; XOP-LABEL: @shl_v32i16(
; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2		; XOP-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2
; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2		; XOP-NEXT: [[TMP2:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @b16 to <32 x i16>*), align 2
; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2		; XOP-NEXT: [[TMP3:%.*]] = shl <32 x i16> [[TMP1]], [[TMP2]]
; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2		; XOP-NEXT: store <32 x i16> [[TMP3]], <32 x i16>* bitcast ([32 x i16]* @c16 to <32 x i16>*), align 2
; XOP-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]]
; XOP-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]]
; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
; XOP-NEXT: ret void		; XOP-NEXT: ret void
;		;
%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2		%a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2		%a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2		%a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2		%a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2		%a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2		%a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
▲ Show 20 Lines • Show All 119 Lines • ▼ Show 20 Lines	;
store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2		store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2		store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2		store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
ret void		ret void
}		}

define void @shl_v64i8() {		define void @shl_v64i8() {
; CHECK-LABEL: @shl_v64i8(		; CHECK-LABEL: @shl_v64i8(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @b8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1		; CHECK-NEXT: [[TMP3:%.*]] = shl <64 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1		; CHECK-NEXT: store <64 x i8> [[TMP3]], <64 x i8>* bitcast ([64 x i8]* @c8 to <64 x i8>*), align 1
; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: [[TMP9:%.*]] = shl <16 x i8> [[TMP1]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = shl <16 x i8> [[TMP2]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = shl <16 x i8> [[TMP3]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = shl <16 x i8> [[TMP4]], [[TMP8]]
; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1		%a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1		%a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1		%a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1		%a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1		%a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1		%a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
▲ Show 20 Lines • Show All 252 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/sitofp.ll

Show First 20 Lines • Show All 174 Lines • ▼ Show 20 Lines
;		;
; AVX512-LABEL: @sitofp_8i64_8f64(		; AVX512-LABEL: @sitofp_8i64_8f64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64		; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double>		; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double>
; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64		; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX512-NEXT: ret void		; AVX512-NEXT: ret void
;		;
; AVX256DQ-LABEL: @sitofp_8i64_8f64(		; AVX256DQ-LABEL: @sitofp_8i64_8f64(
; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64		; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32		; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double>
; AVX256DQ-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>		; AVX256DQ-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX256DQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double>
; AVX256DQ-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX256DQ-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
; AVX256DQ-NEXT: ret void		; AVX256DQ-NEXT: ret void
;		;
%ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64		%ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
%ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8		%ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
%ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16		%ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
%ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8		%ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
%ld4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32		%ld4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
%ld5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8		%ld5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
▲ Show 20 Lines • Show All 97 Lines • ▼ Show 20 Lines
; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16		; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8		; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32		; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8		; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16		; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8		; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; AVX256-LABEL: @sitofp_8i32_8f64(		; AVX-LABEL: @sitofp_8i32_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64		; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16		; AVX-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x double>
; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double>		; AVX-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x double>		; AVX-NEXT: ret void
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @sitofp_8i32_8f64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x double>
; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX512-NEXT: ret void
;		;
%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64		%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4		%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
%ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8		%ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
%ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4		%ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
%ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16		%ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
%ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4		%ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
%ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8		%ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines
; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16		; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8		; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32		; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8		; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16		; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8		; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; AVX256-LABEL: @sitofp_8i16_8f64(		; AVX-LABEL: @sitofp_8i16_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64		; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8		; AVX-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x double>
; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double>		; AVX-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x double>		; AVX-NEXT: ret void
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @sitofp_8i16_8f64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x double>
; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX512-NEXT: ret void
;		;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64		%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2		%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
%ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4		%ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
%ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2		%ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
%ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8		%ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
%ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2		%ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
%ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4		%ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines
; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16		; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8		; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32		; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8		; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16		; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8		; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; AVX256-LABEL: @sitofp_8i8_8f64(		; AVX-LABEL: @sitofp_8i8_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64		; AVX-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4		; AVX-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x double>
; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double>		; AVX-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x double>		; AVX-NEXT: ret void
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @sitofp_8i8_8f64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x double>
; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX512-NEXT: ret void
;		;
%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64		%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1		%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
%ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2		%ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
%ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1		%ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
%ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4		%ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
%ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1		%ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
%ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2		%ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
▲ Show 20 Lines • Show All 210 Lines • ▼ Show 20 Lines	;
store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64		store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4		store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8		store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4		store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
ret void		ret void
}		}

define void @sitofp_8i32_8f32() #0 {		define void @sitofp_8i32_8f32() #0 {
; SSE-LABEL: @sitofp_8i32_8f32(		; CHECK-LABEL: @sitofp_8i32_8f32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64		; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16		; CHECK-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float>
; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>		; CHECK-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>		; CHECK-NEXT: ret void
; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
; AVX-LABEL: @sitofp_8i32_8f32(
; AVX-NEXT: [[TMP1:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
; AVX-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float>
; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX-NEXT: ret void
;		;
%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64		%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4		%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
%ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8		%ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
%ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4		%ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
%ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16		%ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
%ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4		%ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
%ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8		%ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
Show All 13 Lines	;
store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16		store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4		store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8		store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4		store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
ret void		ret void
}		}

define void @sitofp_16i32_16f32() #0 {		define void @sitofp_16i32_16f32() #0 {
; SSE-LABEL: @sitofp_16i32_16f32(		; CHECK-LABEL: @sitofp_16i32_16f32(
; SSE-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64		; CHECK-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64
; SSE-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16		; CHECK-NEXT: [[TMP2:%.*]] = sitofp <16 x i32> [[TMP1]] to <16 x float>
; SSE-NEXT: [[TMP3:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32		; CHECK-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
; SSE-NEXT: [[TMP4:%.]] = load <4 x i32>, <4 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>
; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float>
; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float>
; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
; AVX256-LABEL: @sitofp_16i32_16f32(
; AVX256-NEXT: [[TMP1:%.]] = load <8 x i32>, <8 x i32> bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
; AVX256-NEXT: [[TMP2:%.]] = load <8 x i32>, <8 x i32> bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float>
; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[TMP2]] to <8 x float>
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @sitofp_16i32_16f32(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i32>, <16 x i32> bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i32> [[TMP1]] to <16 x float>
; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
; AVX512-NEXT: ret void
;		;
%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0 ), align 64		%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0 ), align 64
%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1 ), align 4		%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1 ), align 4
%ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2 ), align 8		%ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2 ), align 8
%ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3 ), align 4		%ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3 ), align 4
%ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4 ), align 16		%ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4 ), align 16
%ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5 ), align 4		%ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5 ), align 4
%ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6 ), align 8		%ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6 ), align 8
▲ Show 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	;
store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64		store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4		store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8		store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4		store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
ret void		ret void
}		}

define void @sitofp_8i16_8f32() #0 {		define void @sitofp_8i16_8f32() #0 {
; SSE-LABEL: @sitofp_8i16_8f32(		; CHECK-LABEL: @sitofp_8i16_8f32(
; SSE-NEXT: [[LD0:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64		; CHECK-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
; SSE-NEXT: [[LD1:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2		; CHECK-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float>
; SSE-NEXT: [[LD2:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4		; CHECK-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; SSE-NEXT: [[LD3:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2		; CHECK-NEXT: ret void
; SSE-NEXT: [[LD4:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
; SSE-NEXT: [[LD5:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
; SSE-NEXT: [[LD6:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
; SSE-NEXT: [[LD7:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float
; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float
; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float
; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float
; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @sitofp_8i16_8f32(
; AVX-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
; AVX-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float>
; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX-NEXT: ret void
;		;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64		%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2		%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
%ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4		%ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
%ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2		%ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
%ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8		%ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
%ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2		%ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
%ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4		%ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
Show All 13 Lines	;
store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16		store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4		store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8		store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4		store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
ret void		ret void
}		}

define void @sitofp_16i16_16f32() #0 {		define void @sitofp_16i16_16f32() #0 {
; SSE-LABEL: @sitofp_16i16_16f32(		; CHECK-LABEL: @sitofp_16i16_16f32(
; SSE-NEXT: [[LD0:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64		; CHECK-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64
; SSE-NEXT: [[LD1:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2		; CHECK-NEXT: [[TMP2:%.*]] = sitofp <16 x i16> [[TMP1]] to <16 x float>
; SSE-NEXT: [[LD2:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4		; CHECK-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
; SSE-NEXT: [[LD3:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2		; CHECK-NEXT: ret void
; SSE-NEXT: [[LD4:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
; SSE-NEXT: [[LD5:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
; SSE-NEXT: [[LD6:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
; SSE-NEXT: [[LD7:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
; SSE-NEXT: [[LD8:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16
; SSE-NEXT: [[LD9:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2
; SSE-NEXT: [[LD10:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
; SSE-NEXT: [[LD11:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
; SSE-NEXT: [[LD12:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
; SSE-NEXT: [[LD13:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
; SSE-NEXT: [[LD14:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
; SSE-NEXT: [[LD15:%.]] = load i16, i16 getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float
; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float
; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float
; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float
; SSE-NEXT: [[CVT8:%.*]] = sitofp i16 [[LD8]] to float
; SSE-NEXT: [[CVT9:%.*]] = sitofp i16 [[LD9]] to float
; SSE-NEXT: [[CVT10:%.*]] = sitofp i16 [[LD10]] to float
; SSE-NEXT: [[CVT11:%.*]] = sitofp i16 [[LD11]] to float
; SSE-NEXT: [[CVT12:%.*]] = sitofp i16 [[LD12]] to float
; SSE-NEXT: [[CVT13:%.*]] = sitofp i16 [[LD13]] to float
; SSE-NEXT: [[CVT14:%.*]] = sitofp i16 [[LD14]] to float
; SSE-NEXT: [[CVT15:%.*]] = sitofp i16 [[LD15]] to float
; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
; SSE-NEXT: store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32
; SSE-NEXT: store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
; SSE-NEXT: store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
; SSE-NEXT: store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
; SSE-NEXT: store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
; SSE-NEXT: store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
; SSE-NEXT: store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
; SSE-NEXT: store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
; SSE-NEXT: ret void
;
; AVX256-LABEL: @sitofp_16i16_16f32(
; AVX256-NEXT: [[TMP1:%.]] = load <8 x i16>, <8 x i16> bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
; AVX256-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float>
; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i16> [[TMP2]] to <8 x float>
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @sitofp_16i16_16f32(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i16>, <16 x i16> bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i16> [[TMP1]] to <16 x float>
; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
; AVX512-NEXT: ret void
;		;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0 ), align 64		%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0 ), align 64
%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1 ), align 2		%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1 ), align 2
%ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2 ), align 4		%ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2 ), align 4
%ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3 ), align 2		%ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3 ), align 2
%ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4 ), align 8		%ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4 ), align 8
%ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5 ), align 2		%ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5 ), align 2
%ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6 ), align 4		%ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6 ), align 4
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	;
store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64		store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4		store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8		store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4		store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
ret void		ret void
}		}

define void @sitofp_8i8_8f32() #0 {		define void @sitofp_8i8_8f32() #0 {
; SSE-LABEL: @sitofp_8i8_8f32(		; CHECK-LABEL: @sitofp_8i8_8f32(
; SSE-NEXT: [[TMP1:%.]] = load <4 x i8>, <4 x i8> bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64		; CHECK-NEXT: [[TMP1:%.]] = load <8 x i8>, <8 x i8> bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
; SSE-NEXT: [[TMP2:%.]] = load <4 x i8>, <4 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float>
; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>		; CHECK-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float>		; CHECK-NEXT: ret void
; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
; AVX-LABEL: @sitofp_8i8_8f32(
; AVX-NEXT: [[TMP1:%.]] = load <8 x i8>, <8 x i8> bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
; AVX-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float>
; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX-NEXT: ret void
;		;
%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64		%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1		%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
%ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2		%ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
%ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1		%ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
%ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4		%ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
%ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1		%ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
%ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2		%ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
Show All 13 Lines	;
store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16		store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4		store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8		store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4		store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
ret void		ret void
}		}

define void @sitofp_16i8_16f32() #0 {		define void @sitofp_16i8_16f32() #0 {
; SSE-LABEL: @sitofp_16i8_16f32(		; CHECK-LABEL: @sitofp_16i8_16f32(
; SSE-NEXT: [[TMP1:%.]] = load <4 x i8>, <4 x i8> bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64		; CHECK-NEXT: [[TMP1:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64
; SSE-NEXT: [[TMP2:%.]] = load <4 x i8>, <4 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = sitofp <16 x i8> [[TMP1]] to <16 x float>
; SSE-NEXT: [[TMP3:%.]] = load <4 x i8>, <4 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8		; CHECK-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
; SSE-NEXT: [[TMP4:%.]] = load <4 x i8>, <4 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>
; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float>
; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float>
; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i8> [[TMP4]] to <4 x float>
; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
; AVX256-LABEL: @sitofp_16i8_16f32(
; AVX256-NEXT: [[TMP1:%.]] = load <8 x i8>, <8 x i8> bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
; AVX256-NEXT: [[TMP2:%.]] = load <8 x i8>, <8 x i8> bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float>
; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i8> [[TMP2]] to <8 x float>
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @sitofp_16i8_16f32(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x i8>, <16 x i8> bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i8> [[TMP1]] to <16 x float>
; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
; AVX512-NEXT: ret void
;		;
%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0 ), align 64		%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0 ), align 64
%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1 ), align 1		%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1 ), align 1
%ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2 ), align 2		%ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2 ), align 2
%ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3 ), align 1		%ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3 ), align 1
%ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4 ), align 4		%ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4 ), align 4
%ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5 ), align 1		%ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5 ), align 1
%ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6 ), align 2		%ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6 ), align 2
▲ Show 20 Lines • Show All 95 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/sqrt.ll

Show All 29 Lines	;
%sqrt0 = call double @llvm.sqrt.f64(double %a0)		%sqrt0 = call double @llvm.sqrt.f64(double %a0)
%sqrt1 = call double @llvm.sqrt.f64(double %a1)		%sqrt1 = call double @llvm.sqrt.f64(double %a1)
store double %sqrt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8		store double %sqrt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
store double %sqrt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8		store double %sqrt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
ret void		ret void
}		}

define void @sqrt_4f64() #0 {		define void @sqrt_4f64() #0 {
; SSE-LABEL: @sqrt_4f64(		; CHECK-LABEL: @sqrt_4f64(
; SSE-NEXT: [[TMP1:%.]] = load <2 x double>, <2 x double> bitcast ([8 x double]* @src64 to <2 x double>*), align 8		; CHECK-NEXT: [[TMP1:%.]] = load <4 x double>, <4 x double> bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; SSE-NEXT: [[TMP2:%.]] = load <2 x double>, <2 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8		; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]])
; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])		; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])		; CHECK-NEXT: ret void
; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
; SSE-NEXT: ret void
;
; AVX-LABEL: @sqrt_4f64(
; AVX-NEXT: [[TMP1:%.]] = load <4 x double>, <4 x double> bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]])
; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; AVX-NEXT: ret void
;		;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8		%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8		%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8		%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8		%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
%sqrt0 = call double @llvm.sqrt.f64(double %a0)		%sqrt0 = call double @llvm.sqrt.f64(double %a0)
%sqrt1 = call double @llvm.sqrt.f64(double %a1)		%sqrt1 = call double @llvm.sqrt.f64(double %a1)
%sqrt2 = call double @llvm.sqrt.f64(double %a2)		%sqrt2 = call double @llvm.sqrt.f64(double %a2)
%sqrt3 = call double @llvm.sqrt.f64(double %a3)		%sqrt3 = call double @llvm.sqrt.f64(double %a3)
store double %sqrt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8		store double %sqrt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
store double %sqrt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8		store double %sqrt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
store double %sqrt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8		store double %sqrt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
store double %sqrt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8		store double %sqrt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
ret void		ret void
}		}

define void @sqrt_8f64() #0 {		define void @sqrt_8f64() #0 {
; SSE-LABEL: @sqrt_8f64(		; CHECK-LABEL: @sqrt_8f64(
; SSE-NEXT: [[TMP1:%.]] = load <2 x double>, <2 x double> bitcast ([8 x double]* @src64 to <2 x double>*), align 4		; CHECK-NEXT: [[TMP1:%.]] = load <8 x double>, <8 x double> bitcast ([8 x double]* @src64 to <8 x double>*), align 4
; SSE-NEXT: [[TMP2:%.]] = load <2 x double>, <2 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[TMP1]])
; SSE-NEXT: [[TMP3:%.]] = load <2 x double>, <2 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4		; CHECK-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 4
; SSE-NEXT: [[TMP4:%.]] = load <2 x double>, <2 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])
; SSE-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP3]])
; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP4]])
; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
; SSE-NEXT: ret void
;
; AVX256-LABEL: @sqrt_8f64(
; AVX256-NEXT: [[TMP1:%.]] = load <4 x double>, <4 x double> bitcast ([8 x double]* @src64 to <4 x double>*), align 4
; AVX256-NEXT: [[TMP2:%.]] = load <4 x double>, <4 x double> bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4
; AVX256-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]])
; AVX256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP2]])
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @sqrt_8f64(
; AVX512-NEXT: [[TMP1:%.]] = load <8 x double>, <8 x double> bitcast ([8 x double]* @src64 to <8 x double>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[TMP1]])
; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 4
; AVX512-NEXT: ret void
;		;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 4		%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 4
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 4		%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 4
%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 4		%a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 4
%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 4		%a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 4
%a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 4		%a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 4
%a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 4		%a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 4
%a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 4		%a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 4
Show All 35 Lines	;
store float %sqrt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4		store float %sqrt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
store float %sqrt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4		store float %sqrt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
store float %sqrt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4		store float %sqrt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
store float %sqrt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4		store float %sqrt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
ret void		ret void
}		}

define void @sqrt_8f32() #0 {		define void @sqrt_8f32() #0 {
; SSE-LABEL: @sqrt_8f32(		; CHECK-LABEL: @sqrt_8f32(
; SSE-NEXT: [[TMP1:%.]] = load <4 x float>, <4 x float> bitcast ([16 x float]* @src32 to <4 x float>*), align 4		; CHECK-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; SSE-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]])
; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])		; CHECK-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP2]])		; CHECK-NEXT: ret void
; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @sqrt_8f32(
; AVX-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]])
; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
; AVX-NEXT: ret void
;		;
%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4		%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4		%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4		%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4		%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4		%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4		%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4		%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
Show All 13 Lines	;
store float %sqrt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4		store float %sqrt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
store float %sqrt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4		store float %sqrt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
store float %sqrt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4		store float %sqrt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
store float %sqrt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4		store float %sqrt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
ret void		ret void
}		}

define void @sqrt_16f32() #0 {		define void @sqrt_16f32() #0 {
; SSE-LABEL: @sqrt_16f32(		; CHECK-LABEL: @sqrt_16f32(
; SSE-NEXT: [[TMP1:%.]] = load <4 x float>, <4 x float> bitcast ([16 x float]* @src32 to <4 x float>*), align 4		; CHECK-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
; SSE-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[TMP1]])
; SSE-NEXT: [[TMP3:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4		; CHECK-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
; SSE-NEXT: [[TMP4:%.]] = load <4 x float>, <4 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP2]])
; SSE-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP3]])
; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP4]])
; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
; SSE-NEXT: ret void
;
; AVX256-LABEL: @sqrt_16f32(
; AVX256-NEXT: [[TMP1:%.]] = load <8 x float>, <8 x float> bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; AVX256-NEXT: [[TMP2:%.]] = load <8 x float>, <8 x float> bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
; AVX256-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]])
; AVX256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP2]])
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @sqrt_16f32(
; AVX512-NEXT: [[TMP1:%.]] = load <16 x float>, <16 x float> bitcast ([16 x float]* @src32 to <16 x float>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[TMP1]])
; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
; AVX512-NEXT: ret void
;		;
%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4		%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4		%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4		%a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4		%a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4		%a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4		%a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4		%a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines

test/Transforms/SLPVectorizer/X86/stores_vectorize.ll

	Show First 20 Lines • Show All 86 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: [[ARRAYIDX4:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 9			; CHECK-NEXT: [[ARRAYIDX4:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 9
	; CHECK-NEXT: [[ARRAYIDX6:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 6			; CHECK-NEXT: [[ARRAYIDX6:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 6
	; CHECK-NEXT: [[ARRAYIDX7:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 2			; CHECK-NEXT: [[ARRAYIDX7:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 2
	; CHECK-NEXT: [[ARRAYIDX8:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 10			; CHECK-NEXT: [[ARRAYIDX8:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 10
	; CHECK-NEXT: [[ARRAYIDX10:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 5			; CHECK-NEXT: [[ARRAYIDX10:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 5
	; CHECK-NEXT: [[ARRAYIDX11:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 3			; CHECK-NEXT: [[ARRAYIDX11:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 3
	; CHECK-NEXT: [[TMP0:%.]] = bitcast i64 [[P3]] to <4 x i64>*			; CHECK-NEXT: [[TMP0:%.]] = bitcast i64 [[P3]] to <4 x i64>*
	; CHECK-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> [[TMP0]], align 8			; CHECK-NEXT: [[TMP1:%.]] = load <4 x i64>, <4 x i64> [[TMP0]], align 8
	; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>			; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
	; CHECK-NEXT: [[ARRAYIDX12:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 11			; CHECK-NEXT: [[ARRAYIDX12:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 11
	; CHECK-NEXT: [[TMP3:%.]] = bitcast i64 [[ARRAYIDX1]] to <4 x i64>*			; CHECK-NEXT: [[TMP2:%.]] = bitcast i64 [[ARRAYIDX1]] to <4 x i64>*
	; CHECK-NEXT: [[TMP4:%.]] = load <4 x i64>, <4 x i64> [[TMP3]], align 8			; CHECK-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> [[TMP2]], align 8
	; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>			; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
	; CHECK-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP5]]			; CHECK-NEXT: [[TMP4:%.*]] = shl <4 x i64> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]]
	; CHECK-NEXT: [[ARRAYIDX14:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 4			; CHECK-NEXT: [[ARRAYIDX14:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 4
	; CHECK-NEXT: [[TMP7:%.]] = bitcast i64 [[ARRAYIDX14]] to <4 x i64>*			; CHECK-NEXT: [[TMP5:%.]] = bitcast i64 [[ARRAYIDX14]] to <4 x i64>*
	; CHECK-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* [[TMP7]], align 8			; CHECK-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[TMP5]], align 8
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	entry:			entry:
	%0 = load i64, i64* %p3, align 8			%0 = load i64, i64* %p3, align 8
	%arrayidx1 = getelementptr inbounds i64, i64* %p3, i64 8			%arrayidx1 = getelementptr inbounds i64, i64* %p3, i64 8
	%1 = load i64, i64* %arrayidx1, align 8			%1 = load i64, i64* %arrayidx1, align 8
	%shl = shl i64 %0, %1			%shl = shl i64 %0, %1
	%arrayidx2 = getelementptr inbounds i64, i64* %p3, i64 7			%arrayidx2 = getelementptr inbounds i64, i64* %p3, i64 7
	Show All 30 Lines
	; CHECK-NEXT: [[ADD_PTR:%.]] = getelementptr inbounds float, float [[P1:%.*]], i64 [[IDX_EXT]]			; CHECK-NEXT: [[ADD_PTR:%.]] = getelementptr inbounds float, float [[P1:%.*]], i64 [[IDX_EXT]]
	; CHECK-NEXT: [[ARRAYIDX1:%.]] = getelementptr inbounds float, float [[ADD_PTR]], i64 5			; CHECK-NEXT: [[ARRAYIDX1:%.]] = getelementptr inbounds float, float [[ADD_PTR]], i64 5
	; CHECK-NEXT: [[TMP0:%.]] = load float, float [[ARRAYIDX1]], align 4			; CHECK-NEXT: [[TMP0:%.]] = load float, float [[ARRAYIDX1]], align 4
	; CHECK-NEXT: [[ARRAYIDX2:%.]] = getelementptr inbounds float, float [[P4:%.*]], i64 3			; CHECK-NEXT: [[ARRAYIDX2:%.]] = getelementptr inbounds float, float [[P4:%.*]], i64 3
	; CHECK-NEXT: [[TMP1:%.]] = load float, float [[ARRAYIDX2]], align 4			; CHECK-NEXT: [[TMP1:%.]] = load float, float [[ARRAYIDX2]], align 4
	; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]			; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
	; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4			; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4
	; CHECK-NEXT: [[ARRAYIDX4:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 1			; CHECK-NEXT: [[ARRAYIDX4:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 1
	; CHECK-NEXT: [[TMP2:%.]] = bitcast i64 [[P3]] to <2 x i64>*
	; CHECK-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> [[TMP2]], align 8
	; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], <i64 5, i64 5>
	; CHECK-NEXT: [[TMP5:%.]] = bitcast i64 [[P3]] to <2 x i64>*
	; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
	; CHECK-NEXT: [[ARRAYIDX6:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 2			; CHECK-NEXT: [[ARRAYIDX6:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 2
	; CHECK-NEXT: [[TMP6:%.]] = load i64, i64 [[ARRAYIDX6]], align 8
	; CHECK-NEXT: [[SHR7:%.*]] = lshr i64 [[TMP6]], 5
	; CHECK-NEXT: store i64 [[SHR7]], i64* [[ARRAYIDX6]], align 8
	; CHECK-NEXT: [[ARRAYIDX8:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 3			; CHECK-NEXT: [[ARRAYIDX8:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 3
	; CHECK-NEXT: [[TMP7:%.]] = load i64, i64 [[ARRAYIDX8]], align 8			; CHECK-NEXT: [[TMP2:%.]] = bitcast i64 [[P3]] to <4 x i64>*
	; CHECK-NEXT: [[SHR9:%.*]] = lshr i64 [[TMP7]], 5			; CHECK-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> [[TMP2]], align 8
				; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], <i64 5, i64 5, i64 5, i64 5>
	; CHECK-NEXT: [[ARRAYIDX9:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 5			; CHECK-NEXT: [[ARRAYIDX9:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 5
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 [[SHR9]], i64* [[ARRAYIDX8]], align 8			; CHECK-NEXT: [[TMP5:%.]] = bitcast i64 [[P3]] to <4 x i64>*
				; CHECK-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[TMP5]], align 8
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	entry:			entry:
	store i64 5, i64* %p3, align 8			store i64 5, i64* %p3, align 8
	%idx.ext = sext i32 %p2 to i64			%idx.ext = sext i32 %p2 to i64
	%add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext			%add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext
	%arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5			%arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5
	%0 = load float, float* %arrayidx1, align 4			%0 = load float, float* %arrayidx1, align 4
	▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: [[ADD_PTR:%.]] = getelementptr inbounds float, float [[P1:%.*]], i64 [[IDX_EXT]]			; CHECK-NEXT: [[ADD_PTR:%.]] = getelementptr inbounds float, float [[P1:%.*]], i64 [[IDX_EXT]]
	; CHECK-NEXT: [[ARRAYIDX1:%.]] = getelementptr inbounds float, float [[ADD_PTR]], i64 5			; CHECK-NEXT: [[ARRAYIDX1:%.]] = getelementptr inbounds float, float [[ADD_PTR]], i64 5
	; CHECK-NEXT: [[TMP0:%.]] = load float, float [[ARRAYIDX1]], align 4			; CHECK-NEXT: [[TMP0:%.]] = load float, float [[ARRAYIDX1]], align 4
	; CHECK-NEXT: [[ARRAYIDX2:%.]] = getelementptr inbounds float, float [[P4:%.*]], i64 3			; CHECK-NEXT: [[ARRAYIDX2:%.]] = getelementptr inbounds float, float [[P4:%.*]], i64 3
	; CHECK-NEXT: [[TMP1:%.]] = load float, float [[ARRAYIDX2]], align 4			; CHECK-NEXT: [[TMP1:%.]] = load float, float [[ARRAYIDX2]], align 4
	; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]			; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
	; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4			; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4
	; CHECK-NEXT: [[ARRAYIDX4:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 1			; CHECK-NEXT: [[ARRAYIDX4:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 1
	; CHECK-NEXT: [[TMP2:%.]] = bitcast i64 [[P3]] to <2 x i64>*
	; CHECK-NEXT: [[TMP3:%.]] = load <2 x i64>, <2 x i64> [[TMP2]], align 8
	; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], <i64 5, i64 5>
	; CHECK-NEXT: [[TMP5:%.]] = bitcast i64 [[P3]] to <2 x i64>*
	; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
	; CHECK-NEXT: [[ARRAYIDX6:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 2			; CHECK-NEXT: [[ARRAYIDX6:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 2
	; CHECK-NEXT: [[TMP6:%.]] = load i64, i64 [[ARRAYIDX6]], align 8
	; CHECK-NEXT: [[SHR7:%.*]] = lshr i64 [[TMP6]], 5
	; CHECK-NEXT: store i64 [[SHR7]], i64* [[ARRAYIDX6]], align 8
	; CHECK-NEXT: [[ARRAYIDX8:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 3			; CHECK-NEXT: [[ARRAYIDX8:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 3
	; CHECK-NEXT: [[TMP7:%.]] = load i64, i64 [[ARRAYIDX8]], align 8			; CHECK-NEXT: [[TMP2:%.]] = bitcast i64 [[P3]] to <4 x i64>*
	; CHECK-NEXT: [[SHR9:%.*]] = lshr i64 [[TMP7]], 5			; CHECK-NEXT: [[TMP3:%.]] = load <4 x i64>, <4 x i64> [[TMP2]], align 8
				; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], <i64 5, i64 5, i64 5, i64 5>
	; CHECK-NEXT: [[ARRAYIDX9:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 5			; CHECK-NEXT: [[ARRAYIDX9:%.]] = getelementptr inbounds i64, i64 [[P3]], i64 5
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8			; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
	; CHECK-NEXT: store i64 [[SHR9]], i64* [[ARRAYIDX8]], align 8			; CHECK-NEXT: [[TMP5:%.]] = bitcast i64 [[P3]] to <4 x i64>*
				; CHECK-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[TMP5]], align 8
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	entry:			entry:
	store i64 5, i64* %p3, align 8			store i64 5, i64* %p3, align 8
	%idx.ext = sext i32 %p2 to i64			%idx.ext = sext i32 %p2 to i64
	%add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext			%add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext
	%arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5			%arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5
	%0 = load float, float* %arrayidx1, align 4			%0 = load float, float* %arrayidx1, align 4
	Show All 39 Lines

test/Transforms/SLPVectorizer/X86/uitofp.ll

Show All 31 Lines	;
%cvt0 = uitofp i64 %ld0 to double		%cvt0 = uitofp i64 %ld0 to double
%cvt1 = uitofp i64 %ld1 to double		%cvt1 = uitofp i64 %ld1 to double
store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64		store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8		store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
ret void		ret void
}		}

define void @uitofp_4i64_4f64() #0 {		define void @uitofp_4i64_4f64() #0 {
; SSE-LABEL: @uitofp_4i64_4f64(		; CHECK-LABEL: @uitofp_4i64_4f64(
; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64		; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16		; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
; SSE-NEXT: [[TMP3:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>		; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double>		; CHECK-NEXT: ret void
; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_4i64_4f64(
; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX-NEXT: ret void
;		;
%ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64		%ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
%ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8		%ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
%ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16		%ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
%ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8		%ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
%cvt0 = uitofp i64 %ld0 to double		%cvt0 = uitofp i64 %ld0 to double
%cvt1 = uitofp i64 %ld1 to double		%cvt1 = uitofp i64 %ld1 to double
%cvt2 = uitofp i64 %ld2 to double		%cvt2 = uitofp i64 %ld2 to double
%cvt3 = uitofp i64 %ld3 to double		%cvt3 = uitofp i64 %ld3 to double
store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64		store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8		store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16		store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8		store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
ret void		ret void
}		}

define void @uitofp_8i64_8f64() #0 {		define void @uitofp_8i64_8f64() #0 {
; SSE-LABEL: @uitofp_8i64_8f64(		; CHECK-LABEL: @uitofp_8i64_8f64(
; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64		; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16		; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x double>
; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <2 x i64>*), align 32		; CHECK-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6) to <2 x i64>*), align 16		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP5:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double>
; SSE-NEXT: [[TMP7:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double>
; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i64> [[TMP4]] to <2 x double>
; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
; SSE-NEXT: ret void
;
; AVX256-LABEL: @uitofp_8i64_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x double>
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_8i64_8f64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x double>
; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX512-NEXT: ret void
;		;
%ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64		%ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
%ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8		%ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
%ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16		%ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
%ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8		%ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
%ld4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32		%ld4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
%ld5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8		%ld5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
%ld6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16		%ld6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
▲ Show 20 Lines • Show All 117 Lines • ▼ Show 20 Lines
; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16		; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8		; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32		; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8		; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16		; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8		; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; AVX256-LABEL: @uitofp_8i32_8f64(		; AVX-LABEL: @uitofp_8i32_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64		; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16		; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double>
; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double>		; AVX-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x double>		; AVX-NEXT: ret void
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_8i32_8f64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double>
; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX512-NEXT: ret void
;		;
%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64		%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4		%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
%ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8		%ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
%ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4		%ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
%ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16		%ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
%ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4		%ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
%ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8		%ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines
; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16		; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8		; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32		; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8		; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16		; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8		; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; AVX256-LABEL: @uitofp_8i16_8f64(		; AVX-LABEL: @uitofp_8i16_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64		; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8		; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x double>
; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double>		; AVX-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x double>		; AVX-NEXT: ret void
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_8i16_8f64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x double>
; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX512-NEXT: ret void
;		;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64		%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2		%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
%ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4		%ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
%ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2		%ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
%ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8		%ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
%ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2		%ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
%ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4		%ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
▲ Show 20 Lines • Show All 117 Lines • ▼ Show 20 Lines
; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16		; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8		; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32		; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8		; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16		; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8		; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
; SSE-NEXT: ret void		; SSE-NEXT: ret void
;		;
; AVX256-LABEL: @uitofp_8i8_8f64(		; AVX-LABEL: @uitofp_8i8_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64		; AVX-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4		; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x double>
; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double>		; AVX-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x double>		; AVX-NEXT: ret void
; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_8i8_8f64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x double>
; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
; AVX512-NEXT: ret void
;		;
%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64		%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1		%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
%ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2		%ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
%ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1		%ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
%ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4		%ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
%ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1		%ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
%ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2		%ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
▲ Show 20 Lines • Show All 210 Lines • ▼ Show 20 Lines	;
store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64		store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4		store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8		store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4		store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
ret void		ret void
}		}

define void @uitofp_8i32_8f32() #0 {		define void @uitofp_8i32_8f32() #0 {
; SSE-LABEL: @uitofp_8i32_8f32(		; CHECK-LABEL: @uitofp_8i32_8f32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64		; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16		; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float>
; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>		; CHECK-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float>		; CHECK-NEXT: ret void
; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_8i32_8f32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float>
; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX-NEXT: ret void
;		;
%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64		%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4		%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
%ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8		%ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
%ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4		%ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
%ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16		%ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
%ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4		%ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
%ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8		%ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
Show All 13 Lines	;
store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16		store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4		store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8		store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4		store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
ret void		ret void
}		}

define void @uitofp_16i32_16f32() #0 {		define void @uitofp_16i32_16f32() #0 {
; SSE-LABEL: @uitofp_16i32_16f32(		; CHECK-LABEL: @uitofp_16i32_16f32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64		; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64
; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16		; CHECK-NEXT: [[TMP2:%.*]] = uitofp <16 x i32> [[TMP1]] to <16 x float>
; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32		; CHECK-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float>
; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i32> [[TMP4]] to <4 x float>
; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
; AVX256-LABEL: @uitofp_16i32_16f32(
; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
; AVX256-NEXT: [[TMP3:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float>
; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i32> [[TMP2]] to <8 x float>
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_16i32_16f32(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i32> [[TMP1]] to <16 x float>
; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
; AVX512-NEXT: ret void
;		;
%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0 ), align 64		%ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0 ), align 64
%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1 ), align 4		%ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1 ), align 4
%ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2 ), align 8		%ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2 ), align 8
%ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3 ), align 4		%ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3 ), align 4
%ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4 ), align 16		%ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4 ), align 16
%ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5 ), align 4		%ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5 ), align 4
%ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6 ), align 8		%ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6 ), align 8
▲ Show 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	;
store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64		store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4		store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8		store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4		store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
ret void		ret void
}		}

define void @uitofp_8i16_8f32() #0 {		define void @uitofp_8i16_8f32() #0 {
; SSE-LABEL: @uitofp_8i16_8f32(		; CHECK-LABEL: @uitofp_8i16_8f32(
; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64		; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2		; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float>
; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4		; CHECK-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2		; CHECK-NEXT: ret void
; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float
; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float
; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float
; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float
; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[LD4]] to float
; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to float
; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to float
; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to float
; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_8i16_8f32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float>
; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX-NEXT: ret void
;		;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64		%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2		%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
%ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4		%ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
%ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2		%ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
%ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8		%ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
%ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2		%ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
%ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4		%ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
Show All 13 Lines	;
store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16		store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4		store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8		store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4		store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
ret void		ret void
}		}

define void @uitofp_16i16_16f32() #0 {		define void @uitofp_16i16_16f32() #0 {
; SSE-LABEL: @uitofp_16i16_16f32(		; CHECK-LABEL: @uitofp_16i16_16f32(
; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64		; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64
; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2		; CHECK-NEXT: [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x float>
; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4		; CHECK-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2		; CHECK-NEXT: ret void
; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
; SSE-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16
; SSE-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2
; SSE-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
; SSE-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
; SSE-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
; SSE-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
; SSE-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
; SSE-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float
; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float
; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float
; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float
; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[LD4]] to float
; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to float
; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to float
; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to float
; SSE-NEXT: [[CVT8:%.*]] = uitofp i16 [[LD8]] to float
; SSE-NEXT: [[CVT9:%.*]] = uitofp i16 [[LD9]] to float
; SSE-NEXT: [[CVT10:%.*]] = uitofp i16 [[LD10]] to float
; SSE-NEXT: [[CVT11:%.*]] = uitofp i16 [[LD11]] to float
; SSE-NEXT: [[CVT12:%.*]] = uitofp i16 [[LD12]] to float
; SSE-NEXT: [[CVT13:%.*]] = uitofp i16 [[LD13]] to float
; SSE-NEXT: [[CVT14:%.*]] = uitofp i16 [[LD14]] to float
; SSE-NEXT: [[CVT15:%.*]] = uitofp i16 [[LD15]] to float
; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
; SSE-NEXT: store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32
; SSE-NEXT: store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
; SSE-NEXT: store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
; SSE-NEXT: store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
; SSE-NEXT: store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
; SSE-NEXT: store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
; SSE-NEXT: store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
; SSE-NEXT: store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
; SSE-NEXT: ret void
;
; AVX256-LABEL: @uitofp_16i16_16f32(
; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
; AVX256-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float>
; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x float>
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_16i16_16f32(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x float>
; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
; AVX512-NEXT: ret void
;		;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0 ), align 64		%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0 ), align 64
%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1 ), align 2		%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1 ), align 2
%ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2 ), align 4		%ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2 ), align 4
%ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3 ), align 2		%ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3 ), align 2
%ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4 ), align 8		%ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4 ), align 8
%ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5 ), align 2		%ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5 ), align 2
%ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6 ), align 4		%ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6 ), align 4
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	;
store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64		store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4		store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8		store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4		store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
ret void		ret void
}		}

define void @uitofp_8i8_8f32() #0 {		define void @uitofp_8i8_8f32() #0 {
; SSE-LABEL: @uitofp_8i8_8f32(		; CHECK-LABEL: @uitofp_8i8_8f32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64		; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float>
; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>		; CHECK-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float>		; CHECK-NEXT: ret void
; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_8i8_8f32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float>
; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX-NEXT: ret void
;		;
%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64		%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1		%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
%ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2		%ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
%ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1		%ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
%ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4		%ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
%ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1		%ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
%ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2		%ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
Show All 13 Lines	;
store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16		store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4		store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8		store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4		store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
ret void		ret void
}		}

define void @uitofp_16i8_16f32() #0 {		define void @uitofp_16i8_16f32() #0 {
; SSE-LABEL: @uitofp_16i8_16f32(		; CHECK-LABEL: @uitofp_16i8_16f32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64		; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64
; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4		; CHECK-NEXT: [[TMP2:%.*]] = uitofp <16 x i8> [[TMP1]] to <16 x float>
; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8		; CHECK-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4		; CHECK-NEXT: ret void
; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float>
; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float>
; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i8> [[TMP4]] to <4 x float>
; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
; SSE-NEXT: ret void
;
; AVX256-LABEL: @uitofp_16i8_16f32(
; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
; AVX256-NEXT: [[TMP3:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float>
; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i8> [[TMP2]] to <8 x float>
; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @uitofp_16i8_16f32(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64
; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i8> [[TMP1]] to <16 x float>
; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
; AVX512-NEXT: ret void
;		;
%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0 ), align 64		%ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0 ), align 64
%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1 ), align 1		%ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1 ), align 1
%ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2 ), align 2		%ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2 ), align 2
%ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3 ), align 1		%ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3 ), align 1
%ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4 ), align 4		%ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4 ), align 4
%ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5 ), align 1		%ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5 ), align 1
%ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6 ), align 2		%ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6 ), align 2
▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[SLP] Generalization of stores vectorization. (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 221573

include/llvm/Transforms/Vectorize/SLPVectorizer.h

lib/Transforms/Vectorize/SLPVectorizer.cpp

test/Transforms/LoopVectorize/X86/metadata-enable.ll

test/Transforms/SLPVectorizer/AArch64/matmul.ll

test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll

test/Transforms/SLPVectorizer/X86/arith-add-usat.ll

test/Transforms/SLPVectorizer/X86/arith-add.ll

test/Transforms/SLPVectorizer/X86/arith-fix.ll

test/Transforms/SLPVectorizer/X86/arith-mul.ll

test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll

test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll

test/Transforms/SLPVectorizer/X86/arith-sub.ll

test/Transforms/SLPVectorizer/X86/bitreverse.ll

test/Transforms/SLPVectorizer/X86/bswap.ll

test/Transforms/SLPVectorizer/X86/cast.ll

test/Transforms/SLPVectorizer/X86/ctlz.ll

test/Transforms/SLPVectorizer/X86/ctpop.ll

test/Transforms/SLPVectorizer/X86/cttz.ll

test/Transforms/SLPVectorizer/X86/fabs.ll

test/Transforms/SLPVectorizer/X86/fcopysign.ll

test/Transforms/SLPVectorizer/X86/fma.ll

test/Transforms/SLPVectorizer/X86/fptosi.ll

test/Transforms/SLPVectorizer/X86/fptoui.ll

test/Transforms/SLPVectorizer/X86/fround.ll

test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll

test/Transforms/SLPVectorizer/X86/pr19657.ll

test/Transforms/SLPVectorizer/X86/pr35497.ll

test/Transforms/SLPVectorizer/X86/shift-ashr.ll

test/Transforms/SLPVectorizer/X86/shift-lshr.ll

test/Transforms/SLPVectorizer/X86/shift-shl.ll

test/Transforms/SLPVectorizer/X86/sitofp.ll

test/Transforms/SLPVectorizer/X86/sqrt.ll

test/Transforms/SLPVectorizer/X86/stores_vectorize.ll

test/Transforms/SLPVectorizer/X86/uitofp.ll

[SLP] Generalization of stores vectorization.
ClosedPublic