Diff 113978

lib/Target/X86/X86InterleavedAccess.cpp

Show First 20 Lines • Show All 66 Lines • ▼ Show 20 Lines	class X86InterleavedAccessGroup {
/// Out-V1 = p2, q2, r2, s2		/// Out-V1 = p2, q2, r2, s2
/// Out-V2 = p3, q3, r3, s3		/// Out-V2 = p3, q3, r3, s3
/// Out-V3 = P4, q4, r4, s4		/// Out-V3 = P4, q4, r4, s4
void transpose_4x4(ArrayRef<Instruction *> InputVectors,		void transpose_4x4(ArrayRef<Instruction *> InputVectors,
SmallVectorImpl<Value *> &TransposedMatrix);		SmallVectorImpl<Value *> &TransposedMatrix);
void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,		void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
SmallVectorImpl<Value *> &TransposedMatrix,		SmallVectorImpl<Value *> &TransposedMatrix,
unsigned NumSubVecElems);		unsigned NumSubVecElems);
		void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
		SmallVectorImpl<Value *> &TransposedMatrix,
		unsigned NumSubVecElems);

public:		public:
/// In order to form an interleaved access group X86InterleavedAccessGroup		/// In order to form an interleaved access group X86InterleavedAccessGroup
/// requires a wide-load instruction \p 'I', a group of interleaved-vectors		/// requires a wide-load instruction \p 'I', a group of interleaved-vectors
/// \p Shuffs, reference to the first indices of each interleaved-vector		/// \p Shuffs, reference to the first indices of each interleaved-vector
/// \p 'Ind' and the interleaving stride factor \p F. In order to generate		/// \p 'Ind' and the interleaving stride factor \p F. In order to generate
/// X86-specific instructions/intrinsics it also requires the underlying		/// X86-specific instructions/intrinsics it also requires the underlying
/// target information \p STarget.		/// target information \p STarget.
Show All 15 Lines
};		};
} // end anonymous namespace		} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const {		bool X86InterleavedAccessGroup::isSupported() const {
VectorType *ShuffleVecTy = Shuffles[0]->getType();		VectorType *ShuffleVecTy = Shuffles[0]->getType();
Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();		Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);		unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
unsigned WideInstSize;		unsigned WideInstSize;

// Currently, lowering is supported for the following vectors with stride 4:		// Currently, lowering is supported for the following vectors:
		guyblankUnsubmitted Done Reply Inline Actions update comment please guyblank: update comment please
		// Stride 4:
// 1. Store and load of 4-element vectors of 64 bits on AVX.		// 1. Store and load of 4-element vectors of 64 bits on AVX.
// 2. Store of 16/32-element vectors of 8 bits on AVX.		// 2. Store of 16/32-element vectors of 8 bits on AVX.
if (!Subtarget.hasAVX() \|\| Factor != 4)		// Stride 3:
		// 1. Load of 8/16/32-element vectors of 8 bits on AVX.
		if (!Subtarget.hasAVX() \|\| (Factor != 4 && Factor != 3))
return false;		return false;

if (isa<LoadInst>(Inst)) {		if (isa<LoadInst>(Inst)) {
WideInstSize = DL.getTypeSizeInBits(Inst->getType());		WideInstSize = DL.getTypeSizeInBits(Inst->getType());
} else		} else
WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());		WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

// We support shuffle represents stride 4 for byte type with size of		// We support shuffle represents stride 4 for byte type with size of
// WideInstSize.		// WideInstSize.
if (ShuffleElemSize == 64 && WideInstSize == 1024)		if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
return true;		return true;

if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) &&		if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
(WideInstSize == 256 \|\| WideInstSize == 512 \|\| WideInstSize == 1024))		(WideInstSize == 256 \|\| WideInstSize == 512 \|\| WideInstSize == 1024))
return true;		return true;

		if (ShuffleElemSize == 8 && isa<LoadInst>(Inst) && Factor == 3 &&
		guyblankUnsubmitted Done Reply Inline Actions should this be just for factor == 3 ? if so, should the code above also change to be only for factor == 4 ? guyblank: should this be just for factor == 3 ? if so, should the code above also change to be only for…
		(WideInstSize == 192 \|\| WideInstSize == 384 \|\| WideInstSize == 768))
		return true;

return false;		return false;
}		}

void X86InterleavedAccessGroup::decompose(		void X86InterleavedAccessGroup::decompose(
Instruction VecInst, unsigned NumSubVectors, VectorType SubVecTy,		Instruction VecInst, unsigned NumSubVectors, VectorType SubVecTy,
SmallVectorImpl<Instruction *> &DecomposedVectors) {		SmallVectorImpl<Instruction *> &DecomposedVectors) {

assert((isa<LoadInst>(VecInst) \|\| isa<ShuffleVectorInst>(VecInst)) &&		assert((isa<LoadInst>(VecInst) \|\| isa<ShuffleVectorInst>(VecInst)) &&
Show All 18 Lines	for (unsigned i = 0; i < NumSubVectors; ++i)
createSequentialMask(Builder, Indices[i],		createSequentialMask(Builder, Indices[i],
SubVecTy->getVectorNumElements(), 0))));		SubVecTy->getVectorNumElements(), 0))));
return;		return;
}		}

// Decompose the load instruction.		// Decompose the load instruction.
LoadInst *LI = cast<LoadInst>(VecInst);		LoadInst *LI = cast<LoadInst>(VecInst);
Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());		Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
Value *VecBasePtr =		Value *VecBasePtr;
Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);		unsigned int NumLoads = NumSubVectors;
		zviUnsubmitted Done Reply Inline Actions NumOfLoads or NumLoads zvi: NumOfLoads or NumLoads
		// In the case of stride 3 with a vector of 32 elements load the information
		guyblankUnsubmitted Done Reply Inline Actions can you explain what's happening here? guyblank: can you explain what's happening here?
		// in the following way:
		// [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
		if (DL.getTypeSizeInBits(VecTy) == 768) {
		Type *VecTran =
		VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo();
		VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran);
		NumLoads = NumSubVectors * 2;
		} else
		VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
// Generate N loads of T type.		// Generate N loads of T type.
for (unsigned i = 0; i < NumSubVectors; i++) {		for (unsigned i = 0; i < NumLoads; i++) {
// TODO: Support inbounds GEP.		// TODO: Support inbounds GEP.
Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));		Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
Instruction *NewLoad =		Instruction *NewLoad =
Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());		Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
DecomposedVectors.push_back(NewLoad);		DecomposedVectors.push_back(NewLoad);
}		}
}		}

▲ Show 20 Lines • Show All 139 Lines • ▼ Show 20 Lines	void X86InterleavedAccessGroup::interleave8bitStride4(
// cmyk24 cmyk25 cmyk26 cmyk27 \| cmyk28 cmyk29 cmyk30 cmyk31		// cmyk24 cmyk25 cmyk26 cmyk27 \| cmyk28 cmyk29 cmyk30 cmyk31

TransposedMatrix[0] = Builder.CreateShuffleVector(Low, High, ConcatLow);		TransposedMatrix[0] = Builder.CreateShuffleVector(Low, High, ConcatLow);
TransposedMatrix[1] = Builder.CreateShuffleVector(Low1, High1, ConcatLow);		TransposedMatrix[1] = Builder.CreateShuffleVector(Low1, High1, ConcatLow);
TransposedMatrix[2] = Builder.CreateShuffleVector(Low, High, ConcatHigh);		TransposedMatrix[2] = Builder.CreateShuffleVector(Low, High, ConcatHigh);
TransposedMatrix[3] = Builder.CreateShuffleVector(Low1, High1, ConcatHigh);		TransposedMatrix[3] = Builder.CreateShuffleVector(Low1, High1, ConcatHigh);
}		}

		// createShuffleStride returns shuffle mask of size N.
		// The shuffle pattern is as follows:
		// {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
		// (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
		// (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
		// Where Lane is the # of lanes in a register:
		// VectorSize = 128 => Lane = 1
		// VectorSize = 256 => Lane = 2
		// For example shuffle pattern for VF 16 register size 256 -> lanes = 2
		// {<[0\|3\|6\|1\|4\|7\|2\|5]-[8\|11\|14\|9\|12\|15\|10\|13]>}
		static void createShuffleStride(MVT VT, int Stride,
		zviUnsubmitted Done Reply Inline Actions Please check function name zvi: Please check function name
		SmallVectorImpl<uint32_t> &Mask) {
		int VectorSize = VT.getSizeInBits();
		zviUnsubmitted Not Done Reply Inline Actions On second thought no need for the max() with assert(vectorSize >= 128) zvi: On second thought no need for the max() with assert(vectorSize >= 128)
		m_zuckermanAuthorUnsubmitted Not Done Reply Inline Actions You will need it for the VF8 since the VectoreSize is 64 and 64/128 is a zero and not 1. m_zuckerman: You will need it for the VF8 since the VectoreSize is 64 and 64/128 is a zero and not 1.
		int VF = VT.getVectorNumElements();
		zviUnsubmitted Done Reply Inline Actions drop the brackets? zvi: drop the brackets?
		zviUnsubmitted Done Reply Inline Actions Maybe swap names of variables Lane and LaneCount? zvi: Maybe swap names of variables Lane and LaneCount?
		int LaneCount = std::max(VectorSize / 128, 1);
		zviUnsubmitted Done Reply Inline Actions int i = 0, e = VF / Lane; i != e; ++i zvi: int i = 0, e = VF / Lane; i != e; ++i
		zviUnsubmitted Done Reply Inline Actions these brackets can be dropped zvi: these brackets can be dropped
		for (int Lane = 0; Lane < LaneCount; Lane++)
		zviUnsubmitted Done Reply Inline Actions Extra brackets zvi: Extra brackets
		zviUnsubmitted Done Reply Inline Actions I meant these brackets: (LaneSize) zvi: I meant these brackets: (LaneSize)
		for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
		Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
		}

		zviUnsubmitted Done Reply Inline Actions Please consider something like this: // getGroupElemCount - returns the size in elements of group index 'Group' in mask 'Mask'. // A mask contains exactly 'Stride' groups, where each group is a monotonically increasing sequence with stride 'Stride'. // In these examples the interested group is marked between '[...]' // getGroupElemCount(3, 0, {0\|3\|6\|1\|4\|7\|[2\|5]}) -> 2 // getGroupElemCount(3, 1, {0\|3\|6\|[1\|4\|7]\|2\|5}) -> 3 zvi: Please consider something like this: ``` // getGroupElemCount - returns the size in elements…
		// setGroupSize sets 'SizeInfo' to the size (number of elements) of each group
		// inside a shuffle mask. A mask contains exactly 3 groups, where
		// each group is a monotonically increasing sequence with stride 3.
		// For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
		zviUnsubmitted Done Reply Inline Actions wide -> width or size zvi: wide -> width or size
		static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
		zviUnsubmitted Done Reply Inline Actions Please try to make this function more efficient. Given Stride and VF you can compute the offset of each first/last group member and compute the first/last of the successor group etc. zvi: Please try to make this function more efficient. Given Stride and VF you can compute the offset…
		int VectorSize = VT.getSizeInBits();
		zviUnsubmitted Not Done Reply Inline Actions Please check identation zvi: Please check identation
		int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
		for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
		zviUnsubmitted Done Reply Inline Actions The return value may indeed by used as a shift amount, but if this function returns the element count of the i-th group, maybe better name it something like 'getGroupElemCount' zvi: The return value may indeed by used as a shift amount, but if this function returns the element…
		zviUnsubmitted Done Reply Inline Actions No need to define GroupSize at loop scope. it can be moved to loop body. zvi: No need to define GroupSize at loop scope. it can be moved to loop body.
		int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
		zviUnsubmitted Done Reply Inline Actions ArrayRef<uint32_t> Mask zvi: ArrayRef<uint32_t> Mask
		SizeInfo.push_back(GroupSize);
		zviUnsubmitted Done Reply Inline Actions Initialize Index here zvi: Initialize Index here
		zviUnsubmitted Done Reply Inline Actions GroupWide -> GroupWidth zvi: GroupWide -> GroupWidth
		zviUnsubmitted Done Reply Inline Actions Mask.back() % Stride zvi: Mask.back() % Stride
		FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
		zviUnsubmitted Done Reply Inline Actions for (int i = 0, Count = 0, GroupNumber = 0, e = VF / LaneCount; i != e; ++i) zvi: for (int i = 0, Count = 0, GroupNumber = 0, e = VF / LaneCount; i != e; ++i)
		}
		}
		zviUnsubmitted Not Done Reply Inline Actions I might be missing something, but is group number 0 defined to be the group starting at the end of Mask? zvi: I might be missing something, but is group number 0 defined to be the group starting at the end…
		m_zuckermanAuthorUnsubmitted Not Done Reply Inline Actions Group number zero ( According to the modulo operation) will always lie at the first element of the shuffle since we begin with zero.For example, think about a vector length 8 with stride 3. The shuffle sequence will look like that [0,3,6,1,4,7,2,5] so we don't know the last group but we can know what is the first element since we begin the shuffle from zero. m_zuckerman: Group number zero ( According to the modulo operation) will always lie at the first element of…
		zviUnsubmitted Not Done Reply Inline Actions I see in the examples above that the first group consists of elements that are the last indices of Mask, and the search starts from the last index. zvi: I see in the examples above that the first group consists of elements that are the last indices…

		// DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
		// vpalignr works according to lanes
		zviUnsubmitted Not Done Reply Inline Actions Size is not really needed as you can update SizeInfo on the fly zvi: Size is not really needed as you can update SizeInfo on the fly
		// Where Lane is the # of lanes in a register:
		// VectorSize = 128 => Lane = 1
		// VectorSize = 256 => Lane = 2
		// For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
		guyblankUnsubmitted Done Reply Inline Actions typo: vpaling -> vpalign guyblank: typo: vpaling -> vpalign
		// For Lane = 2 shuffle pattern is:
		// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
		// The Imm variable sets the offset amount. The result of the
		// function is stored inside the ShuffleMask vector and is built as described
		// at the beginning of the description. AlignDirection is a boolean that indicates the
		guyblankUnsubmitted Done Reply Inline Actions should be {DiffToJump, ..., DiffToJump + VF - 1} ? guyblank: should be {DiffToJump, ..., DiffToJump + VF - 1} ?
		// direction of the alignment. (false - align to the "right" side while true -
		// align to the "left" side)
		static void DecodePALIGNRMask(MVT VT, unsigned Imm,
		SmallVectorImpl<uint32_t> &ShuffleMask,
		bool AlignDirection = true, bool Unary = false) {
		zviUnsubmitted Not Done Reply Inline Actions Please check formatting: need spaces around the '=' zvi: Please check formatting: need spaces around the '='

		guyblankUnsubmitted Done Reply Inline Actions typos, should be: AlignBegin is a boolean that indicates the direction of the alignment guyblank: typos, should be: AlignBegin is a boolean that indicates the direction of the alignment
		unsigned NumElts = VT.getVectorNumElements();
		guyblankUnsubmitted Done Reply Inline Actions typo: rigth -> right plus, on of these should be "left" and maybe something ilke AlignDirection would be a better name? guyblank: typo: rigth -> right plus, on of these should be "left" and maybe something ilke AlignDirection…
		unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
		guyblankUnsubmitted Done Reply Inline Actions are you sure you need this function? can you use DecodePALIGNRMask ? guyblank: are you sure you need this function? can you use DecodePALIGNRMask ?
		unsigned NumLaneElts = NumElts / NumLanes;

		Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
		zviUnsubmitted Not Done Reply Inline Actions static zvi: static
		unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

		zviUnsubmitted Done Reply Inline Actions Please check identation zvi: Please check identation
		for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
		zviUnsubmitted Done Reply Inline Actions Please document the arguments zvi: Please document the arguments
		for (unsigned i = 0; i != NumLaneElts; ++i) {
		zviUnsubmitted Done Reply Inline Actions std::max(VectorWidth / 128, 1) zvi: std::max(VectorWidth / 128, 1)
		unsigned Base = i + Offset;
		// if i+offset is out of this lane then we actually need the other source
		// If Unary the other source is the first source.
		if (Base >= NumLaneElts)
		zviUnsubmitted Done Reply Inline Actions identation zvi: identation
		Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
		ShuffleMask.push_back(Base + l);
		}
		}
		}

		void X86InterleavedAccessGroup::deinterleave8bitStride3(
		ArrayRef<Instruction > InVec, SmallVectorImpl<Value > &TransposedMatrix,
		unsigned VecElems) {

		// Example: Assuming we start from the following vectors:
		// Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
		// Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
		// Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7

		TransposedMatrix.resize(3);
		guyblankUnsubmitted Not Done Reply Inline Actions can this be an array? guyblank: can this be an array?
		m_zuckermanAuthorUnsubmitted Not Done Reply Inline Actions This way I think it's more readable m_zuckerman: This way I think it's more readable
		SmallVector<uint32_t, 32> Concat;
		SmallVector<uint32_t, 32> VPShuf;
		SmallVector<uint32_t, 32> VPAlign[2];
		SmallVector<uint32_t, 32> VPAlign2;
		SmallVector<uint32_t, 32> VPAlign3;
		SmallVector<uint32_t, 3> GroupSize;
		Value Vec[3], TempVector[3];

		MVT VT = MVT::getVT(Shuffles[0]->getType());
		guyblankUnsubmitted Done Reply Inline Actions ++i, below as well guyblank: ++i, below as well

		for (unsigned i = 0; i < VecElems && VecElems == 32; ++i)
		Concat.push_back(i);

		createShuffleStride(VT, 3, VPShuf);
		setGroupSize(VT, GroupSize);

		for (int i = 0; i < 2; i++)
		DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

		DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
		DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

		for (int i = 0; i < 3; i++)
		Vec[i] = VecElems == 32
		? Builder.CreateShuffleVector(InVec[i], InVec[i + 3], Concat)
		: InVec[i];
		zviUnsubmitted Not Done Reply Inline Actions So the last argument is always passed with the value true to all calls of this function? zvi: So the last argument is always passed with the value true to all calls of this function?
		m_zuckermanAuthorUnsubmitted Not Done Reply Inline Actions No, the default mode is false like row 438. m_zuckerman: No, the default mode is false like row 438.

		// Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
		// Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
		// Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

		for (int i = 0; i < 3; i++)
		Vec[i] = Builder.CreateShuffleVector(
		Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);

		// TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
		// TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
		// TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7
		zviUnsubmitted Done Reply Inline Actions You could already create the Undef value instead of only its type here. zvi: You could already create the Undef value instead of only its type here.

		for (int i = 0; i < 3; i++)
		TempVector[i] =
		Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

		// Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
		// Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
		// Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

		for (int i = 0; i < 3; i++)
		Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
		VPAlign[1]);

		// TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
		// TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
		// TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

		Value *TempVec = Builder.CreateShuffleVector(
		Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
		TransposedMatrix[0] = Builder.CreateShuffleVector(
		Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
		TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
		TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];

		return;
		zviUnsubmitted Done Reply Inline Actions The second vector argument should be an undef, and the shuffle mask should select only elements from the first vector operand. This will eventually happen in InstCombine or DAGCombine, but why not avoid this compile-time overhead? zvi: The second vector argument should be an undef, and the shuffle mask should select only elements…
		m_zuckermanAuthorUnsubmitted Not Done Reply Inline Actions I want to reuse the createAlignMask function. The function according to two vectors and not one vector and one undef. m_zuckerman: I want to reuse the createAlignMask function. The function according to two vectors and not one…
		zviUnsubmitted Not Done Reply Inline Actions You can pass an argument to DecodePALIGNRMask saying whether the resulting shuffle mask is for a single source vector or two vector sources. It just so happens that it will take the same value of AlignDirection. zvi: You can pass an argument to DecodePALIGNRMask saying whether the resulting shuffle mask is for…
		}

void X86InterleavedAccessGroup::transpose_4x4(		void X86InterleavedAccessGroup::transpose_4x4(
ArrayRef<Instruction *> Matrix,		ArrayRef<Instruction *> Matrix,
SmallVectorImpl<Value *> &TransposedMatrix) {		SmallVectorImpl<Value *> &TransposedMatrix) {
assert(Matrix.size() == 4 && "Invalid matrix size");		assert(Matrix.size() == 4 && "Invalid matrix size");
TransposedMatrix.resize(4);		TransposedMatrix.resize(4);

// dst = src1[0,1],src2[0,1]		// dst = src1[0,1],src2[0,1]
uint32_t IntMask1[] = {0, 1, 4, 5};		uint32_t IntMask1[] = {0, 1, 4, 5};
Show All 26 Lines	bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
SmallVector<Instruction *, 4> DecomposedVectors;		SmallVector<Instruction *, 4> DecomposedVectors;
SmallVector<Value *, 4> TransposedVectors;		SmallVector<Value *, 4> TransposedVectors;
VectorType *ShuffleTy = Shuffles[0]->getType();		VectorType *ShuffleTy = Shuffles[0]->getType();

if (isa<LoadInst>(Inst)) {		if (isa<LoadInst>(Inst)) {
// Try to generate target-sized register(/instruction).		// Try to generate target-sized register(/instruction).
decompose(Inst, Factor, ShuffleTy, DecomposedVectors);		decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

		Type *ShuffleEltTy = Inst->getType();
		unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
// Perform matrix-transposition in order to compute interleaved		// Perform matrix-transposition in order to compute interleaved
// results by generating some sort of (optimized) target-specific		// results by generating some sort of (optimized) target-specific
// instructions.		// instructions.

		switch (NumSubVecElems) {
		default:
		return false;
		case 4:
transpose_4x4(DecomposedVectors, TransposedVectors);		transpose_4x4(DecomposedVectors, TransposedVectors);
		break;
		case 8:
		case 16:
		case 32:
		deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
		NumSubVecElems);
		break;
		}

// Now replace the unoptimized-interleaved-vectors with the		// Now replace the unoptimized-interleaved-vectors with the
// transposed-interleaved vectors.		// transposed-interleaved vectors.
for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)		for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);		Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

return true;		return true;
}		}
▲ Show 20 Lines • Show All 82 Lines • Show Last 20 Lines

test/CodeGen/X86/x86-interleaved-access.ll

	Show First 20 Lines • Show All 959 Lines • ▼ Show 20 Lines
	%interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>			%interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>
	store <32 x i8> %interleaved.vec, <32 x i8>* %p			store <32 x i8> %interleaved.vec, <32 x i8>* %p
	ret void			ret void
	}			}

	define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){			define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
	; AVX1-LABEL: interleaved_load_vf32_i8_stride3:			; AVX1-LABEL: interleaved_load_vf32_i8_stride3:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vmovdqa (%rdi), %ymm1			; AVX1-NEXT: vmovdqa (%rdi), %xmm0
	; AVX1-NEXT: vmovdqa 32(%rdi), %ymm0			; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
	; AVX1-NEXT: vmovdqa 64(%rdi), %ymm3			; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
	; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4			; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
	; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm4[1,4,7,10,13]			; AVX1-NEXT: vmovdqa 64(%rdi), %xmm4
	; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero			; AVX1-NEXT: vmovdqa 80(%rdi), %xmm5
	; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2			; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
	; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2			; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
	; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]			; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
	; AVX1-NEXT: vandnps %ymm2, %ymm8, %ymm10			; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5			; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
	; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm5[2,5,8,11,14,u,u,u,u,u]			; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
	; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm1[u,u,u,u,u]			; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
	; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm6			; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm5[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
	; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]			; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
	; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm6			; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,4,7,10,13]			; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
	; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm7			; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm8
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6			; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[0,3,6,9,12,15,u,u,u,u,u,u,u,u,u,u]			; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
	; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2			; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm2
	; AVX1-NEXT: vandps %ymm8, %ymm2, %ymm2			; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
	; AVX1-NEXT: vorps %ymm10, %ymm2, %ymm8			; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm4[2,5,8,11,14]			; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero			; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2
	; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2			; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm5
	; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2			; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2
	; AVX1-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]			; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128]
	; AVX1-NEXT: vandnps %ymm2, %ymm10, %ymm11			; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3
	; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u]			; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm5[0,3,6,9,12,15,u,u,u,u,u]			; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm6
	; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2			; AVX1-NEXT: vpor %xmm3, %xmm6, %xmm3
	; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm2			; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
	; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,5,8,11,14]			; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
	; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
	; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[1,4,7,10,13,u,u,u,u,u,u,u,u,u,u,u]
	; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
	; AVX1-NEXT: vandps %ymm10, %ymm2, %ymm2
	; AVX1-NEXT: vorps %ymm11, %ymm2, %ymm2
	; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero
	; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,zero,zero,zero,xmm4[0,3,6,9,12,15]
	; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
	; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
	; AVX1-NEXT: vandnps %ymm3, %ymm10, %ymm3
	; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm5[1,4,7,10,13,u,u,u,u,u,u]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,5,8,11,14],zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u]
	; AVX1-NEXT: vpor %xmm4, %xmm1, %xmm1
	; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
	; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,3,6,9,12,15]
	; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0			; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
	; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u]
	; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; AVX1-NEXT: vandps %ymm10, %ymm0, %ymm0
	; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0
	; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1			; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
	; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm3
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
	; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1			; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
	; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1			; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
	; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0			; AVX1-NEXT: vpaddb %xmm9, %xmm2, %xmm2
	; AVX1-NEXT: vpaddb %xmm0, %xmm8, %xmm0			; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
	; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: interleaved_load_vf32_i8_stride3:			; AVX-LABEL: interleaved_load_vf32_i8_stride3:
	; AVX2: # BB#0:			; AVX: # BB#0:
	; AVX2-NEXT: vmovdqa (%rdi), %ymm1			; AVX-NEXT: vmovdqa (%rdi), %xmm0
	; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2			; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
	; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0			; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
	; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0>			; AVX-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
	; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm3			; AVX-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
	; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4			; AVX-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
	; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm4[2,5,8,11,14],zero,zero,zero,zero,zero			; AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
	; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]			; AVX-NEXT: vpshufb %ymm3, %ymm0, %ymm0
	; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3			; AVX-NEXT: vpshufb %ymm3, %ymm1, %ymm1
	; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,3,6,9,12,15,u,u,u,u,u,u,u,u,u,u]			; AVX-NEXT: vpshufb %ymm3, %ymm2, %ymm2
	; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4			; AVX-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
	; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3			; AVX-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
	; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]			; AVX-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
	; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero			; AVX-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
	; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5			; AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
	; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5			; AVX-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
	; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0]			; AVX-NEXT: vpaddb %ymm2, %ymm1, %ymm1
	; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4			; AVX-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
	; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = <u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255>			; AVX-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
	; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm5			; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
	; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6			; AVX-NEXT: retq
	; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm6[0,3,6,9,12,15],zero,zero,zero,zero,zero
	; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm5[2,5,8,11,14]
	; AVX2-NEXT: vpor %xmm7, %xmm5, %xmm5
	; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,4,7,10,13,u,u,u,u,u,u,u,u,u,u,u]
	; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
	; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14]
	; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero
	; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
	; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
	; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
	; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
	; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u>
	; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
	; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
	; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13],zero,zero,zero,zero,zero,zero
	; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,5,8,11,14],zero,zero,zero,zero,zero,xmm1[0,3,6,9,12,15]
	; AVX2-NEXT: vpor %xmm6, %xmm1, %xmm1
	; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u]
	; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
	; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero
	; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[0,3,6,9,12,15]
	; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
	; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
	; AVX2-NEXT: vpaddb %ymm0, %ymm5, %ymm0
	; AVX2-NEXT: vpaddb %ymm0, %ymm4, %ymm0
	; AVX2-NEXT: retq
	;
	; AVX512-LABEL: interleaved_load_vf32_i8_stride3:
	; AVX512: # BB#0:
	; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
	; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0
	; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0>
	; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2
	; AVX512-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm3
	; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
	; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm4[2,5,8,11,14],zero,zero,zero,zero,zero
	; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]
	; AVX512-NEXT: vpor %xmm5, %xmm3, %xmm3
	; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,3,6,9,12,15,u,u,u,u,u,u,u,u,u,u]
	; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4
	; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
	; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]
	; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero
	; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5
	; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
	; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0]
	; AVX512-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
	; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255>
	; AVX512-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm5
	; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
	; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm6[0,3,6,9,12,15],zero,zero,zero,zero,zero
	; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm5[2,5,8,11,14]
	; AVX512-NEXT: vpor %xmm7, %xmm5, %xmm5
	; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,4,7,10,13,u,u,u,u,u,u,u,u,u,u,u]
	; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
	; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14]
	; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero
	; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6
	; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
	; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
	; AVX512-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
	; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u>
	; AVX512-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
	; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
	; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13],zero,zero,zero,zero,zero,zero
	; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,5,8,11,14],zero,zero,zero,zero,zero,xmm1[0,3,6,9,12,15]
	; AVX512-NEXT: vpor %xmm6, %xmm1, %xmm1
	; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u]
	; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
	; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero
	; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[0,3,6,9,12,15]
	; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
	; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; AVX512-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
	; AVX512-NEXT: vpaddb %ymm0, %ymm5, %ymm0
	; AVX512-NEXT: vpaddb %ymm0, %ymm4, %ymm0
	; AVX512-NEXT: retq
	%wide.vec = load <96 x i8>, <96 x i8>* %ptr			%wide.vec = load <96 x i8>, <96 x i8>* %ptr
	%v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>			%v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>
	%v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46,i32 49,i32 52,i32 55,i32 58,i32 61,i32 64,i32 67,i32 70,i32 73,i32 76,i32 79,i32 82,i32 85,i32 88,i32 91,i32 94>			%v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46,i32 49,i32 52,i32 55,i32 58,i32 61,i32 64,i32 67,i32 70,i32 73,i32 76,i32 79,i32 82,i32 85,i32 88,i32 91,i32 94>
	%v3 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47,i32 50,i32 53,i32 56,i32 59,i32 62,i32 65,i32 68,i32 71,i32 74,i32 77,i32 80,i32 83,i32 86,i32 89,i32 92,i32 95>			%v3 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47,i32 50,i32 53,i32 56,i32 59,i32 62,i32 65,i32 68,i32 71,i32 74,i32 77,i32 80,i32 83,i32 86,i32 89,i32 92,i32 95>
	%add1 = add <32 x i8> %v1, %v2			%add1 = add <32 x i8> %v1, %v2
	%add2 = add <32 x i8> %v3, %add1			%add2 = add <32 x i8> %v3, %add1
	ret <32 x i8> %add2			ret <32 x i8> %add2
	}			}

	define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){			define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
	; AVX1-LABEL: interleaved_load_vf16_i8_stride3:			; AVX1-LABEL: interleaved_load_vf16_i8_stride3:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vmovdqa (%rdi), %ymm0			; AVX1-NEXT: vmovdqa (%rdi), %xmm0
	; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1			; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2			; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
	; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm2[2,5,8,11,14,u,u,u,u,u]			; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm0[u,u,u,u,u]			; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
	; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3			; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
	; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]			; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
	; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3			; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1,4,7,10,13]			; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
	; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3			; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u]			; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm2[0,3,6,9,12,15,u,u,u,u,u]			; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
	; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5			; AVX1-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
	; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4			; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
	; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[2,5,8,11,14]			; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero
	; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4			; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13,u,u,u,u,u,u]			; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
	; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u]			; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
	; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
	; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,3,6,9,12,15]
	; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
	; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm0
	; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0
	; AVX1-NEXT: vzeroupper
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: interleaved_load_vf16_i8_stride3:			; AVX-LABEL: interleaved_load_vf16_i8_stride3:
	; AVX2: # BB#0:			; AVX: # BB#0:
	; AVX2-NEXT: vmovdqa (%rdi), %ymm0			; AVX-NEXT: vmovdqa (%rdi), %xmm0
	; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1			; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
	; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u>			; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
	; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2			; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
	; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3			; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
	; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14],zero,zero,zero,zero,zero			; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
	; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm2[1,4,7,10,13]			; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
	; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2			; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
	; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255>			; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
	; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3			; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
	; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4			; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
	; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm4[0,3,6,9,12,15],zero,zero,zero,zero,zero			; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
	; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14]			; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
	; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3			; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
	; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u>			; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero
	; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0			; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
	; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1			; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
	; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,10,13],zero,zero,zero,zero,zero,zero			; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
	; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,zero,zero,xmm0[0,3,6,9,12,15]			; AVX-NEXT: retq
	; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
	; AVX2-NEXT: vpaddb %xmm0, %xmm3, %xmm0
	; AVX2-NEXT: vpaddb %xmm0, %xmm2, %xmm0
	; AVX2-NEXT: vzeroupper
	; AVX2-NEXT: retq
	;
	; AVX512-LABEL: interleaved_load_vf16_i8_stride3:
	; AVX512: # BB#0:
	; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
	; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u>
	; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2
	; AVX512-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm1
	; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
	; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14],zero,zero,zero,zero,zero
	; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm1[1,4,7,10,13]
	; AVX512-NEXT: vpor %xmm3, %xmm1, %xmm1
	; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255>
	; AVX512-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm3
	; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
	; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm4[0,3,6,9,12,15],zero,zero,zero,zero,zero
	; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14]
	; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3
	; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u>
	; AVX512-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
	; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
	; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13],zero,zero,zero,zero,zero,zero
	; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,zero,zero,xmm0[0,3,6,9,12,15]
	; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
	; AVX512-NEXT: vpaddb %xmm0, %xmm3, %xmm0
	; AVX512-NEXT: vpaddb %xmm0, %xmm1, %xmm0
	; AVX512-NEXT: vzeroupper
	; AVX512-NEXT: retq
	%wide.vec = load <48 x i8>, <48 x i8>* %ptr			%wide.vec = load <48 x i8>, <48 x i8>* %ptr
	%v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42 ,i32 45>			%v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42 ,i32 45>
	%v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46>			%v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46>
	%v3 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47>			%v3 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47>
	%add1 = add <16 x i8> %v1, %v2			%add1 = add <16 x i8> %v1, %v2
	%add2 = add <16 x i8> %v3, %add1			%add2 = add <16 x i8> %v3, %add1
	ret <16 x i8> %add2			ret <16 x i8> %add2
	}			}

	define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){			define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
	; AVX1-LABEL: interleaved_load_vf8_i8_stride3:			; AVX1-LABEL: interleaved_load_vf8_i8_stride3:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vmovdqa (%rdi), %ymm0			; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1			; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
	; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]			; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
	; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]			; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,3,255,6,255,1,255,4,255,7,255,2,255,5,255]
	; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2			; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
	; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]			; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
	; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]			; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
	; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3			; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]			; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]			; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
	; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0			; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
	; AVX1-NEXT: vpaddw %xmm0, %xmm3, %xmm0			; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5,6,7]
	; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0			; AVX1-NEXT: vpaddw %xmm1, %xmm2, %xmm1
	; AVX1-NEXT: vzeroupper			; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
				; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
				; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX-LABEL: interleaved_load_vf8_i8_stride3:			; AVX-LABEL: interleaved_load_vf8_i8_stride3:
	; AVX: # BB#0:			; AVX: # BB#0:
	; AVX-NEXT: vmovdqa (%rdi), %ymm0			; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
	; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1			; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
	; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]			; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
	; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]			; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,3,255,6,255,1,255,4,255,7,255,2,255,5,255]
	; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2			; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
	; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]			; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
	; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]			; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
	; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3			; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
	; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]			; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
	; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]			; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
	; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0			; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
	; AVX-NEXT: vpaddw %xmm0, %xmm3, %xmm0			; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5,6,7]
	; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0			; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1
	; AVX-NEXT: vzeroupper			; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
				; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
				; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%wide.vec = load <24 x i8>, <24 x i8>* %ptr			%wide.vec = load <24 x i8>, <24 x i8>* %ptr
	%v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21>			%v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21>
	%v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22>			%v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22>
	%v3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23>			%v3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23>
	%add1 = add <8 x i8> %v1, %v2			%add1 = add <8 x i8> %v1, %v2
	%add2 = add <8 x i8> %v3, %add1			%add2 = add <8 x i8> %v3, %add1
	ret <8 x i8> %add2			ret <8 x i8> %add2
	▲ Show 20 Lines • Show All 262 Lines • Show Last 20 Lines

test/Transforms/InterleavedAccess/X86/interleavedLoad.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -mtriple=x86_64-pc-linux -mattr=+avx2 -interleaved-access -S | FileCheck %s --check-prefix=AVX2			; RUN: opt < %s -mtriple=x86_64-pc-linux -mattr=+avx2 -interleaved-access -S | FileCheck %s --check-prefix=AVX2
				zvi (unsubmitted, not done): Sorry, I noticed it only now: can you update this file with an AVX512 run and rebase the patch?
	; RUN: opt < %s -mtriple=x86_64-pc-linux -mattr=+avx2 -interleaved-access -S | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512			; RUN: opt < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw -mattr=+avx512vl -interleaved-access -S | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512
				zvi (unsubmitted, not done): This RUN command has the wrong -mattr features set.

	define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){			define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
	; AVX2-LABEL: @interleaved_load_vf32_i8_stride3(			; AVX2-LABEL: @interleaved_load_vf32_i8_stride3(
	; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <96 x i8>, <96 x i8>* [[PTR:%.*]]			; AVX2-NEXT: [[TMP1:%.*]] = bitcast <96 x i8>* [[PTR:%.*]] to <16 x i8>*
	; AVX2-NEXT: [[V1:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> undef, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93>			; AVX2-NEXT: [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
	; AVX2-NEXT: [[V2:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> undef, <32 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94>			; AVX2-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]]
	; AVX2-NEXT: [[V3:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> undef, <32 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95>			; AVX2-NEXT: [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
	; AVX2-NEXT: [[ADD1:%.*]] = add <32 x i8> [[V1]], [[V2]]			; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]]
	; AVX2-NEXT: [[ADD2:%.*]] = add <32 x i8> [[V3]], [[ADD1]]			; AVX2-NEXT: [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
				; AVX2-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]]
				; AVX2-NEXT: [[TMP8:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 3
				; AVX2-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]]
				; AVX2-NEXT: [[TMP10:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 4
				; AVX2-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]]
				; AVX2-NEXT: [[TMP12:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 5
				; AVX2-NEXT: [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]]
				; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP9]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
				; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP11]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
				; AVX2-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP13]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
				; AVX2-NEXT: [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP14]], <32 x i8> undef, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 18, i32 21, i32 24, i32 27, i32 30, i32 17, i32 20, i32 23, i32 26, i32 29>
				; AVX2-NEXT: [[TMP18:%.*]] = shufflevector <32 x i8> [[TMP15]], <32 x i8> undef, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 18, i32 21, i32 24, i32 27, i32 30, i32 17, i32 20, i32 23, i32 26, i32 29>
				; AVX2-NEXT: [[TMP19:%.*]] = shufflevector <32 x i8> [[TMP16]], <32 x i8> undef, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 18, i32 21, i32 24, i32 27, i32 30, i32 17, i32 20, i32 23, i32 26, i32 29>
				; AVX2-NEXT: [[TMP20:%.*]] = shufflevector <32 x i8> [[TMP19]], <32 x i8> [[TMP17]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>
				; AVX2-NEXT: [[TMP21:%.*]] = shufflevector <32 x i8> [[TMP17]], <32 x i8> [[TMP18]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>
				; AVX2-NEXT: [[TMP22:%.*]] = shufflevector <32 x i8> [[TMP18]], <32 x i8> [[TMP19]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>
				; AVX2-NEXT: [[TMP23:%.*]] = shufflevector <32 x i8> [[TMP21]], <32 x i8> [[TMP20]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>
				; AVX2-NEXT: [[TMP24:%.*]] = shufflevector <32 x i8> [[TMP22]], <32 x i8> [[TMP21]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>
				; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <32 x i8> [[TMP20]], <32 x i8> [[TMP22]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>
				; AVX2-NEXT: [[TMP26:%.*]] = shufflevector <32 x i8> [[TMP24]], <32 x i8> undef, <32 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20>
				; AVX2-NEXT: [[TMP27:%.*]] = shufflevector <32 x i8> [[TMP23]], <32 x i8> undef, <32 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25>
				; AVX2-NEXT: [[ADD1:%.*]] = add <32 x i8> [[TMP27]], [[TMP26]]
				; AVX2-NEXT: [[ADD2:%.*]] = add <32 x i8> [[TMP25]], [[ADD1]]
	; AVX2-NEXT: ret <32 x i8> [[ADD2]]			; AVX2-NEXT: ret <32 x i8> [[ADD2]]
	;			;
	%wide.vec = load <96 x i8>, <96 x i8>* %ptr			%wide.vec = load <96 x i8>, <96 x i8>* %ptr
	%v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>			%v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>
	%v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46,i32 49,i32 52,i32 55,i32 58,i32 61,i32 64,i32 67,i32 70,i32 73,i32 76,i32 79,i32 82,i32 85,i32 88,i32 91,i32 94>			%v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46,i32 49,i32 52,i32 55,i32 58,i32 61,i32 64,i32 67,i32 70,i32 73,i32 76,i32 79,i32 82,i32 85,i32 88,i32 91,i32 94>
	%v3 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47,i32 50,i32 53,i32 56,i32 59,i32 62,i32 65,i32 68,i32 71,i32 74,i32 77,i32 80,i32 83,i32 86,i32 89,i32 92,i32 95>			%v3 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47,i32 50,i32 53,i32 56,i32 59,i32 62,i32 65,i32 68,i32 71,i32 74,i32 77,i32 80,i32 83,i32 86,i32 89,i32 92,i32 95>
	%add1 = add <32 x i8> %v1, %v2			%add1 = add <32 x i8> %v1, %v2
	%add2 = add <32 x i8> %v3, %add1			%add2 = add <32 x i8> %v3, %add1
	ret <32 x i8> %add2			ret <32 x i8> %add2
	}			}

	define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){			define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
	; AVX2-LABEL: @interleaved_load_vf16_i8_stride3(			; AVX2-LABEL: @interleaved_load_vf16_i8_stride3(
	; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <48 x i8>, <48 x i8>* [[PTR:%.*]]			; AVX2-NEXT: [[TMP1:%.*]] = bitcast <48 x i8>* [[PTR:%.*]] to <16 x i8>*
	; AVX2-NEXT: [[V1:%.*]] = shufflevector <48 x i8> [[WIDE_VEC]], <48 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>			; AVX2-NEXT: [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
	; AVX2-NEXT: [[V2:%.*]] = shufflevector <48 x i8> [[WIDE_VEC]], <48 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>			; AVX2-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]]
	; AVX2-NEXT: [[V3:%.*]] = shufflevector <48 x i8> [[WIDE_VEC]], <48 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>			; AVX2-NEXT: [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
	; AVX2-NEXT: [[ADD1:%.*]] = add <16 x i8> [[V1]], [[V2]]			; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]]
	; AVX2-NEXT: [[ADD2:%.*]] = add <16 x i8> [[V3]], [[ADD1]]			; AVX2-NEXT: [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
				; AVX2-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]]
				; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>
				; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>
				; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>
				; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP8]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
				; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
				; AVX2-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
				; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP11]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
				; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP12]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
				; AVX2-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> [[TMP13]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
				; AVX2-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> undef, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4>
				; AVX2-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> undef, <16 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
				; AVX2-NEXT: [[ADD1:%.*]] = add <16 x i8> [[TMP18]], [[TMP17]]
				; AVX2-NEXT: [[ADD2:%.*]] = add <16 x i8> [[TMP16]], [[ADD1]]
	; AVX2-NEXT: ret <16 x i8> [[ADD2]]			; AVX2-NEXT: ret <16 x i8> [[ADD2]]
	;			;
	%wide.vec = load <48 x i8>, <48 x i8>* %ptr			%wide.vec = load <48 x i8>, <48 x i8>* %ptr
	%v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42 ,i32 45>			%v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42 ,i32 45>
	%v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46>			%v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46>
	%v3 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47>			%v3 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47>
	%add1 = add <16 x i8> %v1, %v2			%add1 = add <16 x i8> %v1, %v2
	%add2 = add <16 x i8> %v3, %add1			%add2 = add <16 x i8> %v3, %add1
	ret <16 x i8> %add2			ret <16 x i8> %add2
	}			}

	define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){			define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
	; AVX2-LABEL: @interleaved_load_vf8_i8_stride3(			; AVX2-LABEL: @interleaved_load_vf8_i8_stride3(
	; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <24 x i8>, <24 x i8>* [[PTR:%.*]]			; AVX2-NEXT: [[TMP1:%.*]] = bitcast <24 x i8>* [[PTR:%.*]] to <8 x i8>*
	; AVX2-NEXT: [[V1:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>			; AVX2-NEXT: [[TMP2:%.*]] = getelementptr <8 x i8>, <8 x i8>* [[TMP1]], i32 0
	; AVX2-NEXT: [[V2:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>			; AVX2-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]]
	; AVX2-NEXT: [[V3:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>			; AVX2-NEXT: [[TMP4:%.*]] = getelementptr <8 x i8>, <8 x i8>* [[TMP1]], i32 1
	; AVX2-NEXT: [[ADD1:%.*]] = add <8 x i8> [[V1]], [[V2]]			; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]]
	; AVX2-NEXT: [[ADD2:%.*]] = add <8 x i8> [[V3]], [[ADD1]]			; AVX2-NEXT: [[TMP6:%.*]] = getelementptr <8 x i8>, <8 x i8>* [[TMP1]], i32 2
				; AVX2-NEXT: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[TMP6]]
				; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 1, i32 4, i32 7, i32 2, i32 5>
				; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 1, i32 4, i32 7, i32 2, i32 5>
				; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 1, i32 4, i32 7, i32 2, i32 5>
				; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <8 x i8> [[TMP10]], <8 x i8> [[TMP8]], <8 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
				; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <8 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
				; AVX2-NEXT: [[TMP13:%.*]] = shufflevector <8 x i8> [[TMP9]], <8 x i8> [[TMP10]], <8 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
				; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <8 x i8> [[TMP12]], <8 x i8> [[TMP11]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
				; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> [[TMP12]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
				; AVX2-NEXT: [[TMP16:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> [[TMP13]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
				; AVX2-NEXT: [[TMP17:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> undef, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2>
				; AVX2-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP14]], <8 x i8> undef, <8 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4>
				; AVX2-NEXT: [[ADD1:%.*]] = add <8 x i8> [[TMP18]], [[TMP16]]
				; AVX2-NEXT: [[ADD2:%.*]] = add <8 x i8> [[TMP17]], [[ADD1]]
	; AVX2-NEXT: ret <8 x i8> [[ADD2]]			; AVX2-NEXT: ret <8 x i8> [[ADD2]]
	;			;
	%wide.vec = load <24 x i8>, <24 x i8>* %ptr			%wide.vec = load <24 x i8>, <24 x i8>* %ptr
	%v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21>			%v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21>
	%v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22>			%v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22>
	%v3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23>			%v3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23>
	%add1 = add <8 x i8> %v1, %v2			%add1 = add <8 x i8> %v1, %v2
	%add2 = add <8 x i8> %v3, %add1			%add2 = add <8 x i8> %v3, %add1
	ret <8 x i8> %add2			ret <8 x i8> %add2
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[X86][LLVM]Expanding Supports lowerInterleavedLoad() in X86InterleavedAccess (VF{8|16|32} stride 3).
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 113978

lib/Target/X86/X86InterleavedAccess.cpp

test/CodeGen/X86/x86-interleaved-access.ll

test/Transforms/InterleavedAccess/X86/interleavedLoad.ll

This is an archive of the discontinued LLVM Phabricator instance.

[X86][LLVM]Expanding Supports lowerInterleavedLoad() in X86InterleavedAccess (VF{8|16|32} stride 3).ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 113978

lib/Target/X86/X86InterleavedAccess.cpp

test/CodeGen/X86/x86-interleaved-access.ll

test/Transforms/InterleavedAccess/X86/interleavedLoad.ll

[X86][LLVM]Expanding Supports lowerInterleavedLoad() in X86InterleavedAccess (VF{8|16|32} stride 3).
ClosedPublic