Diff 266499

llvm/lib/Target/X86/X86InterleavedAccess.cpp

Show First 20 Lines • Show All 144 Lines • ▼ Show 20 Lines	if (isa<LoadInst>(Inst)) {
if (cast<LoadInst>(Inst)->getPointerAddressSpace())		if (cast<LoadInst>(Inst)->getPointerAddressSpace())
return false;		return false;
} else		} else
WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());		WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

// We support shuffle represents stride 4 for byte type with size of		// We support shuffle represents stride 4 for byte type with size of
// WideInstSize.		// WideInstSize.
if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)		if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
return true;		return true;

if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&		if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
(WideInstSize == 256 \|\| WideInstSize == 512 \|\| WideInstSize == 1024 \|\|		(WideInstSize == 256 \|\| WideInstSize == 512 \|\| WideInstSize == 1024 \|\|
WideInstSize == 2048))		WideInstSize == 2048))
return true;		return true;

if (ShuffleElemSize == 8 && Factor == 3 &&		if (ShuffleElemSize == 8 && Factor == 3 &&
(WideInstSize == 384 \|\| WideInstSize == 768 \|\| WideInstSize == 1536))		(WideInstSize == 384 \|\| WideInstSize == 768 \|\| WideInstSize == 1536))
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines	if (VecLength == 768 \|\| VecLength == 1536) {
VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);		VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
NumLoads = NumSubVectors * (VecLength / 384);		NumLoads = NumSubVectors * (VecLength / 384);
} else {		} else {
VecBaseTy = SubVecTy;		VecBaseTy = SubVecTy;
VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());		VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);		VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
}		}
// Generate N loads of T type.		// Generate N loads of T type.
		assert(VecBaseTy->getPrimitiveSizeInBits().isByteSized() &&
		"VecBaseTy's size must be a multiple of 8");
		const Align FirstAlignment = LI->getAlign();
		const Align SubsequentAlignment = commonAlignment(
		gchateletAuthorUnsubmitted Done Reply Inline Actions I thought `TypeSize` would be Bytes but it's Bits, fix is coming. gchatelet: I thought `TypeSize` would be Bytes but it's Bits, fix is coming.
		FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedSize() / 8);
		Align Alignment = FirstAlignment;
for (unsigned i = 0; i < NumLoads; i++) {		for (unsigned i = 0; i < NumLoads; i++) {
// TODO: Support inbounds GEP.		// TODO: Support inbounds GEP.
Value *NewBasePtr =		Value *NewBasePtr =
Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));		Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
Instruction *NewLoad =		Instruction *NewLoad =
Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlign());		Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
DecomposedVectors.push_back(NewLoad);		DecomposedVectors.push_back(NewLoad);
		nlopesUnsubmitted Done Reply Inline Actions at least iteration at 0 you could take LI->getAlign(), as it might be larger than ABI alignment. Propagating this larger alignment to subsequent loads requires a bit more code; not sure it's worth it. nlopes: at least iteration at 0 you could take LI->getAlign(), as it might be larger than ABI alignment.
		gchateletAuthorUnsubmitted Done Reply Inline Actions Just updated the code, I think it's what you were suggesting. gchatelet: Just updated the code, I think it's what you were suggesting.
		Alignment = SubsequentAlignment;
}		}
}		}
		jdoerfertUnsubmitted Done Reply Inline Actions Nit: msg for the assert. jdoerfert: Nit: msg for the assert.

// Changing the scale of the vector type by reducing the number of elements and		// Changing the scale of the vector type by reducing the number of elements and
// doubling the scalar size.		// doubling the scalar size.
static MVT scaleVectorType(MVT VT) {		static MVT scaleVectorType(MVT VT) {
unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;		unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),		return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
VT.getVectorNumElements() / 2);		VT.getVectorNumElements() / 2);
}		}
Show All 19 Lines
// The first one is vpshufed and the second is a type of "blend" shuffle.		// The first one is vpshufed and the second is a type of "blend" shuffle.
// By computing the shuffle on a sequence of 16 elements (one lane) and adding		// By computing the shuffle on a sequence of 16 elements (one lane) and adding
// the correct offset, we are creating a vpshufed + blend sequence between two		// the correct offset, we are creating a vpshufed + blend sequence between two
// shuffles.		// shuffles.
static void genShuffleBland(MVT VT, ArrayRef<int> Mask,		static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
SmallVectorImpl<int> &Out, int LowOffset,		SmallVectorImpl<int> &Out, int LowOffset,
int HighOffset) {		int HighOffset) {
assert(VT.getSizeInBits() >= 256 &&		assert(VT.getSizeInBits() >= 256 &&
"This function doesn't accept width smaller then 256");		"This function doesn't accept width smaller then 256");
unsigned NumOfElm = VT.getVectorNumElements();		unsigned NumOfElm = VT.getVectorNumElements();
for (unsigned i = 0; i < Mask.size(); i++)		for (unsigned i = 0; i < Mask.size(); i++)
Out.push_back(Mask[i] + LowOffset);		Out.push_back(Mask[i] + LowOffset);
for (unsigned i = 0; i < Mask.size(); i++)		for (unsigned i = 0; i < Mask.size(); i++)
Out.push_back(Mask[i] + HighOffset + NumOfElm);		Out.push_back(Mask[i] + HighOffset + NumOfElm);
}		}

// reorderSubVector returns the data to its original state. And de-facto is		// reorderSubVector returns the data to its original state. And de-facto is
Show All 17 Lines
static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,		static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,		ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
unsigned VecElems, unsigned Stride,		unsigned VecElems, unsigned Stride,
IRBuilder<> &Builder) {		IRBuilder<> &Builder) {

if (VecElems == 16) {		if (VecElems == 16) {
for (unsigned i = 0; i < Stride; i++)		for (unsigned i = 0; i < Stride; i++)
TransposedMatrix[i] = Builder.CreateShuffleVector(		TransposedMatrix[i] = Builder.CreateShuffleVector(
Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);		Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
return;		return;
}		}

SmallVector<int, 32> OptimizeShuf;		SmallVector<int, 32> OptimizeShuf;
Value *Temp[8];		Value *Temp[8];

for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {		for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,		genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
(i + 1) / Stride * 16);		(i + 1) / Stride * 16);
Temp[i / 2] = Builder.CreateShuffleVector(		Temp[i / 2] = Builder.CreateShuffleVector(
Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);		Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
OptimizeShuf.clear();		OptimizeShuf.clear();
}		}

if (VecElems == 32) {		if (VecElems == 32) {
std::copy(Temp, Temp + Stride, TransposedMatrix.begin());		std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
return;		return;
}		} else
else
for (unsigned i = 0; i < Stride; i++)		for (unsigned i = 0; i < Stride; i++)
TransposedMatrix[i] =		TransposedMatrix[i] =
Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);		Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}		}

void X86InterleavedAccessGroup::interleave8bitStride4VF8(		void X86InterleavedAccessGroup::interleave8bitStride4VF8(
ArrayRef<Instruction *> Matrix,		ArrayRef<Instruction *> Matrix,
SmallVectorImpl<Value *> &TransposedMatrix) {		SmallVectorImpl<Value *> &TransposedMatrix) {
// Assuming we start from the following vectors:		// Assuming we start from the following vectors:
// Matrix[0]= c0 c1 c2 c3 c4 ... c7		// Matrix[0]= c0 c1 c2 c3 c4 ... c7
// Matrix[1]= m0 m1 m2 m3 m4 ... m7		// Matrix[1]= m0 m1 m2 m3 m4 ... m7
▲ Show 20 Lines • Show All 354 Lines • ▼ Show 20 Lines	Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
VPAlign[2]);		VPAlign[2]);

// TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2		// TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
// TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5		// TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
// TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7		// TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7

unsigned NumOfElm = VT.getVectorNumElements();		unsigned NumOfElm = VT.getVectorNumElements();
group2Shuffle(VT, GroupSize, VPShuf);		group2Shuffle(VT, GroupSize, VPShuf);
reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm,3, Builder);		reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}		}

void X86InterleavedAccessGroup::transpose_4x4(		void X86InterleavedAccessGroup::transpose_4x4(
ArrayRef<Instruction *> Matrix,		ArrayRef<Instruction *> Matrix,
SmallVectorImpl<Value *> &TransposedMatrix) {		SmallVectorImpl<Value *> &TransposedMatrix) {
assert(Matrix.size() == 4 && "Invalid matrix size");		assert(Matrix.size() == 4 && "Invalid matrix size");
TransposedMatrix.resize(4);		TransposedMatrix.resize(4);

▲ Show 20 Lines • Show All 153 Lines • Show Last 20 Lines

llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -mtriple=x86_64-pc-linux -mattr=+avx2 -interleaved-access -S | FileCheck %s --check-prefix=AVX2			; RUN: opt < %s -mtriple=x86_64-pc-linux -mattr=+avx2 -interleaved-access -S | FileCheck %s --check-prefix=AVX2
	; RUN: opt < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw -mattr=+avx512vl -interleaved-access -S | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512			; RUN: opt < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw -mattr=+avx512vl -interleaved-access -S | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512

	define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){			define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
	; AVX2-LABEL: @interleaved_load_vf32_i8_stride3(			; AVX2-LABEL: @interleaved_load_vf32_i8_stride3(
	; AVX2-NEXT: [[TMP1:%.*]] = bitcast <96 x i8>* [[PTR:%.*]] to <16 x i8>*			; AVX2-NEXT: [[TMP1:%.*]] = bitcast <96 x i8>* [[PTR:%.*]] to <16 x i8>*
	; AVX2-NEXT: [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0			; AVX2-NEXT: [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
	; AVX2-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 128			; AVX2-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 128
	; AVX2-NEXT: [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1			; AVX2-NEXT: [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
	; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 128			; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 16
	; AVX2-NEXT: [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2			; AVX2-NEXT: [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
	; AVX2-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 128			; AVX2-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 16
	; AVX2-NEXT: [[TMP8:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 3			; AVX2-NEXT: [[TMP8:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 3
	; AVX2-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]], align 128			; AVX2-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]], align 16
	; AVX2-NEXT: [[TMP10:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 4			; AVX2-NEXT: [[TMP10:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 4
	; AVX2-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]], align 128			; AVX2-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]], align 16
	; AVX2-NEXT: [[TMP12:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 5			; AVX2-NEXT: [[TMP12:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 5
	; AVX2-NEXT: [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]], align 128			; AVX2-NEXT: [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]], align 16
	; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP9]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>			; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP9]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
	; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP11]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>			; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP11]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
	; AVX2-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP13]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>			; AVX2-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP13]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
	; AVX2-NEXT: [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP14]], <32 x i8> undef, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 18, i32 21, i32 24, i32 27, i32 30, i32 17, i32 20, i32 23, i32 26, i32 29>			; AVX2-NEXT: [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP14]], <32 x i8> undef, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 18, i32 21, i32 24, i32 27, i32 30, i32 17, i32 20, i32 23, i32 26, i32 29>
	; AVX2-NEXT: [[TMP18:%.*]] = shufflevector <32 x i8> [[TMP15]], <32 x i8> undef, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 18, i32 21, i32 24, i32 27, i32 30, i32 17, i32 20, i32 23, i32 26, i32 29>			; AVX2-NEXT: [[TMP18:%.*]] = shufflevector <32 x i8> [[TMP15]], <32 x i8> undef, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 18, i32 21, i32 24, i32 27, i32 30, i32 17, i32 20, i32 23, i32 26, i32 29>
	; AVX2-NEXT: [[TMP19:%.*]] = shufflevector <32 x i8> [[TMP16]], <32 x i8> undef, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 18, i32 21, i32 24, i32 27, i32 30, i32 17, i32 20, i32 23, i32 26, i32 29>			; AVX2-NEXT: [[TMP19:%.*]] = shufflevector <32 x i8> [[TMP16]], <32 x i8> undef, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 18, i32 21, i32 24, i32 27, i32 30, i32 17, i32 20, i32 23, i32 26, i32 29>
	; AVX2-NEXT: [[TMP20:%.*]] = shufflevector <32 x i8> [[TMP19]], <32 x i8> [[TMP17]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>			; AVX2-NEXT: [[TMP20:%.*]] = shufflevector <32 x i8> [[TMP19]], <32 x i8> [[TMP17]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>
	; AVX2-NEXT: [[TMP21:%.*]] = shufflevector <32 x i8> [[TMP17]], <32 x i8> [[TMP18]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>			; AVX2-NEXT: [[TMP21:%.*]] = shufflevector <32 x i8> [[TMP17]], <32 x i8> [[TMP18]], <32 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58>
	Show All 17 Lines
	}			}

	define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){			define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
	; AVX2-LABEL: @interleaved_load_vf16_i8_stride3(			; AVX2-LABEL: @interleaved_load_vf16_i8_stride3(
	; AVX2-NEXT: [[TMP1:%.*]] = bitcast <48 x i8>* [[PTR:%.*]] to <16 x i8>*			; AVX2-NEXT: [[TMP1:%.*]] = bitcast <48 x i8>* [[PTR:%.*]] to <16 x i8>*
	; AVX2-NEXT: [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0			; AVX2-NEXT: [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
	; AVX2-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 64			; AVX2-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 64
	; AVX2-NEXT: [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1			; AVX2-NEXT: [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
	; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 64			; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 16
	; AVX2-NEXT: [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2			; AVX2-NEXT: [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
	; AVX2-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 64			; AVX2-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 16
	; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>			; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>
	; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>			; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>
	; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>			; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>
	; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP8]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>			; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP8]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
	; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>			; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
	; AVX2-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>			; AVX2-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
	; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP11]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>			; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP11]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
	; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP12]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>			; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP12]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
	▲ Show 20 Lines • Show All 95 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[Alignment] Fix misaligned interleaved loads
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 266499

llvm/lib/Target/X86/X86InterleavedAccess.cpp

llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll

This is an archive of the discontinued LLVM Phabricator instance.

[Alignment] Fix misaligned interleaved loads
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 266499

llvm/lib/Target/X86/X86InterleavedAccess.cpp

llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll

[Alignment] Fix misaligned interleaved loads
ClosedPublic