Index: lib/Target/X86/X86InterleavedAccess.cpp
===================================================================
--- lib/Target/X86/X86InterleavedAccess.cpp
+++ lib/Target/X86/X86InterleavedAccess.cpp
@@ -74,6 +74,9 @@
                              unsigned NumSubVecElems);
   void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                 SmallVectorImpl<Value *> &TransposedMatrix);
+  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
+                             SmallVectorImpl<Value *> &TransposedMatrix,
+                             unsigned NumSubVecElems);
   void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix,
                                unsigned NumSubVecElems);
@@ -132,9 +135,9 @@
       (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024))
     return true;
 
-  if (ShuffleElemSize == 8 && isa<LoadInst>(Inst) && Factor == 3 &&
+  if (ShuffleElemSize == 8 && Factor == 3 &&
       (WideInstSize == 384 || WideInstSize == 768))
-    return true;
+    return true;
 
   return false;
 }
@@ -495,6 +498,134 @@
   return;
 }
 
+// group2Shuffle reorders the shuffle stride back into continuous order.
+// For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13}
+// => MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
+static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
+                          SmallVectorImpl<uint32_t> &Output) {
+  int IndexGroup[3] = {0, 0, 0};
+  int Index = 0;
+  int VectorWidth = VT.getSizeInBits();
+  int VF = VT.getVectorNumElements();
+  // Find the starting index of each group.
+  int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
+  for (int i = 0; i < 3; i++) {
+    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
+    Index += Mask[i];
+  }
+  // According to those indices, compute the conversion mask.
+  for (int i = 0; i < VF / Lane; i++) {
+    Output.push_back(IndexGroup[i % 3]);
+    IndexGroup[i % 3]++;
+  }
+}
+
+// genShuffleBland - Creates a shuffle mask from two source vectors. This
+// function only works on instructions whose lanes fit inside 256-bit
+// registers. From the mask 'Mask' it creates a new mask 'Out' by adding an
+// offset to each element; the offset depends on the two integers 'LowOffset'
+// and 'HighOffset', where 'LowOffset' applies to the first vector and
+// 'HighOffset' to the second vector.
+// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
+// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
+// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
+// For the sequence to work as a mirror of the load, we must consider the
+// element order as above.
+// In this function we combine two kinds of shuffles: the first is a vpshufb
+// and the second is a "blend"-style shuffle. By computing the shuffle on a
+// sequence of 16 elements (one lane) and adding the correct offset, we build
+// a vpshufb + blend sequence between two shuffles.
+static void genShuffleBland(MVT VT, SmallVectorImpl<uint32_t> &Mask,
+                            SmallVectorImpl<uint32_t> &Out, int LowOffset,
+                            int HighOffset) {
+  assert(VT.getSizeInBits() == 256 &&
+         "This function works only on a width of 256");
+  unsigned NumOfElm = VT.getVectorNumElements();
+  for (unsigned i = 0; i < Mask.size(); i++)
+    Out.push_back(Mask[i] + LowOffset);
+  for (unsigned i = 0; i < Mask.size(); i++)
+    Out.push_back(Mask[i] + HighOffset + NumOfElm);
+}
+
+void X86InterleavedAccessGroup::interleave8bitStride3(
+    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
+    unsigned VecElems) {
+
+  // Example: Assuming we start from the following vectors:
+  // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
+  // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
+  // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
+
+  TransposedMatrix.resize(3);
+  SmallVector<uint32_t, 3> GroupSize;
+  SmallVector<uint32_t, 32> VPShuf;
+  SmallVector<uint32_t, 32> VPAlign[3];
+  SmallVector<uint32_t, 32> VPAlign2;
+  SmallVector<uint32_t, 32> VPAlign3;
+  SmallVector<uint32_t, 32> OptimizeShuf[3];
+  Value *Vec[3], *TempVector[3];
+  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);
+
+  setGroupSize(VT, GroupSize);
+
+  for (int i = 0; i < 3; i++)
+    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);
+
+  DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
+  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);
+
+  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
+  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
+  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
+
+  Vec[0] = Builder.CreateShuffleVector(
+      InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
+  Vec[1] = Builder.CreateShuffleVector(
+      InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
+  Vec[2] = InVec[2];
+
+  // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
+  // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
+  // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7
+
+  for (int i = 0; i < 3; i++)
+    TempVector[i] =
+        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);
+
+  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
+  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
+  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
+
+  for (int i = 0; i < 3; i++)
+    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
+                                         VPAlign[2]);
+
+  // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
+  // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
+  // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7
+
+  group2Shuffle(VT, GroupSize, VPShuf);
+
+  if (VT.getSizeInBits() <= 128) {
+    for (int i = 0; i < 3; i++)
+      TransposedMatrix[i] = Builder.CreateShuffleVector(
+          Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
+    return;
+  }
+
+  unsigned NumOfElm = VT.getVectorNumElements();
+  genShuffleBland(VT, VPShuf, OptimizeShuf[0], 0, 0);
+  genShuffleBland(VT, VPShuf, OptimizeShuf[1], 0, NumOfElm / 2);
+  genShuffleBland(VT, VPShuf, OptimizeShuf[2], NumOfElm / 2, NumOfElm / 2);
+
+  for (int i = 0; i < 3; i++)
+    TransposedMatrix[i] = Builder.CreateShuffleVector(
+        Vec[(i * 2) % 3], Vec[(i * 2 + 1) % 3], OptimizeShuf[i]);
+
+  return;
+}
+
 void X86InterleavedAccessGroup::transpose_4x4(
     ArrayRef<Instruction *> Matrix,
     SmallVectorImpl<Value *> &TransposedMatrix) {
@@ -585,7 +716,12 @@
     break;
   case 16:
   case 32:
-    interleave8bitStride4(DecomposedVectors, TransposedVectors, NumSubVecElems);
+    if (Factor == 4)
+      interleave8bitStride4(DecomposedVectors, TransposedVectors,
+                            NumSubVecElems);
+    if (Factor == 3)
+      interleave8bitStride3(DecomposedVectors, TransposedVectors,
+                            NumSubVecElems);
     break;
   default:
     return false;
Index: test/CodeGen/X86/x86-interleaved-access.ll
===================================================================
--- test/CodeGen/X86/x86-interleaved-access.ll
+++ test/CodeGen/X86/x86-interleaved-access.ll
@@ -1192,74 
+1192,74 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) { ; AVX1-LABEL: interleaved_store_vf16_i8_stride3: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[u,6],zero,xmm0[u,7],zero,xmm0[u,8],zero,xmm0[u,9],zero,xmm0[u,10],zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[5,u],zero,xmm1[6,u],zero,xmm1[7,u],zero,xmm1[8,u],zero,xmm1[9,u],zero,xmm1[10] -; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3],zero,xmm3[5,6],zero,xmm3[8,9],zero,xmm3[11,12],zero,xmm3[14,15] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7],zero,zero,xmm2[8],zero,zero,xmm2[9],zero,zero -; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1],zero,xmm4[2,3],zero,xmm4[4,5],zero,xmm4[6,7],zero,xmm4[8,9],zero,xmm4[10] -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero -; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[6,7],zero,xmm0[8,9],zero,xmm0[10,11],zero,xmm0[12,13],zero,xmm0[14,15],zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[10],zero,zero,xmm2[11],zero,zero,xmm2[12],zero,zero,xmm2[13],zero,zero,xmm2[14],zero,zero,xmm2[15] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqu %xmm0, 32(%rdi) -; AVX1-NEXT: vmovups %ymm3, (%rdi) +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] +; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm6 +; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-NEXT: vmovdqu %xmm1, 32(%rdi) +; AVX1-NEXT: vmovups %ymm0, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: interleaved_store_vf16_i8_stride3: ; AVX2: # BB#0: -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,u,u,1,u,u,2,u,u,3,u,u,4,u,u,5,21,u,u,22,u,u,23,u,u,24,u,u,25,u,u,26] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,0,u,u,1,u,u,2,u,u,3,u,u,4,u,u,u,u,22,u,u,23,u,u,24,u,u,25,u,u,26,u] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255> -; 
AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[0,1,0,1,0,1,6,7,2,3,2,3,4,5,4,5] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,7] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,6,7,u,8,9,u,10,11,u,12,13,u,14,15,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[10,11,10,11,12,13,12,13,12,13,10,11,14,15,14,15] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] -; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, 32(%rdi) -; AVX2-NEXT: vmovdqu %ymm3, (%rdi) +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128] +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm4 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] +; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX2-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 +; AVX2-NEXT: vmovdqu %xmm1, 32(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: interleaved_store_vf16_i8_stride3: ; AVX512: # BB#0: -; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,u,u,1,u,u,2,u,u,3,u,u,4,u,u,5,21,u,u,22,u,u,23,u,u,24,u,u,25,u,u,26] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,0,u,u,1,u,u,2,u,u,3,u,u,4,u,u,u,u,22,u,u,23,u,u,24,u,u,25,u,u,26,u] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255> -; AVX512-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[0,1,0,1,0,1,6,7,2,3,2,3,4,5,4,5] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,7] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; AVX512-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[u,6,7,u,8,9,u,10,11,u,12,13,u,14,15,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[10,11,10,11,12,13,12,13,12,13,10,11,14,15,14,15] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] -; AVX512-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512-NEXT: vmovdqu %ymm3, (%rdi) -; AVX512-NEXT: vextracti32x4 $2, %zmm0, 32(%rdi) +; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128] +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm4 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] +; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX512-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm6 +; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> @@ -1272,134 +1272,84 @@ define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <96 x i8>* %p) { ; AVX1-LABEL: interleaved_store_vf32_i8_stride3: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[u,6],zero,xmm0[u,7],zero,xmm0[u,8],zero,xmm0[u,9],zero,xmm0[u,10],zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[5,u],zero,xmm1[6,u],zero,xmm1[7,u],zero,xmm1[8,u],zero,xmm1[9,u],zero,xmm1[10] -; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,u,2,3,u,4,5,u,6,7,u,8,9,u,10] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm4 -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; AVX1-NEXT: vandps %ymm3, %ymm4, %ymm4 -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,1,0,1,0,1,6,7,2,3,2,3,4,5,4,5] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,7] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-NEXT: vandnps %ymm5, %ymm3, %ymm5 -; AVX1-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,7,u,6,9,u,8,11,u,10,13,u,12,15,u,14] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; 
AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,u,1,2,u,3,4,u,5,6,u,7,8,u,9,10] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-NEXT: vandps %ymm3, %ymm5, %ymm5 -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[12,12,11,11,12,12,11,11,13,13,14,14,14,14,15,15] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,0,1,1,1,1,2,2,4,4,3,3,4,4,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-NEXT: vandnps %ymm6, %ymm3, %ymm6 -; AVX1-NEXT: vorps %ymm6, %ymm5, %ymm5 -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm2[5,u],zero,xmm2[6,u],zero,xmm2[7,u],zero,xmm2[8,u],zero,xmm2[9,u],zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[5],zero,xmm1[u,6],zero,xmm1[u,7],zero,xmm1[u,8],zero,xmm1[u,9],zero,xmm1[u,10] -; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,u,7,6,u,9,8,u,11,10,u,13,12,u,15,14] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,5] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,10,11,12,13,12,13,8,9,14,15,14,15,14,15] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovups %ymm0, 64(%rdi) -; AVX1-NEXT: vmovups %ymm5, 32(%rdi) -; AVX1-NEXT: vmovups %ymm4, (%rdi) +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm5, %xmm3, 
%xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vmovups %ymm2, 64(%rdi) +; AVX1-NEXT: vmovups %ymm1, 32(%rdi) +; AVX1-NEXT: vmovups %ymm0, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: interleaved_store_vf32_i8_stride3: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[u,6],zero,xmm0[u,7],zero,xmm0[u,8],zero,xmm0[u,9],zero,xmm0[u,10],zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[5,u],zero,xmm1[6,u],zero,xmm1[7,u],zero,xmm1[8,u],zero,xmm1[9,u],zero,xmm1[10] -; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,u,2,3,u,4,5,u,6,7,u,8,9,u,10] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[0,1,0,1,0,1,6,7,2,3,2,3,4,5,4,5] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,7] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm8 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm4[5,u],zero,xmm4[6,u],zero,xmm4[7,u],zero,xmm4[8,u],zero,xmm4[9,u],zero -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[5],zero,xmm7[u,6],zero,xmm7[u,7],zero,xmm7[u,8],zero,xmm7[u,9],zero,xmm7[u,10] -; AVX2-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,u,7,6,u,9,8,u,11,10,u,13,12,u,15,14] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,3,3,3,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,10,11,12,13,12,13,8,9,14,15,14,15,14,15] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[10,11,10,11,u,u,12,13,12,13,u,u,14,15,14,15,u,u,16,17,16,17,u,u,18,19,18,19,u,u,20,21] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,11,u,u,12,13,12,13,u,u,14,15,14,15,u,u,16,17,16,17,u,u,18,19,18,19,u,u,20,21,20,21] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0> -; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,11,u,u,12,u,u,13,u,u,14,u,u,15,u,u,16,u,u,17,u,u,18,u,u,19,u,u,20,u,u] -; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi) -; AVX2-NEXT: vmovdqu %ymm3, 64(%rdi) -; AVX2-NEXT: vmovdqu %ymm8, (%rdi) +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] +; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = 
ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi) +; AVX2-NEXT: vmovdqu %ymm2, 32(%rdi) +; AVX2-NEXT: vmovdqu %ymm3, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: interleaved_store_vf32_i8_stride3: ; AVX512: # BB#0: -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[10,11,u,u,12,13,12,13,u,u,14,15,14,15,u,u,16,17,16,17,u,u,18,19,18,19,u,u,20,21,20,21] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,11,u,u,12,u,u,13,u,u,14,u,u,15,u,u,16,u,u,17,u,u,18,u,u,19,u,u,20,u,u] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[10,11,10,11,u,u,12,13,12,13,u,u,14,15,14,15,u,u,16,17,16,17,u,u,18,19,18,19,u,u,20,21] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; AVX512-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = <128,u,6,128,u,7,128,u,8,128,u,9,128,u,10,128> -; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <5,u,128,6,u,128,7,u,128,8,u,128,9,u,128,10> -; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,u,2,3,u,4,5,u,6,7,u,8,9,u,10] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[0,1,0,1,0,1,6,7,2,3,2,3,4,5,4,5] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,7] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; AVX512-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm4 -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm5 -; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; 
AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,6,7,u,8,9,u,10,11,u,12,13,u,14,15,u] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,10,11,12,13,12,13,12,13,10,11,14,15,14,15] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] -; AVX512-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] +; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] +; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] +; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] +; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] +; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] +; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] +; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi) -; AVX512-NEXT: vmovdqu32 %zmm3, (%rdi) +; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> Index: test/Transforms/InterleavedAccess/X86/interleavedStore.ll =================================================================== --- test/Transforms/InterleavedAccess/X86/interleavedStore.ll +++ test/Transforms/InterleavedAccess/X86/interleavedStore.ll @@ -105,8 +105,24 @@ ; CHECK-LABEL: @interleaved_store_vf16_i8_stride3( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <32 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> undef, <32 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <48 x i32> -; CHECK-NEXT: store <48 x i8> [[INTERLEAVED_VEC]], <48 x i8>* [[P:%.*]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = 
shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <32 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> undef, <32 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <32 x i8> [[TMP17]], <32 x i8> [[TMP18]], <48 x i32> +; CHECK-NEXT: store <48 x i8> [[TMP19]], <48 x i8>* [[P:%.*]], align 1 ; CHECK-NEXT: ret void ; %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> @@ -120,8 +136,24 @@ ; CHECK-LABEL: @interleaved_store_vf32_i8_stride3( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]], <64 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <32 x i8> [[C:%.*]], <32 x i8> undef, <64 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <96 x i32> -; CHECK-NEXT: store <96 x i8> [[INTERLEAVED_VEC]], <96 x i8>* [[P:%.*]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <32 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <32 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <32 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <32 x i8> [[TMP3]], <32 x i8> undef, <32 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> undef, <32 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> [[TMP5]], <32 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> [[TMP6]], <32 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP5]], <32 x i8> [[TMP7]], <32 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> [[TMP9]], <32 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <32 x i8> [[TMP9]], <32 x i8> [[TMP10]], <32 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <32 x i8> [[TMP10]], <32 x i8> [[TMP8]], <32 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP11]], <32 x i8> [[TMP12]], <32 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> [[TMP11]], <32 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> [[TMP13]], <32 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP14]], <32 x i8> [[TMP15]], <64 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <32 x i8> [[TMP16]], <32 x i8> undef, <64 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <64 x i8> [[TMP17]], <64 x i8> [[TMP18]], <96 x i32> +; CHECK-NEXT: store 
<96 x i8> [[TMP19]], <96 x i8>* [[P:%.*]], align 1 ; CHECK-NEXT: ret void ; %1 = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32>
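
A note on the group2Shuffle mask in the patch above: the function only reads the first three entries of 'Mask' (the per-lane group sizes) and then emits the starting index of each group round-robin. The standalone program below is a minimal sketch of that logic, not part of the patch: it uses plain std::vector instead of LLVM's SmallVector and assumes the VF16 group sizes are {6,5,5}, which is what the quoted Mask1/MaskResult example implies. It reproduces the {0,11,6,1,12,7,...} conversion mask, the same 16-byte pattern that appears as the vpshufb constant in the new test output.

#include <cstdio>
#include <vector>

int main() {
  const int VF = 16, Lane = 1; // v16i8: a single 128-bit lane
  // Assumed group sizes for VF16 (only the first three entries matter).
  std::vector<int> GroupSize = {6, 5, 5};
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  // Find the starting index of each group: {0, 11, 6} for VF16.
  for (int i = 0; i < 3; i++) {
    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
    Index += GroupSize[i];
  }
  // Emit the groups round-robin to build the conversion mask.
  std::vector<int> Output;
  for (int i = 0; i < VF / Lane; i++) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }
  for (int V : Output)
    printf("%d ", V); // 0 11 6 1 12 7 2 13 8 3 14 9 4 15 10 5
  printf("\n");
  return 0;
}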
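
For the 256-bit path, genShuffleBland concatenates two offset copies of the per-lane mask, so each of the three final shufflevectors takes one half of its bytes from Vec[(2*i) % 3] and the other half from Vec[(2*i + 1) % 3]. The sketch below mirrors that construction for a v32i8 store; it is again standalone and approximate: std::vector stands in for the patch's LLVM containers, MVT is replaced by the element count (the type is otherwise only used for the width-256 assert), and the 16-entry VPShuf value is the per-lane pattern derived in the previous sketch.

#include <cstdio>
#include <vector>

// Mirrors the patch's genShuffleBland with plain STL types.
static void genShuffleBland(int NumOfElm, const std::vector<int> &Mask,
                            std::vector<int> &Out, int LowOffset,
                            int HighOffset) {
  for (int M : Mask)
    Out.push_back(M + LowOffset);             // half taken from the first source
  for (int M : Mask)
    Out.push_back(M + HighOffset + NumOfElm); // half taken from the second source
}

int main() {
  const int NumOfElm = 32; // v32i8: one 256-bit vector
  // Per-lane conversion mask produced by group2Shuffle; it also shows up as
  // the repeated vpshufb constant in the AVX2/AVX512 test output above.
  std::vector<int> VPShuf = {0, 11, 6, 1, 12, 7, 2, 13,
                             8, 3, 14, 9, 4, 15, 10, 5};
  std::vector<int> OptimizeShuf[3];
  genShuffleBland(NumOfElm, VPShuf, OptimizeShuf[0], 0, 0);
  genShuffleBland(NumOfElm, VPShuf, OptimizeShuf[1], 0, NumOfElm / 2);
  genShuffleBland(NumOfElm, VPShuf, OptimizeShuf[2], NumOfElm / 2, NumOfElm / 2);
  for (int i = 0; i < 3; i++) {
    // OptimizeShuf[i] drives shufflevector(Vec[(2*i) % 3], Vec[(2*i + 1) % 3]):
    // indices < 32 select from the first operand, indices >= 32 from the second.
    printf("OptimizeShuf[%d]:", i);
    for (int M : OptimizeShuf[i])
      printf(" %d", M);
    printf("\n");
  }
  return 0;
}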