Index: lib/Target/X86/X86InterleavedAccess.cpp =================================================================== --- lib/Target/X86/X86InterleavedAccess.cpp +++ lib/Target/X86/X86InterleavedAccess.cpp @@ -72,6 +72,9 @@ void interleave8bitStride4(ArrayRef InputVectors, SmallVectorImpl &TransposedMatrix, unsigned NumSubVecElems); + void interleave8bitStride3(ArrayRef InputVectors, + SmallVectorImpl &TransposedMatrix, + unsigned NumSubVecElems); public: /// In order to form an interleaved access group X86InterleavedAccessGroup @@ -107,7 +110,7 @@ // Currently, lowering is supported for the following vectors with stride 4: // 1. Store and load of 4-element vectors of 64 bits on AVX. // 2. Store of 16/32-element vectors of 8 bits on AVX. - if (!Subtarget.hasAVX() || Factor != 4) + if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3)) return false; if (isa(Inst)) { @@ -117,13 +120,17 @@ // We support shuffle represents stride 4 for byte type with size of // WideInstSize. - if (ShuffleElemSize == 64 && WideInstSize == 1024) + if (ShuffleElemSize == 64 && WideInstSize == 1024) return true; if (ShuffleElemSize == 8 && isa(Inst) && (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024)) return true; + if (ShuffleElemSize == 8 && isa(Inst) && + (WideInstSize == 192 || WideInstSize == 384 || WideInstSize == 768)) + return true; + return false; } @@ -158,11 +165,17 @@ // Decompose the load instruction. 
LoadInst *LI = cast(VecInst); Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace()); - Value *VecBasePtr = - Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy); - + Value *VecBasePtr; + unsigned int NumOfLoad = NumSubVectors; + if (DL.getTypeSizeInBits(VecTy) == 768) { + Type *VecTran = VectorType::get(Type::getInt8Ty(LI->getContext()), 16) + ->getPointerTo(LI->getPointerAddressSpace()); + VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran); + NumOfLoad = NumSubVectors * 2; + } else + VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy); // Generate N loads of T type. - for (unsigned i = 0; i < NumSubVectors; i++) { + for (unsigned i = 0; i < NumOfLoad; i++) { // TODO: Support inbounds GEP. Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i)); Instruction *NewLoad = @@ -320,6 +333,165 @@ Builder.CreateShuffleVector(Low1, High1, MaskConcatHigh); } +// createVPShuftkMask appends a shuffle mask of VF elements to Mask. +// The shuffle pattern is as follows: +// {0, Stride%(VF/Lane), (2*Stride)%(VF/Lane), ..., +// ((VF/Lane-1)*Stride)%(VF/Lane), VF/Lane, VF/Lane+Stride%(VF/Lane), ..., +// VF/Lane+((VF/Lane-1)*Stride)%(VF/Lane)} +// where Lane is the number of 128-bit lanes in a register: +// VectorWide = 128 => Lane = 1 +// VectorWide = 256 => Lane = 2 +// For example, the shuffle pattern for VF 16, register size 256 -> lanes = 2: +// {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>} +static void createVPShuftkMask(int VF, int Stride, int VectorWide, + SmallVectorImpl &Mask) { + int Lane = (VectorWide / 128 > 0) ? VectorWide / 128 : 1; + for (int LaneCount = 0; LaneCount < Lane; LaneCount++) { + for (int i = 0; i < VF / Lane; ++i) { + Mask.push_back((i * Stride) % (VF / Lane) + (VF / Lane) * LaneCount); + } + } +} + +// groupShiftAmount returns the width of one group of the mask. +// There are Stride different groups; an element belongs to the group given by +// Mask[Index] % Stride, where '%' is the modulo operator.
Groups are counted +// from the end of the mask, and Group selects the one whose width is returned. +// For example, the selected group is marked with '[...]': +// groupShiftAmount(3,0,{0|3|6|1|4|7|[2|5]}) -> 2 +// groupShiftAmount(3,1,{0|3|6|[1|4|7]|2|5}) -> 3 +static int groupShiftAmount(unsigned Stride, int Group, + SmallVectorImpl &Mask) { + int Index, Count = 0, GroupWide = 0; + int GroupNum = Mask[Mask.size() - 1] % Stride; + + // Walk back from the end of the mask to the last element of group Group. + for (Index = Mask.size() - 1; Index >= 0; Index--) { + if (GroupNum != Mask[Index] % Stride) { + GroupNum = Mask[Index] % Stride; + Count++; + } + if (Count == Group) + break; + } + + // Count the consecutive elements that belong to that group. + for (; Index >= 0; Index--) { + if (GroupNum != Mask[Index] % Stride) + return GroupWide; + GroupWide++; + } + return GroupWide; +} + +// createAlignMask appends the shuffle mask of a vpalignr instruction to Mask. +// Mask must be empty on entry: later lanes are built from Mask[0..VF/Lane). +// vpalignr operates on each lane separately, where Lane is the number of +// 128-bit lanes in a register: +// VectorWide = 128 => Lane = 1 +// VectorWide = 256 => Lane = 2 +// For Lane = 2 the first lane is {DiffToJump..VF/2-1, VF..VF+DiffToJump-1} +// and the second lane adds VF/2 to each index. +// For Lane = 1 the pattern is {DiffToJump..DiffToJump+VF-1}. +static void createAlignMask(int VF, int Align, + SmallVectorImpl &Mask, int VectorWide, + bool AlignBegin = false) { + int Lane = (VectorWide / 128 > 0) ? (VectorWide / 128) : 1; + int DiffToJump = AlignBegin ?
Align : (VF / Lane - Align); + + for (int i = 0; i < (VF / Lane); i++) { + int RealIndex = i + DiffToJump; + if (Lane > 1 && RealIndex > (VF / Lane) - 1) + RealIndex += VF / Lane; + Mask.push_back(RealIndex); + } + + for (int LaneCount = 1; LaneCount < Lane; LaneCount++) { + for (int i = 0; i < (VF / Lane); i++) + Mask.push_back(Mask[i] + (VF / Lane) * LaneCount); + } +} + +// interleave8bitStride3 de-interleaves a stride-3 group of byte vectors. +// InVec holds the decomposed loads; for VecElems == 32 it holds six vectors +// and InVec[i] is concatenated with InVec[i + 3] to form the i-th input. +// TransposedMatrix is resized to 3 and receives the de-interleaved vectors +// (all a's, all b's, all c's in the example below). +void X86InterleavedAccessGroup::interleave8bitStride3( + ArrayRef InVec, + SmallVectorImpl &TransposedMatrix, unsigned VecElems) { + + // Example: Assuming we start from the following vectors: + // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2 + // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5 + // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7 + + TransposedMatrix.resize(3); + SmallVector Concat; + SmallVector VPShuf; + SmallVector VPAlign[2]; + SmallVector VPAlign2; + SmallVector VPAlign3; + Value *Vec[3], *TempVector[3]; + + int Offset[2]; + int VectorWide = VecElems * 8; + + for (unsigned i = 0; i < VecElems; i++) + Concat.push_back(i); + + createVPShuftkMask(VecElems, 3, VectorWide, VPShuf); + + for (int i = 0; i < 2; i++) { + Offset[i] = groupShiftAmount(3, i, VPShuf); + createAlignMask(VecElems, Offset[i], VPAlign[i], VectorWide); + } + + createAlignMask(VecElems, Offset[0] + Offset[1], VPAlign2, VectorWide, true); + createAlignMask(VecElems, Offset[1], VPAlign3, VectorWide, true); + + for (int i = 0; i < 3; i++) + Vec[i] = VecElems == 32 + ?
Builder.CreateShuffleVector(InVec[i], InVec[i + 3], Concat) + : InVec[i]; + + // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1 + // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4 + // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7 + + Type *Ty = Vec[0]->getType(); + for (int i = 0; i < 3; i++) + Vec[i] = Builder.CreateShuffleVector(Vec[i], UndefValue::get(Ty), VPShuf); + + + // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2 + // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5 + // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7 + + for (int i = 0; i < 3; i++) + TempVector[i] = + Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]); + + // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2 + // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4 + // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7 + + for (int i = 0; i < 3; i++) + Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i], + VPAlign[1]); + + // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7 + // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7 + // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7 + + Value *TempVec = Builder.CreateShuffleVector(Vec[1], Vec[1], VPAlign3); + TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], Vec[0], VPAlign2); + TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec; + TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2]; + + return; +} + void X86InterleavedAccessGroup::transpose_4x4( ArrayRef Matrix, SmallVectorImpl &TransposedMatrix) { @@ -362,10 +534,24 @@ // Try to generate target-sized register(/instruction). decompose(Inst, Factor, ShuffleTy, DecomposedVectors); + Type *ShuffleEltTy = Inst->getType(); + unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor; // Perform matrix-transposition in order to compute interleaved // results by generating some sort of (optimized) target-specific // instructions.
- transpose_4x4(DecomposedVectors, TransposedVectors); + + switch (NumSubVecElems) { + default: + return false; + case 4: + transpose_4x4(DecomposedVectors, TransposedVectors); + break; + case 8: + case 16: + case 32: + interleave8bitStride3(DecomposedVectors, TransposedVectors, NumSubVecElems); + break; + } // Now replace the unoptimized-interleaved-vectors with the // transposed-interleaved vectors. Index: test/CodeGen/X86/x86-interleaved-access.ll =================================================================== --- test/CodeGen/X86/x86-interleaved-access.ll +++ test/CodeGen/X86/x86-interleaved-access.ll @@ -965,172 +965,63 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){ ; AVX1-LABEL: interleaved_load_vf32_i8_stride3: ; AVX1: # BB#0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX1-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = 
[255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX1-NEXT: vandnps %ymm3, %ymm7, %ymm3 +; AVX1-NEXT: vandps %ymm7, %ymm5, %ymm5 +; AVX1-NEXT: vorps %ymm3, %ymm5, %ymm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] +; AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm4 +; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm4[1,4,7,10,13] -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero -; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-NEXT: vandnps %ymm2, %ymm8, %ymm10 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm5[2,5,8,11,14,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm1[u,u,u,u,u] -; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm6 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,4,7,10,13] -; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[0,3,6,9,12,15,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 -; AVX1-NEXT: vandps %ymm8, %ymm2, %ymm2 -; AVX1-NEXT: vorps %ymm10, %ymm2, %ymm8 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm4[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm4[2,5,8,11,14] -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero -; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vandnps %ymm2, %ymm10, %ymm11 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm5[0,3,6,9,12,15,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,5,8,11,14] -; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[1,4,7,10,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 -; AVX1-NEXT: vandps %ymm10, %ymm2, %ymm2 -; AVX1-NEXT: vorps %ymm11, %ymm2, %ymm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,zero,zero,zero,xmm4[0,3,6,9,12,15] -; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-NEXT: vandnps %ymm3, %ymm10, %ymm3 -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm5[1,4,7,10,13,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,5,8,11,14],zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,3,6,9,12,15] -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vinsertf128 $1, 
%xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vandps %ymm10, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm8, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm6, %xmm3, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: interleaved_load_vf32_i8_stride3: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0> -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm4[2,5,8,11,14],zero,zero,zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13] -; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,3,6,9,12,15,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero -; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-NEXT: vpblendvb 
%ymm5, %ymm1, %ymm2, %ymm5 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm6[0,3,6,9,12,15],zero,zero,zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm5[2,5,8,11,14] -; AVX2-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,4,7,10,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero -; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u> -; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13],zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,5,8,11,14],zero,zero,zero,zero,zero,xmm1[0,3,6,9,12,15] -; AVX2-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[0,3,6,9,12,15] -; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddb %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpaddb %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: 
interleaved_load_vf32_i8_stride3: -; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0> -; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm3 -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm4[2,5,8,11,14],zero,zero,zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13] -; AVX512-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,3,6,9,12,15,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero -; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0] -; AVX512-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm5 -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm6[0,3,6,9,12,15],zero,zero,zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm5[2,5,8,11,14] -; AVX512-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,4,7,10,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14] -; AVX512-NEXT: vpshufb 
{{.*#+}} xmm7 = xmm0[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero -; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX512-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u> -; AVX512-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,5,8,11,14],zero,zero,zero,zero,zero,xmm1[0,3,6,9,12,15] -; AVX512-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[0,3,6,9,12,15] -; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpaddb %ymm0, %ymm5, %ymm0 -; AVX512-NEXT: vpaddb %ymm0, %ymm4, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: interleaved_load_vf32_i8_stride3: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm1 +; AVX-NEXT: vinserti128 $1, 64(%rdi), %ymm0, %ymm2 +; AVX-NEXT: vinserti128 $1, 80(%rdi), %ymm0, %ymm0 +; AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX-NEXT: vpalignr {{.*#+}} ymm3 = 
ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] +; AVX-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] +; AVX-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26] +; AVX-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26] +; AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 +; AVX-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] +; AVX-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retq %wide.vec = load <96 x i8>, <96 x i8>* %ptr %v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> %v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> @@ -1143,87 +1034,47 @@ define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){ ; AVX1-LABEL: interleaved_load_vf16_i8_stride3: ; AVX1: # BB#0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm2[2,5,8,11,14,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm0[u,u,u,u,u] -; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1,4,7,10,13] -; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 -; 
AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm2[0,3,6,9,12,15,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[2,5,8,11,14] -; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,3,6,9,12,15] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX1-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero +; 
AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: interleaved_load_vf16_i8_stride3: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u> -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14],zero,zero,zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm2[1,4,7,10,13] -; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm4[0,3,6,9,12,15],zero,zero,zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14] -; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u> -; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,10,13],zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,zero,zero,xmm0[0,3,6,9,12,15] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: interleaved_load_vf16_i8_stride3: -; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = 
<255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u> -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm1 -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14],zero,zero,zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm1[1,4,7,10,13] -; AVX512-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm3 -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm4[0,3,6,9,12,15],zero,zero,zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14] -; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u> -; AVX512-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,zero,zero,xmm0[0,3,6,9,12,15] -; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpaddb %xmm0, %xmm3, %xmm0 -; AVX512-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: interleaved_load_vf16_i8_stride3: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] +; AVX-NEXT: vpalignr {{.*#+}} xmm0 
= xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] +; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %wide.vec = load <48 x i8>, <48 x i8>* %ptr %v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> %v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> @@ -1236,38 +1087,42 @@ define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){ ; AVX1-LABEL: interleaved_load_vf8_i8_stride3: ; AVX1: # BB#0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u] -; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] -; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddw %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0 -; 
AVX1-NEXT: vzeroupper +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,3,255,6,255,1,255,4,255,7,255,2,255,5,255] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX1-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,0] +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX-LABEL: interleaved_load_vf8_i8_stride3: ; AVX: # BB#0: -; AVX-NEXT: vmovdqa (%rdi), %ymm0 -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u] -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddw %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddw %xmm0, 
%xmm2, %xmm0 -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,3,255,6,255,1,255,4,255,7,255,2,255,5,255] +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,0] +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %wide.vec = load <24 x i8>, <24 x i8>* %ptr %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> Index: test/Transforms/InterleavedAccess/X86/interleavedLoad.ll =================================================================== --- test/Transforms/InterleavedAccess/X86/interleavedLoad.ll +++ test/Transforms/InterleavedAccess/X86/interleavedLoad.ll @@ -4,12 +4,35 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){ ; CHECK-LABEL: @interleaved_load_vf32_i8_stride3( -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <96 x i8>, <96 x i8>* [[PTR:%.*]] -; CHECK-NEXT: [[V1:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> undef, <32 x i32> -; CHECK-NEXT: [[V2:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> undef, <32 x i32> -; CHECK-NEXT: [[V3:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> undef, <32 x i32> -; CHECK-NEXT: [[ADD1:%.*]] = add <32 x i8> [[V1]], [[V2]] -; CHECK-NEXT: [[ADD2:%.*]] = add <32 x i8> [[V3]], [[ADD1]] +; 
CHECK-NEXT: [[TMP1:%.*]] = bitcast <96 x i8>* [[PTR:%.*]] to <16 x i8>* +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 5 +; CHECK-NEXT: [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP9]], <32 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP11]], <32 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP13]], <32 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP14]], <32 x i8> undef, <32 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <32 x i8> [[TMP15]], <32 x i8> undef, <32 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <32 x i8> [[TMP16]], <32 x i8> undef, <32 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <32 x i8> [[TMP19]], <32 x i8> [[TMP17]], <32 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <32 x i8> [[TMP17]], <32 x i8> [[TMP18]], <32 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <32 x i8> [[TMP18]], <32 x i8> [[TMP19]], <32 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <32 x i8> [[TMP21]], <32 x i8> [[TMP20]], <32 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <32 x i8> [[TMP22]], <32 
x i8> [[TMP21]], <32 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <32 x i8> [[TMP20]], <32 x i8> [[TMP22]], <32 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <32 x i8> [[TMP24]], <32 x i8> [[TMP24]], <32 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <32 x i8> [[TMP23]], <32 x i8> [[TMP23]], <32 x i32> +; CHECK-NEXT: [[ADD1:%.*]] = add <32 x i8> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[ADD2:%.*]] = add <32 x i8> [[TMP25]], [[ADD1]] ; CHECK-NEXT: ret <32 x i8> [[ADD2]] ; %wide.vec = load <96 x i8>, <96 x i8>* %ptr @@ -23,12 +46,26 @@ define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){ ; CHECK-LABEL: @interleaved_load_vf16_i8_stride3( -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <48 x i8>, <48 x i8>* [[PTR:%.*]] -; CHECK-NEXT: [[V1:%.*]] = shufflevector <48 x i8> [[WIDE_VEC]], <48 x i8> undef, <16 x i32> -; CHECK-NEXT: [[V2:%.*]] = shufflevector <48 x i8> [[WIDE_VEC]], <48 x i8> undef, <16 x i32> -; CHECK-NEXT: [[V3:%.*]] = shufflevector <48 x i8> [[WIDE_VEC]], <48 x i8> undef, <16 x i32> -; CHECK-NEXT: [[ADD1:%.*]] = add <16 x i8> [[V1]], [[V2]] -; CHECK-NEXT: [[ADD2:%.*]] = add <16 x i8> [[V3]], [[ADD1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <48 x i8>* [[PTR:%.*]] to <16 x i8>* +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i8> 
[[TMP10]], <16 x i8> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> [[TMP13]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP15]], <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[ADD1:%.*]] = add <16 x i8> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[ADD2:%.*]] = add <16 x i8> [[TMP16]], [[ADD1]] ; CHECK-NEXT: ret <16 x i8> [[ADD2]] ; %wide.vec = load <48 x i8>, <48 x i8>* %ptr @@ -42,12 +79,26 @@ define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){ ; CHECK-LABEL: @interleaved_load_vf8_i8_stride3( -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i8>, <24 x i8>* [[PTR:%.*]] -; CHECK-NEXT: [[V1:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32> -; CHECK-NEXT: [[V2:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32> -; CHECK-NEXT: [[V3:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32> -; CHECK-NEXT: [[ADD1:%.*]] = add <8 x i8> [[V1]], [[V2]] -; CHECK-NEXT: [[ADD2:%.*]] = add <8 x i8> [[V3]], [[ADD1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <24 x i8>* [[PTR:%.*]] to <8 x i8>* +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr <8 x i8>, <8 x i8>* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr <8 x i8>, <8 x i8>* [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <8 x i8>, <8 x i8>* [[TMP1]], i32 2 +; CHECK-NEXT: 
[[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> undef, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> undef, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> undef, <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i8> [[TMP10]], <8 x i8> [[TMP8]], <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i8> [[TMP9]], <8 x i8> [[TMP10]], <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i8> [[TMP12]], <8 x i8> [[TMP11]], <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> [[TMP12]], <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> [[TMP13]], <8 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> [[TMP15]], <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP14]], <8 x i8> [[TMP14]], <8 x i32> +; CHECK-NEXT: [[ADD1:%.*]] = add <8 x i8> [[TMP18]], [[TMP16]] +; CHECK-NEXT: [[ADD2:%.*]] = add <8 x i8> [[TMP17]], [[ADD1]] ; CHECK-NEXT: ret <8 x i8> [[ADD2]] ; %wide.vec = load <24 x i8>, <24 x i8>* %ptr