Index: lib/Target/X86/X86InterleavedAccess.cpp
===================================================================
--- lib/Target/X86/X86InterleavedAccess.cpp
+++ lib/Target/X86/X86InterleavedAccess.cpp
@@ -72,6 +72,9 @@
   void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                              SmallVectorImpl<Value *> &TransposedMatrix,
                              unsigned NumSubVecElems);
+  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
+                               SmallVectorImpl<Value *> &TransposedMatrix,
+                               unsigned NumSubVecElems);
 
 public:
   /// In order to form an interleaved access group X86InterleavedAccessGroup
@@ -104,10 +107,13 @@
   unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
   unsigned WideInstSize;
 
-  // Currently, lowering is supported for the following vectors with stride 4:
-  // 1. Store and load of 4-element vectors of 64 bits on AVX.
-  // 2. Store of 16/32-element vectors of 8 bits on AVX.
-  if (!Subtarget.hasAVX() || Factor != 4)
+  // Currently, lowering is supported for the following vectors:
+  // Stride 4:
+  //   1. Store and load of 4-element vectors of 64 bits on AVX.
+  //   2. Store of 16/32-element vectors of 8 bits on AVX.
+  // Stride 3:
+  //   1. Load of 8/16/32-element vectors of 8 bits on AVX.
+  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
     return false;
 
   if (isa<LoadInst>(Inst)) {
@@ -117,13 +123,17 @@
 
   // We support shuffle represents stride 4 for byte type with size of
   // WideInstSize.
-  if (ShuffleElemSize == 64 && WideInstSize == 1024)
+  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
     return true;
 
-  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) &&
+  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
       (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024))
     return true;
 
+  if (ShuffleElemSize == 8 && isa<LoadInst>(Inst) && Factor == 3 &&
+      (WideInstSize == 192 || WideInstSize == 384 || WideInstSize == 768))
+    return true;
+
   return false;
 }
 
@@ -158,11 +168,20 @@
   // Decompose the load instruction.
   LoadInst *LI = cast<LoadInst>(VecInst);
   Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
-  Value *VecBasePtr =
-      Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
-
+  Value *VecBasePtr;
+  unsigned int NumLoads = NumSubVectors;
+  // In the case of stride 3 with a vector of 32 elements, load the
+  // information in the following way:
+  // [0,1,...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
+  if (DL.getTypeSizeInBits(VecTy) == 768) {
+    Type *VecTran =
+        VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo();
+    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran);
+    NumLoads = NumSubVectors * 2;
+  } else
+    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
   // Generate N loads of T type.
-  for (unsigned i = 0; i < NumSubVectors; i++) {
+  for (unsigned i = 0; i < NumLoads; i++) {
     // TODO: Support inbounds GEP.
     Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
     Instruction *NewLoad =
@@ -318,6 +337,151 @@
   TransposedMatrix[3] = Builder.CreateShuffleVector(Low1, High1, ConcatHigh);
 }
 
+// createShuffleStride returns a shuffle mask of size N.
+// The shuffle pattern is as follows:
+// {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
+//  (VF/Lane), (VF/Lane)+Stride%(VF/Lane), ...,
+//  (VF/Lane)+(VF*Stride/Lane)%(VF/Lane)}
+// Where Lane is the # of lanes in a register:
+// VectorSize = 128 => Lane = 1
+// VectorSize = 256 => Lane = 2
+// For example, the shuffle pattern for VF 16 with register size 256
+// (lanes = 2) is {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}.
+static void createShuffleStride(MVT VT, int Stride,
+                                SmallVectorImpl<uint32_t> &Mask) {
+  int VectorSize = VT.getSizeInBits();
+  int VF = VT.getVectorNumElements();
+  int LaneCount = std::max(VectorSize / 128, 1);
+  for (int Lane = 0; Lane < LaneCount; Lane++)
+    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
+      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
+}
+
+// setGroupSize sets 'SizeInfo' to the size (number of elements) of each group
+// inside a shuffle mask. A mask contains exactly 3 groups, where each group
+// is a monotonically increasing sequence with stride 3.
+// For example, shuffle mask {0,3,6,1,4,7,2,5} => {3,3,2}
+static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
+  int VectorSize = VT.getSizeInBits();
+  int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
+  for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
+    int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
+    SizeInfo.push_back(GroupSize);
+    FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
+  }
+}
+
+// DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
+// vpalignr operates within lanes,
+// where Lane is the # of lanes in a register:
+// VectorWide = 128 => Lane = 1
+// VectorWide = 256 => Lane = 2
+// For Lane = 1 the shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
+// For Lane = 2 the shuffle pattern is:
+// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
+// The Imm variable sets the offset amount. The result of the function is
+// stored in the ShuffleMask vector and is built as described above.
+// AlignDirection is a boolean that indicates the direction of the alignment
+// (false - align to the "right" side, true - align to the "left" side).
+static void DecodePALIGNRMask(MVT VT, unsigned Imm,
+                              SmallVectorImpl<uint32_t> &ShuffleMask,
+                              bool AlignDirection = true, bool Unary = false) {
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
+  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0; i != NumLaneElts; ++i) {
+      unsigned Base = i + Offset;
+      // If i+Offset is out of this lane then we actually need the other
+      // source. If Unary, the other source is the first source.
+      if (Base >= NumLaneElts)
+        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
+      ShuffleMask.push_back(Base + l);
+    }
+  }
+}
+
+void X86InterleavedAccessGroup::deinterleave8bitStride3(
+    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
+    unsigned VecElems) {
+
+  // Example: Assuming we start from the following vectors:
+  // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
+  // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
+  // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7
+
+  TransposedMatrix.resize(3);
+  SmallVector<uint32_t, 32> Concat;
+  SmallVector<uint32_t, 32> VPShuf;
+  SmallVector<uint32_t, 32> VPAlign[2];
+  SmallVector<uint32_t, 32> VPAlign2;
+  SmallVector<uint32_t, 32> VPAlign3;
+  SmallVector<uint32_t, 32> GroupSize;
+  Value *Vec[3], *TempVector[3];
+
+  MVT VT = MVT::getVT(Shuffles[0]->getType());
+
+  for (unsigned i = 0; i < VecElems && VecElems == 32; ++i)
+    Concat.push_back(i);
+
+  createShuffleStride(VT, 3, VPShuf);
+  setGroupSize(VT, GroupSize);
+
+  for (int i = 0; i < 2; i++)
+    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);
+
+  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
+  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);
+
+  for (int i = 0; i < 3; i++)
+    Vec[i] = VecElems == 32
+                 ? Builder.CreateShuffleVector(InVec[i], InVec[i + 3], Concat)
+                 : InVec[i];
+
+  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
+  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
+  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
+
+  for (int i = 0; i < 3; i++)
+    Vec[i] = Builder.CreateShuffleVector(
+        Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);
+
+  // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
+  // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
+  // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7
+
+  for (int i = 0; i < 3; i++)
+    TempVector[i] =
+        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);
+
+  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
+  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
+  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
+
+  for (int i = 0; i < 3; i++)
+    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
+                                         VPAlign[1]);
+
+  // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
+  // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
+  // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
+
+  Value *TempVec = Builder.CreateShuffleVector(
+      Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
+  TransposedMatrix[0] = Builder.CreateShuffleVector(
+      Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
+  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
+  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
+
+  return;
+}
+
 void X86InterleavedAccessGroup::transpose_4x4(
     ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix) {
@@ -360,10 +524,25 @@
   // Try to generate target-sized register(/instruction).
   decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
 
+  Type *ShuffleEltTy = Inst->getType();
+  unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
   // Perform matrix-transposition in order to compute interleaved
   // results by generating some sort of (optimized) target-specific
   // instructions.
-  transpose_4x4(DecomposedVectors, TransposedVectors);
+
+  switch (NumSubVecElems) {
+  default:
+    return false;
+  case 4:
+    transpose_4x4(DecomposedVectors, TransposedVectors);
+    break;
+  case 8:
+  case 16:
+  case 32:
+    deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
+                            NumSubVecElems);
+    break;
+  }
 
   // Now replace the unoptimized-interleaved-vectors with the
   // transposed-interleaved vectors.
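
A quick way to sanity-check the mask helpers above, before reading the updated CHECK lines, is to evaluate them for VF = 16 on a 128-bit vector. The following standalone C++ sketch is not part of the patch: it mirrors createShuffleStride and setGroupSize using plain integers instead of LLVM's MVT, and the helper names shuffleStride/groupSizes are made up for illustration.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Mirrors createShuffleStride: a per-lane strided gather mask.
static void shuffleStride(int VF, int VectorSize, int Stride,
                          std::vector<unsigned> &Mask) {
  int LaneCount = std::max(VectorSize / 128, 1);
  for (int Lane = 0; Lane < LaneCount; Lane++)
    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
}

// Mirrors setGroupSize: sizes of the three monotonic groups in that mask.
static void groupSizes(int VF, int VectorSize, std::vector<unsigned> &Size) {
  int LaneVF = VF / std::max(VectorSize / 128, 1);
  for (int i = 0, First = 0; i < 3; i++) {
    int G = (int)std::ceil((LaneVF - First) / 3.0);
    Size.push_back(G);
    First = (G * 3 + First) % LaneVF;
  }
}

int main() {
  std::vector<unsigned> Mask, Size;
  shuffleStride(/*VF=*/16, /*VectorSize=*/128, /*Stride=*/3, Mask);
  groupSizes(16, 128, Size);
  // Prints: 0 3 6 9 12 15 2 5 8 11 14 1 4 7 10 13   and   6 5 5
  for (unsigned M : Mask) std::printf("%u ", M);
  std::printf("\n");
  for (unsigned S : Size) std::printf("%u ", S);
  std::printf("\n");
  return 0;
}

The first line matches the vpshufb constant [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] that appears in the new CHECK lines below, and the group sizes {6,5,5} account for the palignr immediates there: DecodePALIGNRMask with Imm = GroupSize[2] = 5 and AlignDirection = false flips the offset to 16 - 5 = 11, which is where the xmm2[11,12,13,14,15],xmm0[0,...,10] patterns come from.
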
Index: test/CodeGen/X86/x86-interleaved-access.ll
===================================================================
--- test/CodeGen/X86/x86-interleaved-access.ll
+++ test/CodeGen/X86/x86-interleaved-access.ll
@@ -965,172 +965,72 @@
 define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
 ; AVX1-LABEL: interleaved_load_vf32_i8_stride3:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm1
-; AVX1-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX1-NEXT: vmovdqa 64(%rdi), %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm4[1,4,7,10,13]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero
-; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
-; AVX1-NEXT: vandnps %ymm2, %ymm8, %ymm10
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm5[2,5,8,11,14,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm1[u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
-; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm6
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,4,7,10,13]
-; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[0,3,6,9,12,15,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2
-; AVX1-NEXT: vandps %ymm8, %ymm2, %ymm2
-; AVX1-NEXT: vorps %ymm10, %ymm2, %ymm8
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm4[2,5,8,11,14]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero
-; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
-; AVX1-NEXT: vandnps %ymm2, %ymm10, %ymm11
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm5[0,3,6,9,12,15,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,5,8,11,14]
-; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[1,4,7,10,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
-; AVX1-NEXT: vandps %ymm10, %ymm2, %ymm2
-; AVX1-NEXT: vorps %ymm11, %ymm2, %ymm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,zero,zero,zero,xmm4[0,3,6,9,12,15]
-; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX1-NEXT: vandnps %ymm3, %ymm10, %ymm3
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm5[1,4,7,10,13,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,5,8,11,14],zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,3,6,9,12,15]
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX1-NEXT: vmovdqa 64(%rdi), %xmm4
+; AVX1-NEXT: vmovdqa 80(%rdi), %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm5[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm8
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm2
+; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2
+; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm5
+; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
+; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm6
+; AVX1-NEXT: vpor %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vandps %ymm10, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm8, %xmm0
+; AVX1-NEXT: vpaddb %xmm9, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
-; AVX2-LABEL: interleaved_load_vf32_i8_stride3:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0>
-; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm3
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm4[2,5,8,11,14],zero,zero,zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]
-; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,3,6,9,12,15,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero
-; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 =
-; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm5
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm6[0,3,6,9,12,15],zero,zero,zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm5[2,5,8,11,14]
-; AVX2-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,4,7,10,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero
-; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u>
-; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13],zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,5,8,11,14],zero,zero,zero,zero,zero,xmm1[0,3,6,9,12,15]
-; AVX2-NEXT: vpor %xmm6, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[0,3,6,9,12,15]
-; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddb %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpaddb %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: interleaved_load_vf32_i8_stride3:
-; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0>
-; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm3
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm4[2,5,8,11,14],zero,zero,zero,zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]
-; AVX512-NEXT: vpor %xmm5, %xmm3, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,3,6,9,12,15,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero
-; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 =
-; AVX512-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm5
-; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm6[0,3,6,9,12,15],zero,zero,zero,zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm5[2,5,8,11,14]
-; AVX512-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,4,7,10,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero
-; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u>
-; AVX512-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,5,8,11,14],zero,zero,zero,zero,zero,xmm1[0,3,6,9,12,15]
-; AVX512-NEXT: vpor %xmm6, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[0,3,6,9,12,15]
-; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpaddb %ymm0, %ymm5, %ymm0
-; AVX512-NEXT: vpaddb %ymm0, %ymm4, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: interleaved_load_vf32_i8_stride3:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
+; AVX-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
+; AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
+; AVX-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
+; AVX-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
+; AVX-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
+; AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
+; AVX-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
+; AVX-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
 %wide.vec = load <96 x i8>, <96 x i8>* %ptr
 %v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32>
 %v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32>
@@ -1143,87 +1043,47 @@
 define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
 ; AVX1-LABEL: interleaved_load_vf16_i8_stride3:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm0
-; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm2[2,5,8,11,14,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm0[u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1,4,7,10,13]
-; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm2[0,3,6,9,12,15,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[2,5,8,11,14]
-; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,3,6,9,12,15]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
-; AVX2-LABEL: interleaved_load_vf16_i8_stride3:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u>
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14],zero,zero,zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm2[1,4,7,10,13]
-; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 =
-; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm4[0,3,6,9,12,15],zero,zero,zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14]
-; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u>
-; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,10,13],zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,zero,zero,xmm0[0,3,6,9,12,15]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpaddb %xmm0, %xmm3, %xmm0
-; AVX2-NEXT: vpaddb %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: interleaved_load_vf16_i8_stride3:
-; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u>
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm1
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14],zero,zero,zero,zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm1[1,4,7,10,13]
-; AVX512-NEXT: vpor %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 =
-; AVX512-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm3
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm4[0,3,6,9,12,15],zero,zero,zero,zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14]
-; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u>
-; AVX512-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,zero,zero,xmm0[0,3,6,9,12,15]
-; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpaddb %xmm0, %xmm3, %xmm0
-; AVX512-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: interleaved_load_vf16_i8_stride3:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
+; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
 %wide.vec = load <48 x i8>, <48 x i8>* %ptr
 %v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32>
 %v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32>
@@ -1236,38 +1096,42 @@
 define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
 ; AVX1-LABEL: interleaved_load_vf8_i8_stride3:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
-; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
-; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,3,255,6,255,1,255,4,255,7,255,2,255,5,255]
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX1-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX-LABEL: interleaved_load_vf8_i8_stride3:
 ; AVX: # BB#0:
-; AVX-NEXT: vmovdqa (%rdi), %ymm0
-; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
-; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,3,255,6,255,1,255,4,255,7,255,2,255,5,255]
+; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 %wide.vec = load <24 x i8>, <24 x i8>* %ptr
 %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32>
Index: test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
===================================================================
--- test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
+++ test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
@@ -4,12 +4,35 @@
 define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
 ; AVX2-LABEL: @interleaved_load_vf32_i8_stride3(
-; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <96 x i8>, <96 x i8>* [[PTR:%.*]]
-; AVX2-NEXT: [[V1:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> undef, <32 x i32>
-; AVX2-NEXT: [[V2:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> undef, <32 x i32>
-; AVX2-NEXT: [[V3:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> undef, <32 x i32>
-; AVX2-NEXT: [[ADD1:%.*]] = add <32 x i8> [[V1]], [[V2]]
-; AVX2-NEXT: [[ADD2:%.*]] = add <32 x i8> [[V3]], [[ADD1]]
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast <96 x i8>* [[PTR:%.*]] to <16 x i8>*
+; AVX2-NEXT: [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
+; AVX2-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]]
+; AVX2-NEXT: [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]]
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
+; AVX2-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]]
+; AVX2-NEXT: [[TMP8:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 3
+; AVX2-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]]
+; AVX2-NEXT: [[TMP10:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 4
+; AVX2-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]]
+; AVX2-NEXT: [[TMP12:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 5
+; AVX2-NEXT: [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]]
+; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP9]], <32 x i32>
+; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP11]], <32 x i32>
+; AVX2-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP13]], <32 x i32>
+; AVX2-NEXT: [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP14]], <32 x i8> undef, <32 x i32>
+; AVX2-NEXT: [[TMP18:%.*]] = shufflevector <32 x i8> [[TMP15]], <32 x i8> undef, <32 x i32>
+; AVX2-NEXT: [[TMP19:%.*]] = shufflevector <32 x i8> [[TMP16]], <32 x i8> undef, <32 x i32>
+; AVX2-NEXT: [[TMP20:%.*]] = shufflevector <32 x i8> [[TMP19]], <32 x i8> [[TMP17]], <32 x i32>
+; AVX2-NEXT: [[TMP21:%.*]] = shufflevector <32 x i8> [[TMP17]], <32 x i8> [[TMP18]], <32 x i32>
+; AVX2-NEXT: [[TMP22:%.*]] = shufflevector <32 x i8> [[TMP18]], <32 x i8> [[TMP19]], <32 x i32>
+; AVX2-NEXT: [[TMP23:%.*]] = shufflevector <32 x i8> [[TMP21]], <32 x i8> [[TMP20]], <32 x i32>
+; AVX2-NEXT: [[TMP24:%.*]] = shufflevector <32 x i8> [[TMP22]], <32 x i8> [[TMP21]], <32 x i32>
+; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <32 x i8> [[TMP20]], <32 x i8> [[TMP22]], <32 x i32>
+; AVX2-NEXT: [[TMP26:%.*]] = shufflevector <32 x i8> [[TMP24]], <32 x i8> undef, <32 x i32>
+; AVX2-NEXT: [[TMP27:%.*]] = shufflevector <32 x i8> [[TMP23]], <32 x i8> undef, <32 x i32>
+; AVX2-NEXT: [[ADD1:%.*]] = add <32 x i8> [[TMP27]], [[TMP26]]
+; AVX2-NEXT: [[ADD2:%.*]] = add <32 x i8> [[TMP25]], [[ADD1]]
 ; AVX2-NEXT: ret <32 x i8> [[ADD2]]
 ;
 %wide.vec = load <96 x i8>, <96 x i8>* %ptr
@@ -23,12 +46,26 @@
 define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
 ; AVX2-LABEL: @interleaved_load_vf16_i8_stride3(
-; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <48 x i8>, <48 x i8>* [[PTR:%.*]]
-; AVX2-NEXT: [[V1:%.*]] = shufflevector <48 x i8> [[WIDE_VEC]], <48 x i8> undef, <16 x i32>
-; AVX2-NEXT: [[V2:%.*]] = shufflevector <48 x i8> [[WIDE_VEC]], <48 x i8> undef, <16 x i32>
-; AVX2-NEXT: [[V3:%.*]] = shufflevector <48 x i8> [[WIDE_VEC]], <48 x i8> undef, <16 x i32>
-; AVX2-NEXT: [[ADD1:%.*]] = add <16 x i8> [[V1]], [[V2]]
-; AVX2-NEXT: [[ADD2:%.*]] = add <16 x i8> [[V3]], [[ADD1]]
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast <48 x i8>* [[PTR:%.*]] to <16 x i8>*
+; AVX2-NEXT: [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
+; AVX2-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]]
+; AVX2-NEXT: [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]]
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
+; AVX2-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]]
+; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32>
+; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> undef, <16 x i32>
+; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> undef, <16 x i32>
+; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP8]], <16 x i32>
+; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32>
+; AVX2-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32>
+; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP11]], <16 x i32>
+; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP12]], <16 x i32>
+; AVX2-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> [[TMP13]], <16 x i32>
+; AVX2-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> undef, <16 x i32>
+; AVX2-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> undef, <16 x i32>
+; AVX2-NEXT: [[ADD1:%.*]] = add <16 x i8> [[TMP18]], [[TMP17]]
+; AVX2-NEXT: [[ADD2:%.*]] = add <16 x i8> [[TMP16]], [[ADD1]]
 ; AVX2-NEXT: ret <16 x i8> [[ADD2]]
 ;
 %wide.vec = load <48 x i8>, <48 x i8>* %ptr
@@ -42,12 +79,26 @@
 define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
 ; AVX2-LABEL: @interleaved_load_vf8_i8_stride3(
-; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <24 x i8>, <24 x i8>* [[PTR:%.*]]
-; AVX2-NEXT: [[V1:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32>
-; AVX2-NEXT: [[V2:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32>
-; AVX2-NEXT: [[V3:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32>
-; AVX2-NEXT: [[ADD1:%.*]] = add <8 x i8> [[V1]], [[V2]]
-; AVX2-NEXT: [[ADD2:%.*]] = add <8 x i8> [[V3]], [[ADD1]]
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast <24 x i8>* [[PTR:%.*]] to <8 x i8>*
+; AVX2-NEXT: [[TMP2:%.*]] = getelementptr <8 x i8>, <8 x i8>* [[TMP1]], i32 0
+; AVX2-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]]
+; AVX2-NEXT: [[TMP4:%.*]] = getelementptr <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]]
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr <8 x i8>, <8 x i8>* [[TMP1]], i32 2
+; AVX2-NEXT: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[TMP6]]
+; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> undef, <8 x i32>
+; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> undef, <8 x i32>
+; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> undef, <8 x i32>
+; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <8 x i8> [[TMP10]], <8 x i8> [[TMP8]], <8 x i32>
+; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <8 x i32>
+; AVX2-NEXT: [[TMP13:%.*]] = shufflevector <8 x i8> [[TMP9]], <8 x i8> [[TMP10]], <8 x i32>
+; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <8 x i8> [[TMP12]], <8 x i8> [[TMP11]], <8 x i32>
+; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> [[TMP12]], <8 x i32>
+; AVX2-NEXT: [[TMP16:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> [[TMP13]], <8 x i32>
+; AVX2-NEXT: [[TMP17:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> undef, <8 x i32>
+; AVX2-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP14]], <8 x i8> undef, <8 x i32>
+; AVX2-NEXT: [[ADD1:%.*]] = add <8 x i8> [[TMP18]], [[TMP16]]
+; AVX2-NEXT: [[ADD2:%.*]] = add <8 x i8> [[TMP17]], [[ADD1]]
 ; AVX2-NEXT: ret <8 x i8> [[ADD2]]
 ;
 %wide.vec = load <24 x i8>, <24 x i8>* %ptr