Index: include/llvm/Analysis/VectorUtils.h =================================================================== --- include/llvm/Analysis/VectorUtils.h +++ include/llvm/Analysis/VectorUtils.h @@ -150,8 +150,12 @@ /// For example, the mask for Start = 0, Stride = 2, and VF = 4 is: /// /// <0, 2, 4, 6> +/// A NumVecs greater than 1 concatenates NumVecs consecutive stride masks. +/// For example, NumVecs = 2 with the above specification will generate: +/// <0, 2, 4, 6, 1, 3, 5, 7> Constant *createStrideMask(IRBuilder<> &Builder, unsigned Start, - unsigned Stride, unsigned VF); + unsigned Stride, unsigned VF, + unsigned NumVecs = 1); /// \brief Create a sequential shuffle mask. /// @@ -176,6 +180,18 @@ /// elements, it will be padded with undefs. Value *concatenateVectors(IRBuilder<> &Builder, ArrayRef Vecs); +/// \brief Concatenate two vectors; keeps the shuffle mask as it is. +/// +/// This function concatenates two vectors having the same element type. +/// If the second vector has fewer elements than the first, it is padded with +/// undefs. +Value *concatenateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2); + +/// \brief Extracts a vector of \p NumElements (with the same element type as +/// V) from the \p BeginIndex of \p V. 
+Value *extractVector(IRBuilder<> &Builder, Value *V, unsigned BeginIndex, + unsigned NumElements); + } // llvm namespace #endif Index: lib/Analysis/VectorUtils.cpp =================================================================== --- lib/Analysis/VectorUtils.cpp +++ lib/Analysis/VectorUtils.cpp @@ -501,12 +501,15 @@ } Constant *llvm::createStrideMask(IRBuilder<> &Builder, unsigned Start, - unsigned Stride, unsigned VF) { + unsigned Stride, unsigned VF, + unsigned NumVecs) { SmallVector Mask; - for (unsigned i = 0; i < VF; i++) - Mask.push_back(Builder.getInt32(Start + i * Stride)); + for (unsigned j = 0; j < NumVecs; ++Start,++j) + for (unsigned i = 0; i < VF; i++) + Mask.push_back(Builder.getInt32(Start + i * Stride)); return ConstantVector::get(Mask); + } Constant *llvm::createSequentialMask(IRBuilder<> &Builder, unsigned Start, @@ -522,10 +525,7 @@ return ConstantVector::get(Mask); } -/// A helper function for concatenating vectors. This function concatenates two -/// vectors having the same element type. If the second vector has fewer -/// elements than the first, it is padded with undefs. 
-static Value *concatenateTwoVectors(IRBuilder<> &Builder, Value *V1, +Value *llvm::concatenateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2) { VectorType *VecTy1 = dyn_cast(V1->getType()); VectorType *VecTy2 = dyn_cast(V2->getType()); @@ -574,3 +574,22 @@ return ResList[0]; } + +Value *llvm::extractVector(IRBuilder<> &Builder, Value *V, + unsigned BeginIndex, unsigned NumElements) { + VectorType *VecTy = cast(V->getType()); + assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); + + if (NumElements == VecTy->getNumElements()) + return V; + + if (NumElements == 1) { + V = Builder.CreateExtractElement(V, Builder.getInt32(BeginIndex)); + return V; + } + + V = Builder.CreateShuffleVector( + V, UndefValue::get(V->getType()), + createSequentialMask(Builder, BeginIndex, NumElements, 0)); + return V; +} Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -1450,16 +1450,20 @@ /// Generate unpacklo/unpackhi shuffle mask. template void createUnpackShuffleMask(MVT VT, SmallVectorImpl &Mask, bool Lo, - bool Unary) { - assert(Mask.empty() && "Expected an empty shuffle mask vector"); + bool Unary, unsigned VecLen = 128, + unsigned NumEltsToUnpack = 1) { int NumElts = VT.getVectorNumElements(); - int NumEltsInLane = 128 / VT.getScalarSizeInBits(); - for (int i = 0; i < NumElts; ++i) { + int NumEltsInLane = VecLen / VT.getScalarSizeInBits(); + + for (int i = 0; i < NumElts / NumEltsToUnpack; ++i) { unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; - int Pos = (i % NumEltsInLane) / 2 + LaneStart; + int Pos = ((i % NumEltsInLane) / 2 + LaneStart) * NumEltsToUnpack; + Pos += (Unary ? 0 : NumElts * (i % 2)); Pos += (Lo ? 
0 : NumEltsInLane / 2); Mask.push_back(Pos); + for (int j = 1; j < NumEltsToUnpack; j++) + Mask.push_back(++Pos); } } Index: lib/Target/X86/X86InterleavedAccess.cpp =================================================================== --- lib/Target/X86/X86InterleavedAccess.cpp +++ lib/Target/X86/X86InterleavedAccess.cpp @@ -20,6 +20,117 @@ using namespace llvm; namespace { +class X86InterleavedUtils { +public: + /// \brief Composes vectors by splitting the group of InputVectors into two + /// groups and then merging corresponding vectors. Returns the merged + /// vectors in \p MergedVectors. The number of merged vectors is + /// half the number of InputVectors. + /// E.g. For InputVectors with size 4, + /// Group1: InputVectors[0], InputVectors[1] + /// Group2: InputVectors[2], InputVectors[3] + /// + /// MergedVectors[0] = InputVectors[0], InputVectors[2] + /// MergedVectors[1] = InputVectors[1], InputVectors[3] + /// TODO: Support splitting into quarters, double quarters, etc. + static void compose(IRBuilder<> &Builder, ArrayRef InputVectors, + SmallVectorImpl &MergedVectors) { + unsigned TotalVectors = InputVectors.size(); + + assert(TotalVectors % 2 == 0 && "Unexpected number of input vectors!!!"); + + for (unsigned i = 0, e = TotalVectors / 2; i < e; ++i) + MergedVectors.push_back(concatenateTwoVectors( + Builder, InputVectors[i], InputVectors[TotalVectors / 2 + i])); + } + + /// \brief Recursively packs \p NumElemToUnpack elements from the low- and + /// high-order parts of each two vectors of \p InputVectors until the number + /// of packed elements matches \p TotalNumElem. Returns the packed vectors + /// that are generated in the last invocation of pack() in \p OutVectors. + /// During recursive calls it packs the elements from two low-ordered or two + /// high-ordered vectors. + /// E.g. 
+ /// NumElemToUnpack = 4 + /// TotalNumElem = 16 + /// InputVectors: { v1, v2, v3, v4} + /// + /// 1st Round: + /// PV[0] (L): v1[0-3] v2[0-3] v1[4-7] v2[4-7] + /// PV[1] (H): v1[8-11] v2[8-11] v1[12-15] v2[12-15] + /// + /// PV[2] (L): v3[0-3] v4[0-3] v3[4-7] v4[4-7] + /// PV[3] (H): v3[8-11] v4[8-11] v3[12-15] v4[12-15] + /// 2nd Round: + /// PV[0] = OutVectors[0] = (L): pv[0][0-7] pv[2][0-7] + /// PV[1] = OutVectors[1] = (H): pv[0][8-15] pv[2][8-15] + /// + /// PV[2] = OutVectors[2] = (L): pv[1][0-7] pv[3][0-7] + /// PV[3] = OutVectors[3] = (H): pv[1][8-15] pv[3][8-15] + static void pack(IRBuilder<> &Builder, SmallVectorImpl &InputVectors, + Type *Ty, unsigned NumElemToUnpack, unsigned TotalNumElem, + SmallVectorImpl &OutVectors, const DataLayout &DL) { + assert(InputVectors.size() % 2 == 0 && + "Unexpected number of InputVectors!!!"); + SmallVector LowVecs, HighVecs; + unsigned VecLen = DL.getTypeSizeInBits(Ty); + MVT VT = MVT::getVT(Ty); + for (unsigned i = 0, e = InputVectors.size(); i < e; i += 2) { + SmallVector LowMask, HighMask; + + createUnpackShuffleMask(VT, LowMask, true, false, VecLen, + NumElemToUnpack); + Value *Low = Builder.CreateShuffleVector( + InputVectors[i], InputVectors[i + 1], makeArrayRef(LowMask)); + LowVecs.push_back(Low); + + createUnpackShuffleMask(VT, HighMask, false, false, VecLen, + NumElemToUnpack); + Value *High = Builder.CreateShuffleVector( + InputVectors[i], InputVectors[i + 1], makeArrayRef(HighMask)); + HighVecs.push_back(High); + } + + OutVectors.clear(); + if ((NumElemToUnpack * 2) == TotalNumElem) { + for (unsigned i = 0, e = LowVecs.size(); i < e; ++i) { + OutVectors.push_back(LowVecs[i]); + OutVectors.push_back(HighVecs[i]); + } + } else { + // Keep all the lows and highs together + for (auto Low : LowVecs) + OutVectors.push_back(Low); + for (auto High : HighVecs) + OutVectors.push_back(High); + + pack(Builder, InputVectors = OutVectors, Ty, 2 * NumElemToUnpack, + TotalNumElem, OutVectors, DL); + } + } + + // Concatenate 
two ConstantDataSequentials. + static Constant *concatenate(IRBuilder<> &Builder, Constant *Mask1, + Constant *Mask2) { + const ConstantDataSequential *CDS1 = + dyn_cast(Mask1); + + const ConstantDataSequential *CDS2 = + dyn_cast(Mask2); + + assert(CDS1 && CDS2 && "Expected ConstantDataSequential!!!"); + + SmallVector Mask; + for (unsigned i = 0, e = CDS1->getNumElements(); i < e; ++i) + Mask.push_back(Builder.getInt32(CDS1->getElementAsInteger(i))); + + for (unsigned i = 0, e = CDS2->getNumElements(); i < e; ++i) + Mask.push_back(Builder.getInt32(CDS2->getElementAsInteger(i))); + + return ConstantVector::get(Mask); + } +}; + /// \brief This class holds necessary information to represent an interleaved /// access group and supports utilities to lower the group into /// X86-specific instructions/intrinsics. @@ -51,8 +162,8 @@ /// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors. - void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T, - SmallVectorImpl &DecomposedVectors); + void decompose(Value *Inst, unsigned NumSubVectors, VectorType *T, + SmallVectorImpl &DecomposedVectors); /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and /// returns the transposed-vectors in \p TransposedVectors. 
@@ -67,10 +178,11 @@ /// Out-V1 = p2, q2, r2, s2 /// Out-V2 = p3, q3, r3, s3 /// Out-V3 = P4, q4, r4, s4 - void transpose_4x4(ArrayRef InputVectors, + void transpose_4x4(ArrayRef InputVectors, SmallVectorImpl &TransposedMatrix); - void interleave8bit_32x4(ArrayRef InputVectors, - SmallVectorImpl &TransposedMatrix); + void interleave8bit_32x4(ArrayRef InputVectors, + SmallVectorImpl &TransposedMatrix); + public: /// In order to form an interleaved access group X86InterleavedAccessGroup /// requires a wide-load instruction \p 'I', a group of interleaved-vectors @@ -93,44 +205,73 @@ /// \brief Lowers this interleaved access group into X86-specific /// instructions/intrinsics. bool lowerIntoOptimizedSequence(); + + /// \brief Lowers "load" interleaved access group into X86-specific + /// instructions/intrinsics. + void lowerLoadIntoOptimizedSequence(); + + /// \brief Lowers interleaved access group as 128bit vector. + void lower_as128(SmallVectorImpl &InterleavedVectors); }; } // end anonymous namespace bool X86InterleavedAccessGroup::isSupported() const { + if (!Subtarget.hasAVX() || Factor != 4) + return false; + VectorType *ShuffleVecTy = Shuffles[0]->getType(); Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType(); + unsigned ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy); unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy); - unsigned SupportedNumElem = 4; - if (ShuffleElemSize == 8) - SupportedNumElem = 32; + unsigned SupportedNumElem; unsigned WideInstSize; // Currently, lowering is supported for the following vectors: - // 1. 4-element vectors of 64 bits on AVX. - // 2. 32-element vectors of 8 bits on AVX. + // 1. 64 bits: VF = 4. + // 2. 
8bits: Store: VF = 32 + // Load: VF = 8, 16, 32 + + switch (ShuffleElemSize) { + default: + return false; + + case 8: + if (isa(Inst)) { + if (Indices.size() < 2) + return false; + WideInstSize = DL.getTypeSizeInBits(Inst->getType()); + + for (unsigned i = 3; i < 6; i++) + if (ShuffleVecSize == ((1u << i) * ShuffleElemSize) && + WideInstSize >= (Factor * ShuffleVecSize)) + return true; + return false; + } else + SupportedNumElem = 32; + break; + + case 64: + SupportedNumElem = 4; + break; + } + if (isa(Inst)) { - if (DL.getTypeSizeInBits(ShuffleVecTy) != - SupportedNumElem * ShuffleElemSize) + if (ShuffleVecSize != SupportedNumElem * ShuffleElemSize) return false; WideInstSize = DL.getTypeSizeInBits(Inst->getType()); } else WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType()); - if (DL.getTypeSizeInBits(ShuffleEltTy) == 8 && !isa(Inst)) - return false; - - if (!Subtarget.hasAVX() || Factor != 4 || - (ShuffleElemSize != 64 && ShuffleElemSize != 8) || - WideInstSize != (Factor * ShuffleElemSize * SupportedNumElem)) + if (WideInstSize != (Factor * ShuffleElemSize * SupportedNumElem)) return false; return true; } void X86InterleavedAccessGroup::decompose( - Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy, - SmallVectorImpl &DecomposedVectors) { + Value *VecInst, unsigned NumSubVectors, VectorType *SubVecTy, + SmallVectorImpl &DecomposedVectors) { assert((isa(VecInst) || isa(VecInst)) && "Expected Load or Shuffle"); @@ -148,11 +289,10 @@ // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type. 
for (unsigned i = 0; i < NumSubVectors; ++i) - DecomposedVectors.push_back( - cast(Builder.CreateShuffleVector( - Op0, Op1, - createSequentialMask(Builder, Indices[i], - SubVecTy->getVectorNumElements(), 0)))); + DecomposedVectors.push_back(Builder.CreateShuffleVector( + Op0, Op1, + createSequentialMask(Builder, Indices[i], + SubVecTy->getVectorNumElements(), 0))); return; } @@ -193,8 +333,7 @@ } void X86InterleavedAccessGroup::interleave8bit_32x4( - ArrayRef Matrix, - SmallVectorImpl &TransposedMatrix) { + ArrayRef Matrix, SmallVectorImpl &TransposedMatrix) { // Example: Assuming we start from the following vectors: // Matrix[0]= c0 c1 c2 c3 c4 ... c31 @@ -280,8 +419,7 @@ } void X86InterleavedAccessGroup::transpose_4x4( - ArrayRef Matrix, - SmallVectorImpl &TransposedMatrix) { + ArrayRef Matrix, SmallVectorImpl &TransposedMatrix) { assert(Matrix.size() == 4 && "Invalid matrix size"); TransposedMatrix.resize(4); @@ -310,27 +448,110 @@ TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask); } -// Lowers this interleaved access group into X86-specific -// instructions/intrinsics. -bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { - SmallVector DecomposedVectors; - SmallVector TransposedVectors; +void X86InterleavedAccessGroup::lower_as128( + SmallVectorImpl &InterleavedVectors) { + SmallVector DecomposedVectors; + VectorType *ShuffleTy = Shuffles[0]->getType(); + Type *ShuffleEltTy = ShuffleTy->getVectorElementType(); + Type *WideInstTy = Inst->getType(); - if (isa(Inst)) { + // The general idea is to optimize as a 128bit vector. + VectorType *V128Ty = VectorType::get(ShuffleEltTy, 16); + unsigned PreferredNumElems = 16; + + unsigned NumElems = ShuffleTy->getVectorNumElements(); + unsigned NumSubVecs = WideInstTy->getVectorNumElements() / PreferredNumElems; + + // Step-1: Generate 128bit loads. + decompose(Inst, NumSubVecs, V128Ty, DecomposedVectors); + + // Step-2: Shuffle in the interleaved elements of a vector. 
+ Constant *StrideMask = createStrideMask(Builder, 0, Factor, Factor, Factor); + if (NumSubVecs > Factor) { + // If the number of sub-vectors is greater than the Factor, sub-vectors + // need to be composed first before performing any other operations. + // This will reduce the number of total instructions by working on more + // elements at a time. + SmallVector InputVectors = DecomposedVectors; + DecomposedVectors.clear(); + X86InterleavedUtils::compose(Builder, InputVectors, DecomposedVectors); + StrideMask = X86InterleavedUtils::concatenate( + Builder, StrideMask, + createStrideMask(Builder, PreferredNumElems, Factor, Factor, Factor)); + } + + SmallVector ShuffledVectors; + unsigned NumVecsToShuffle = DecomposedVectors.size(); + Type *Ty = DecomposedVectors[0]->getType(); + for (unsigned i = 0; i < NumVecsToShuffle; ++i) + ShuffledVectors.push_back(Builder.CreateShuffleVector( + DecomposedVectors[i], UndefValue::get(Ty), StrideMask)); + + // Step-3: Shuffle in the interleaved elements across vectors. + SmallVector PackedVectors; + X86InterleavedUtils::pack(Builder, ShuffledVectors, + ShuffledVectors[0]->getType(), Factor, NumElems, + PackedVectors, DL); + + // Step-4: Collect the interleaved vectors in the destination vectors. 
+ unsigned NumPackedVecs = PackedVectors.size(); + for (unsigned i = 0; i < Factor;) { + if (NumPackedVecs < Factor) { + // Each packed vector holds several interleaved vectors; extract them. + for (unsigned j = 0; j < NumPackedVecs; ++j) + InterleavedVectors.push_back(extractVector( + Builder, PackedVectors[i / NumPackedVecs], j * NumElems, NumElems)); + i += 2; + continue; + } + + InterleavedVectors.push_back(PackedVectors[i]); + ++i; + } +} + +void X86InterleavedAccessGroup::lowerLoadIntoOptimizedSequence() { + assert(Factor == 4 && "Unexpected Factor!!!"); + + SmallVector DecomposedVectors; + SmallVector InterleavedVectors; + + VectorType *ShuffleTy = Shuffles[0]->getType(); + Type *ShuffleEltTy = ShuffleTy->getVectorElementType(); + unsigned ElemSize = DL.getTypeSizeInBits(ShuffleEltTy); + + switch (ElemSize) { + default: + assert(false && "Unexpected element size!!!\n"); + case 8: + lower_as128(InterleavedVectors); + break; + + case 64: // Try to generate target-sized register(/instruction). decompose(Inst, Factor, ShuffleTy, DecomposedVectors); // Perform matrix-transposition in order to compute interleaved // results by generating some sort of (optimized) target-specific // instructions. - transpose_4x4(DecomposedVectors, TransposedVectors); + transpose_4x4(DecomposedVectors, InterleavedVectors); + break; + } - // Now replace the unoptimized-interleaved-vectors with the - // transposed-interleaved vectors. - for (unsigned i = 0, e = Shuffles.size(); i < e; ++i) - Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]); + // Now replace the unoptimized-interleaved-vectors with the + // transposed-interleaved vectors. 
+ for (unsigned i = 0, e = Shuffles.size(); i < e; ++i) + Shuffles[i]->replaceAllUsesWith(InterleavedVectors[Indices[i]]); +} +bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { + SmallVector DecomposedVectors; + SmallVector TransposedVectors; + VectorType *ShuffleTy = Shuffles[0]->getType(); + + if (isa(Inst)) { + lowerLoadIntoOptimizedSequence(); return true; } Index: test/CodeGen/X86/x86-interleaved-access.ll =================================================================== --- test/CodeGen/X86/x86-interleaved-access.ll +++ test/CodeGen/X86/x86-interleaved-access.ll @@ -453,3 +453,332 @@ store <64 x i8> %interleaved.vec, <64 x i8>* %p ret void } + +define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) { +; AVX1-LABEL: interleaved_load_vf8_i8_stride4: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX-LABEL: interleaved_load_vf8_i8_stride4: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = 
[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16 + %v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> + %v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> + %v3 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> + %v4 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> + + %add1 = add <8 x i8> %v1, %v2 + %add2 = add <8 x i8> %v4, %v3 + %add3 = mul <8 x i8> %add1, %add2 + ret <8 x i8> %add3 +} + +define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) { +; AVX1-LABEL: interleaved_load_vf16_i8_stride4: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = 
xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm1[0] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-NEXT: vpcmpeqb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: interleaved_load_vf16_i8_stride4: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm1[0] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-NEXT: vpcmpeqb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq 
+; +; AVX3-LABEL: interleaved_load_vf16_i8_stride4: +; AVX3: # BB#0: +; AVX3-NEXT: vmovdqa (%rdi), %xmm0 +; AVX3-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX3-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX3-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX3-NEXT: vmovdqa {{.*#+}} xmm4 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX3-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX3-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX3-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX3-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX3-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX3-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX3-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX3-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX3-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm1[0] +; AVX3-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; AVX3-NEXT: vpcmpeqb %xmm1, %xmm3, %xmm1 +; AVX3-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm0[0],xmm2[0] +; AVX3-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX3-NEXT: vpcmpeqb %xmm0, %xmm3, %xmm0 +; AVX3-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX3-NEXT: vpmovb2m %zmm1, %k0 +; AVX3-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX3-NEXT: vpmovb2m %zmm0, %k1 +; AVX3-NEXT: kxnorw %k1, %k0, %k0 +; AVX3-NEXT: vpmovm2b %k0, %zmm0 +; AVX3-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX3-NEXT: vzeroupper +; AVX3-NEXT: retq + %wide.vec = load <64 x i8>, <64 x i8>* %ptr + %v1 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> + %v2 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> + %v3 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> + %v4 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> + + %cmp1 = icmp eq <16 x i8> %v1, %v2 + %cmp2 = icmp eq <16 x i8> %v3, %v4 + %res = icmp eq <16 x i1> %cmp1, %cmp2 + + ret <16 x i1> %res +} + +define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) { +; AVX1-LABEL: interleaved_load_vf32_i8_stride4: +; AVX1: # BB#0: +; AVX1-NEXT: 
vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX1-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX1-NEXT: vmovdqa 112(%rdi), %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX1-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm8 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm0[1],xmm2[1] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm2 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm5[1] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm8[0],xmm5[0] +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm8 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm4[1],xmm6[1] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm6 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm1[1],xmm3[1] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm3 +; 
AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [72340172838076673,72340172838076673] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: interleaved_load_vf32_i8_stride4: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, 80(%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 +; AVX2-NEXT: vinserti128 $1, 112(%rdi), %ymm3, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: 
vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,0,2,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,1,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,2,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,1,1,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm2 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] +; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-NEXT: vpcmpeqb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX3-LABEL: interleaved_load_vf32_i8_stride4: +; AVX3: # BB#0: +; AVX3-NEXT: vmovdqa (%rdi), %xmm0 +; AVX3-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX3-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX3-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX3-NEXT: vinserti128 $1, 64(%rdi), %ymm0, %ymm0 +; AVX3-NEXT: vinserti128 $1, 80(%rdi), %ymm1, %ymm1 +; 
AVX3-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 +; AVX3-NEXT: vinserti128 $1, 112(%rdi), %ymm3, %ymm3 +; AVX3-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX3-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX3-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX3-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX3-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX3-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX3-NEXT: vpermd %ymm1, %ymm4, %ymm5 +; AVX3-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX3-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX3-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX3-NEXT: vpermd %ymm1, %ymm6, %ymm1 +; AVX3-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX3-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX3-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX3-NEXT: vpermd %ymm3, %ymm4, %ymm1 +; AVX3-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX3-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] +; AVX3-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX3-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX3-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX3-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] +; AVX3-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,0,2,1] +; AVX3-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,1,1,3] +; AVX3-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX3-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX3-NEXT: vpermq {{.*#+}} ymm4 = ymm5[2,1,3,3] +; AVX3-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX3-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,2,1] +; AVX3-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,1,1,3] +; AVX3-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX3-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX3-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX3-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX3-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm2 +; AVX3-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] +; AVX3-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 +; AVX3-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 +; AVX3-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX3-NEXT: vpcmpeqb %ymm0, %ymm3, %ymm0 +; AVX3-NEXT: vpsllw $7, %ymm2, %ymm1 +; AVX3-NEXT: vpmovb2m %zmm1, %k0 +; AVX3-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX3-NEXT: vpmovb2m %zmm0, %k1 +; AVX3-NEXT: kxnord %k1, %k0, %k0 +; AVX3-NEXT: vpmovm2b %k0, %zmm0 +; AVX3-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX3-NEXT: retq + %wide.vec = load <128 x i8>, <128 x i8>* %ptr + %v1 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> + %v2 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> + %v3 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> + %v4 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> + + %cmp1 = icmp eq <32 x i8> %v1, %v2 + %cmp2 = icmp eq <32 x i8> %v3, %v4 + %res = icmp eq <32 x i1> %cmp1, %cmp2 + + ret <32 x i1> %res +} Index: test/Transforms/InterleavedAccess/X86/interleaved-accesses-8bits.ll =================================================================== --- test/Transforms/InterleavedAccess/X86/interleaved-accesses-8bits.ll +++ test/Transforms/InterleavedAccess/X86/interleaved-accesses-8bits.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=x86_64-pc-linux -mattr=+avx -interleaved-access -S | FileCheck %s + +define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) { +; CHECK-LABEL: @interleaved_load_vf8_i8_stride4( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8>* [[PTR:%.*]] to <16 x i8>* +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr <16 
x i8>, <16 x i8>* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 16 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> undef, <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> undef, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> undef, <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> undef, <8 x i32> +; CHECK-NEXT: [[ADD1:%.*]] = add <8 x i8> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[ADD2:%.*]] = add <8 x i8> [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[ADD3:%.*]] = mul <8 x i8> [[ADD1]], [[ADD2]] +; CHECK-NEXT: ret <8 x i8> [[ADD3]] +; + %wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16 + %v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> + %v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> + %v3 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> + %v4 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> + + %add1 = add <8 x i8> %v1, %v2 + %add2 = add <8 x i8> %v4, %v3 + %add3 = mul <8 x i8> %add1, %add2 + ret <8 x i8> %add3 +} + +define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) { +; CHECK-LABEL: @interleaved_load_vf16_i8_stride4( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <64 x i8>* [[PTR:%.*]] to <16 x i8>* +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0 +; CHECK-NEXT: 
[[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP17]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP17]], <16 x i32> +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq <16 x i8> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq <16 x i8> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[RES:%.*]] = icmp eq <16 x i1> [[CMP1]], [[CMP2]] +; CHECK-NEXT: ret <16 x i1> [[RES]] +; + %wide.vec = load <64 x i8>, <64 x i8>* %ptr + %v1 = shufflevector <64 x i8> %wide.vec, <64 x i8> 
undef, <16 x i32> + %v2 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> + %v3 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> + %v4 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> + + %cmp1 = icmp eq <16 x i8> %v1, %v2 + %cmp2 = icmp eq <16 x i8> %v3, %v4 + %res = icmp eq <16 x i1> %cmp1, %cmp2 + + ret <16 x i1> %res +} + +define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) { +; CHECK-LABEL: @interleaved_load_vf32_i8_stride4( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <128 x i8>* [[PTR:%.*]] to <16 x i8>* +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 5 +; CHECK-NEXT: [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 6 +; CHECK-NEXT: [[TMP15:%.*]] = load <16 x i8>, <16 x i8>* [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 7 +; CHECK-NEXT: [[TMP17:%.*]] = load <16 x i8>, <16 x i8>* [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP11]], <32 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP13]], <32 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = 
shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP15]], <32 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP17]], <32 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <32 x i8> [[TMP18]], <32 x i8> undef, <32 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <32 x i8> [[TMP19]], <32 x i8> undef, <32 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <32 x i8> [[TMP20]], <32 x i8> undef, <32 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <32 x i8> [[TMP21]], <32 x i8> undef, <32 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <32 x i8> [[TMP22]], <32 x i8> [[TMP23]], <32 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <32 x i8> [[TMP22]], <32 x i8> [[TMP23]], <32 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <32 x i8> [[TMP24]], <32 x i8> [[TMP25]], <32 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <32 x i8> [[TMP24]], <32 x i8> [[TMP25]], <32 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <32 x i8> [[TMP26]], <32 x i8> [[TMP28]], <32 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <32 x i8> [[TMP26]], <32 x i8> [[TMP28]], <32 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <32 x i8> [[TMP27]], <32 x i8> [[TMP29]], <32 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <32 x i8> [[TMP27]], <32 x i8> [[TMP29]], <32 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <32 x i8> [[TMP30]], <32 x i8> [[TMP32]], <32 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <32 x i8> [[TMP30]], <32 x i8> [[TMP32]], <32 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <32 x i8> [[TMP31]], <32 x i8> [[TMP33]], <32 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <32 x i8> [[TMP31]], <32 x i8> [[TMP33]], <32 x i32> +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq <32 x i8> [[TMP34]], [[TMP35]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq <32 x i8> [[TMP36]], [[TMP37]] +; CHECK-NEXT: [[RES:%.*]] = icmp eq <32 x i1> [[CMP1]], [[CMP2]] +; CHECK-NEXT: ret <32 x i1> [[RES]] +; + %wide.vec = load <128 x i8>, <128 x 
i8>* %ptr + %v1 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> + %v2 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> + %v3 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> + %v4 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> + + %cmp1 = icmp eq <32 x i8> %v1, %v2 + %cmp2 = icmp eq <32 x i8> %v3, %v4 + %res = icmp eq <32 x i1> %cmp1, %cmp2 + + ret <32 x i1> %res +}