Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2549,6 +2549,57 @@
   return !(ST->isAtom());
 }
 
+/// Cost of an i8 interleaved access that the X86InterleavedAccess pass lowers.
+///
+/// \p Opcode is the IR opcode of the access (only Instruction::Store is
+/// distinguished here), \p VecTy is the wide vector type of the whole group
+/// and \p Factor is the interleave factor (stride). The per-member vector
+/// length VF is VecTy's element count divided by \p Factor.
+///
+/// \returns true and sets \p Cost for stride-4 stores with VF 8/16/32/64 and
+/// for stride-3 accesses with VF 16/32/64. Otherwise returns false and leaves
+/// \p Cost untouched.
+///
+/// NOTE(review): the group's member indices are not visible here; callers
+/// presumably must restrict this to groups without gaps - confirm.
+static bool optimizeInterleavedCost(unsigned Opcode, Type *VecTy,
+                                    unsigned Factor, int &Cost) {
+  // Only strides 3 and 4 are modelled. Rejecting everything else up front
+  // also keeps the division below away from a zero Factor.
+  if (Factor != 3 && Factor != 4)
+    return false;
+  if (VecTy->getScalarSizeInBits() != 8)
+    return false;
+
+  // Per-member lengths corresponding to v8i8, v16i8, v32i8 and v64i8.
+  unsigned VF = VecTy->getVectorNumElements() / Factor;
+  if (VF != 8 && VF != 16 && VF != 32 && VF != 64)
+    return false;
+
+  if (Opcode == Instruction::Store && Factor == 4) {
+    unsigned Vinsert = VF > 8 ? 3 * (VF / 16) : 1;
+    unsigned Vmov = VF > 8 ? VF / 16 : 1;
+    unsigned Extra = VF == 8 ? 1 : 0;
+    unsigned Vpunpck = 8;
+    unsigned Vextract = VF == 64 ? 4 : 0;
+    Cost = Vmov + Vpunpck + Vinsert + Vextract + Extra;
+    return true;
+  }
+
+  // v8i8 is blocked by the X86InterleavedAccess pass for stride 3.
+  if (Factor == 3 && VF != 8) {
+    unsigned Vmov = VF == 64 ? 6 : (3 + 1);
+    unsigned Vpshufb = 3;
+    unsigned Vpalignr = 6;
+    // 0, 3 and 9 for VF 16, 32 and 64; integer form of 3^(VF / 32).
+    unsigned Vinsert = VF == 16 ? 0 : (VF == 32 ? 3 : 9);
+    unsigned Vextract = VF == 64 ? 4 : 0;
+    Cost = Vmov + Vpshufb + Vinsert + Vpalignr + Vextract;
+    return true;
+  }
+  return false;
+}
+
 // Get estimation for interleaved load/store operations for AVX2.
// \p Factor is the interleaved-access factor (stride) - number of // (interleaved) elements in the group. @@ -2566,6 +2605,9 @@ ArrayRef Indices, unsigned Alignment, unsigned AddressSpace) { + int Cost; + if (optimizeInterleavedCost(Opcode, VecTy, Factor, Cost)) + return Cost; // We currently Support only fully-interleaved groups, with no gaps. // TODO: Support also strided loads (interleaved-groups with gaps). @@ -2683,7 +2725,9 @@ LegalVT.getVectorNumElements()); unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); - + int Cost; + if (optimizeInterleavedCost(Opcode, VecTy, Factor, Cost)) + return Cost; if (Opcode == Instruction::Load) { // Kind of shuffle depends on number of loaded values. // If we load the entire data in one register, we can use a 1-src shuffle. @@ -2718,7 +2762,7 @@ if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc) NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2; - int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + + Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + NumOfUnfoldedLoads * MemOpCost + NumOfMoves; return Cost; @@ -2732,13 +2776,13 @@ // shuffle. unsigned NumOfSources = Factor; // The number of values to be merged. unsigned ShuffleCost = - getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr); + getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr); unsigned NumOfShufflesPerStore = NumOfSources - 1; // The SK_MergeTwoSrc shuffle clobbers one of src operands. // We need additional instructions to keep sources. 
unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; - int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + + Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + NumOfMoves; return Cost; } Index: test/Analysis/CostModel/X86/interleaved-load-i8.ll =================================================================== --- test/Analysis/CostModel/X86/interleaved-load-i8.ll +++ test/Analysis/CostModel/X86/interleaved-load-i8.ll @@ -10,8 +10,8 @@ ;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction: %0 = load i8 ;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8 ;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 20 for VF 16 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 45 for VF 32 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction: %0 = load i8 entry: %cmp13 = icmp sgt i32 %Nels, 0 br i1 %cmp13, label %for.body.preheader, label %for.end Index: test/Analysis/CostModel/X86/interleaved-store-i8.ll =================================================================== --- test/Analysis/CostModel/X86/interleaved-store-i8.ll +++ test/Analysis/CostModel/X86/interleaved-store-i8.ll @@ -10,8 +10,8 @@ ;CHECK: LV: Found an estimated cost of 8 for VF 2 For instruction: store i8 %conv4 ;CHECK: LV: Found an estimated cost of 9 for VF 4 For instruction: store i8 %conv4 ;CHECK: LV: Found an estimated cost of 12 for VF 8 For instruction: store i8 %conv4 -;CHECK: LV: Found an estimated cost of 19 for VF 16 For instruction: store i8 %conv4 -;CHECK: LV: Found an estimated cost of 35 for VF 32 For instruction: store i8 %conv4 +;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction: store i8 %conv4 +;CHECK: LV: Found an estimated cost of 16 for VF 32 For 
instruction: store i8 %conv4 entry: %cmp14 = icmp sgt i32 %Nels, 0 br i1 %cmp14, label %for.body.lr.ph, label %for.end @@ -47,9 +47,9 @@ ;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv7 ;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction: store i8 %conv7 ;CHECK: LV: Found an estimated cost of 10 for VF 4 For instruction: store i8 %conv7 -;CHECK: LV: Found an estimated cost of 17 for VF 8 For instruction: store i8 %conv7 -;CHECK: LV: Found an estimated cost of 22 for VF 16 For instruction: store i8 %conv7 -;CHECK: LV: Found an estimated cost of 44 for VF 32 For instruction: store i8 %conv7 +;CHECK: LV: Found an estimated cost of 11 for VF 8 For instruction: store i8 %conv7 +;CHECK: LV: Found an estimated cost of 12 for VF 16 For instruction: store i8 %conv7 +;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %conv7 entry: %cmp19 = icmp sgt i32 %Nels, 0 br i1 %cmp19, label %for.body.lr.ph, label %for.end Index: test/Analysis/CostModel/X86/strided-load-i8.ll =================================================================== --- test/Analysis/CostModel/X86/strided-load-i8.ll +++ test/Analysis/CostModel/X86/strided-load-i8.ll @@ -41,9 +41,9 @@ ;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load ;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load ;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load -;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load -;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load -;CHECK: Found an estimated cost of 39 for VF 64 For instruction: %1 = load +;CHECK: Found an estimated cost of 13 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 16 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 28 for VF 64 For instruction: %1 = load entry: br label %for.body