Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2549,6 +2549,63 @@
   return !(ST->isAtom());
 }
 
+// X86InterleavedPassCost - Returns true if the X86InterleavedAccess pass
+// supports the given interleaved access group, and false otherwise. On success
+// \p Cost is set to the cost of the optimized load/store+shuffle sequence the
+// pass generates for the group; on failure \p Cost is left untouched.
+// \p Opcode is expected to be Instruction::Load or Instruction::Store, \p VecTy
+// is the wide vector type of the whole group and \p Factor is its stride.
+// NOTE(review): the subtarget is not consulted, so v64i8 members are accepted
+// whatever the legal vector width is - confirm that this is intended.
+static bool X86InterleavedPassCost(unsigned Opcode, Type *VecTy,
+                                   unsigned Factor, int &Cost) {
+  // Only strides 3 and 4 can succeed. Rejecting the rest up front also keeps
+  // the division below from ever seeing a zero Factor.
+  if (Factor != 3 && Factor != 4)
+    return false;
+  // Currently the X86InterleavedAccess pass supports only char accesses.
+  if (!VecTy->isVectorTy() || VecTy->getScalarSizeInBits() != 8)
+    return false;
+  // The member types supported by the pass are v8i8, v16i8, v32i8 and v64i8.
+  const unsigned VF = VecTy->getVectorNumElements() / Factor;
+  if (VF != 8 && VF != 16 && VF != 32 && VF != 64)
+    return false;
+  // Stride 4 is handled for stores only; 'v8i8' is blocked for stride 3.
+  const bool IsStore = Opcode == Instruction::Store;
+  if (Factor == 4 ? !IsStore : VF == 8)
+    return false;
+
+  // Base - the number of instructions for a single 128-bit lane:
+  // Factor * log2(Factor) for stride 4 and Factor^2 for stride 3.
+  const unsigned Base = Factor == 4 ? 8 : 9;
+  // A 128-bit lane holds 16 chars; 'v8i8' still occupies one lane.
+  const unsigned Lanes = std::max(VF / 16, 1u);
+
+  // The number of moves is 'Factor' (for the loads or stores) times the number
+  // of steps for rearranging the data inside the lanes, max(log2(Lanes), 1).
+  // For Factor == 3 there is an extra move for the shuffle instruction.
+  const unsigned Vmov = (Lanes == 4 ? 2 : 1) * Factor + (Factor == 3 ? 1 : 0);
+
+  // 'NumberOfCouples' is the number of instructions that rebuild the full
+  // registers out of 128-bit lanes: each round pairs up the remaining pieces
+  // until only 'Factor' of them are left.
+  // For example, with 512-bit registers and Factor == 3 there are 12 lanes:
+  // 6 couples of 128 bits, then 3 couples of 256 bits. Total of 9 couples.
+  unsigned NumberOfCouples = 0;
+  for (unsigned TotalLanes = Lanes * Factor; TotalLanes > Factor;) {
+    TotalLanes /= 2;
+    NumberOfCouples += TotalLanes;
+  }
+
+  Cost = Vmov + Base + NumberOfCouples;
+  // Fixed corrections for two specific shapes (rationale not documented).
+  if (VF == 64 && Factor == 3 && IsStore)
+    Cost += 4;
+  if (VF == 8 && Factor == 4)
+    Cost -= 1;
+  return true;
+}
+
 // Get estimation for interleaved load/store operations for AVX2.
 // \p Factor is the interleaved-access factor (stride) - number of
 // (interleaved) elements in the group.
@@ -2566,6 +2623,9 @@
                                                  ArrayRef Indices,
                                                  unsigned Alignment,
                                                  unsigned AddressSpace) {
+  int Cost;
+  if (X86InterleavedPassCost(Opcode, VecTy, Factor, Cost))
+    return Cost;
 
   // We currently Support only fully-interleaved groups, with no gaps.
   // TODO: Support also strided loads (interleaved-groups with gaps).
@@ -2683,7 +2743,9 @@
                                     LegalVT.getVectorNumElements());
   unsigned MemOpCost =
       getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
-
+  int Cost;
+  if (X86InterleavedPassCost(Opcode, VecTy, Factor, Cost))
+    return Cost;
   if (Opcode == Instruction::Load) {
     // Kind of shuffle depends on number of loaded values.
     // If we load the entire data in one register, we can use a 1-src shuffle.
@@ -2718,7 +2780,7 @@ if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc) NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2; - int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + + Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + NumOfUnfoldedLoads * MemOpCost + NumOfMoves; return Cost; @@ -2738,7 +2800,7 @@ // The SK_MergeTwoSrc shuffle clobbers one of src operands. // We need additional instructions to keep sources. unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; - int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + + Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + NumOfMoves; return Cost; } Index: test/Analysis/CostModel/X86/interleaved-load-i8.ll =================================================================== --- test/Analysis/CostModel/X86/interleaved-load-i8.ll +++ test/Analysis/CostModel/X86/interleaved-load-i8.ll @@ -10,8 +10,8 @@ ;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction: %0 = load i8 ;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8 ;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 20 for VF 16 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 45 for VF 32 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction: %0 = load i8 +;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction: %0 = load i8 entry: %cmp13 = icmp sgt i32 %Nels, 0 br i1 %cmp13, label %for.body.preheader, label %for.end Index: test/Analysis/CostModel/X86/interleaved-store-i8.ll =================================================================== --- test/Analysis/CostModel/X86/interleaved-store-i8.ll +++ test/Analysis/CostModel/X86/interleaved-store-i8.ll @@ -10,8 +10,8 @@ ;CHECK: LV: Found an estimated cost of 8 for VF 2 For instruction: store i8 %conv4 ;CHECK: LV: Found an estimated 
cost of 9 for VF 4 For instruction: store i8 %conv4 ;CHECK: LV: Found an estimated cost of 12 for VF 8 For instruction: store i8 %conv4 -;CHECK: LV: Found an estimated cost of 19 for VF 16 For instruction: store i8 %conv4 -;CHECK: LV: Found an estimated cost of 35 for VF 32 For instruction: store i8 %conv4 +;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction: store i8 %conv4 +;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %conv4 entry: %cmp14 = icmp sgt i32 %Nels, 0 br i1 %cmp14, label %for.body.lr.ph, label %for.end @@ -47,9 +47,9 @@ ;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv7 ;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction: store i8 %conv7 ;CHECK: LV: Found an estimated cost of 10 for VF 4 For instruction: store i8 %conv7 -;CHECK: LV: Found an estimated cost of 17 for VF 8 For instruction: store i8 %conv7 -;CHECK: LV: Found an estimated cost of 22 for VF 16 For instruction: store i8 %conv7 -;CHECK: LV: Found an estimated cost of 44 for VF 32 For instruction: store i8 %conv7 +;CHECK: LV: Found an estimated cost of 11 for VF 8 For instruction: store i8 %conv7 +;CHECK: LV: Found an estimated cost of 12 for VF 16 For instruction: store i8 %conv7 +;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %conv7 entry: %cmp19 = icmp sgt i32 %Nels, 0 br i1 %cmp19, label %for.body.lr.ph, label %for.end Index: test/Analysis/CostModel/X86/strided-load-i8.ll =================================================================== --- test/Analysis/CostModel/X86/strided-load-i8.ll +++ test/Analysis/CostModel/X86/strided-load-i8.ll @@ -41,9 +41,9 @@ ;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load ;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load ;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load -;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load -;CHECK: Found an 
estimated cost of 20 for VF 32 For instruction: %1 = load -;CHECK: Found an estimated cost of 39 for VF 64 For instruction: %1 = load +;CHECK: Found an estimated cost of 13 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 16 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 28 for VF 64 For instruction: %1 = load entry: br label %for.body