Index: ../lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- ../lib/Transforms/Vectorize/LoopVectorize.cpp
+++ ../lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1694,6 +1694,50 @@
   /// during vectorization.
   bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1);
 
+  /// Returns true if \p I is a memory instruction with consecutive memory
+  /// access that can be widened.
+  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+
+  /// Decision that was taken during cost calculation for a memory
+  /// instruction.
+  enum InstWidening {
+    LV_NONE,
+    LV_WIDEN,
+    LV_INTERLEAVE,
+    LV_GATHER_SCATTER,
+    LV_SCALARIZE
+  };
+
+  typedef DenseMap<std::pair<Instruction *, unsigned>, InstWidening>
+      DecisionList;
+
+  /// Save vectorization decision \p W taken by the cost model for
+  /// instruction \p I and vector width \p VF.
+  void setCostModelDecision(Instruction *I, unsigned VF, InstWidening W) {
+    assert(VF >= 2 && "Expected VF >= 2");
+    WideningDecisions[std::make_pair(I, VF)] = W;
+  }
+
+  /// Save vectorization decision \p W taken by the cost model for
+  /// interleaving group \p Grp and vector width \p VF.
+  void setCostModelDecision(const InterleaveGroup *Grp, unsigned VF,
+                            InstWidening W) {
+    assert(VF >= 2 && "Expected VF >= 2");
+    // Broadcast the decision to all instructions inside the group.
+    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
+      if (auto *I = Grp->getMember(i))
+        WideningDecisions[std::make_pair(I, VF)] = W;
+    }
+  }
+
+  /// Return the cost model decision for the given instruction \p I and
+  /// vector width \p VF. Return LV_NONE if this instruction did not pass
+  /// through the cost modeling.
+  InstWidening getCostModelDecision(Instruction *I, unsigned VF) {
+    assert(VF >= 2 && "Expected VF >= 2");
+    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+    if (!WideningDecisions.count(InstOnVF))
+      return LV_NONE;
+    return WideningDecisions[InstOnVF];
+  }
+
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -1835,6 +1879,10 @@
   /// While vectorizing these instructions we have to generate a
   /// call to the appropriate masked intrinsic
   SmallPtrSet<const Instruction *, 8> MaskedOp;
+
+  /// Keeps cost model decisions for instructions.
+  /// Right now it is used for memory instructions only.
+  DecisionList WideningDecisions;
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -1951,6 +1999,12 @@
   /// the vector type as an output parameter.
   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
 
+  /// The cost computation for a scalarized memory instruction.
+  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
+
+  /// The cost computation for an interleaving group of memory instructions.
+  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
+
   /// Returns whether the instruction is a load or store and will be a emitted
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);
@@ -2818,10 +2872,22 @@
 
   assert((LI || SI) && "Invalid Load/Store instruction");
 
+  // The vectorization decision taken during the cost computation.
+  LoopVectorizationLegality::InstWidening Decision =
+      Legal->getCostModelDecision(Instr, VF);
+  bool NoCostModelDecision = (Decision == LoopVectorizationLegality::LV_NONE);
+
   // Try to vectorize the interleave group if this access is interleaved.
-  if (Legal->isAccessInterleaved(Instr))
+  if (Decision == LoopVectorizationLegality::LV_INTERLEAVE ||
+      (NoCostModelDecision && Legal->isAccessInterleaved(Instr)))
     return vectorizeInterleaveGroup(Instr);
 
+  // Scalarize the memory instruction if necessary.
+  if (Decision == LoopVectorizationLegality::LV_SCALARIZE ||
+      (NoCostModelDecision &&
+       Legal->memoryInstructionMustBeScalarized(Instr, VF)))
+    return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
+
   Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
   Type *DataTy = VectorType::get(ScalarDataTy, VF);
   Value *Ptr = getPointerOperand(Instr);
@@ -2833,18 +2899,14 @@
     Alignment = DL.getABITypeAlignment(ScalarDataTy);
   unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
 
-  // Scalarize the memory instruction if necessary.
-  if (Legal->memoryInstructionMustBeScalarized(Instr, VF))
-    return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
-
   // Determine if the pointer operand of the access is either consecutive or
   // reverse consecutive.
   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
   bool Reverse = ConsecutiveStride < 0;
-
-  // Determine if either a gather or scatter operation is legal.
   bool CreateGatherScatter =
-      !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr);
+      (Decision == LoopVectorizationLegality::LV_GATHER_SCATTER ||
+       (NoCostModelDecision && !ConsecutiveStride &&
+        Legal->isLegalGatherOrScatter(Instr)));
 
   VectorParts VectorGep;
@@ -5487,14 +5549,8 @@
     return false;
 }
 
-bool LoopVectorizationLegality::memoryInstructionMustBeScalarized(
-    Instruction *I, unsigned VF) {
-
-  // If the memory instruction is in an interleaved group, it will be
-  // vectorized and its pointer will remain uniform.
-  if (isAccessInterleaved(I))
-    return false;
-
+bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
+                                                              unsigned VF) {
   // Get and ensure we have a valid memory instruction.
   LoadInst *LI = dyn_cast<LoadInst>(I);
   StoreInst *SI = dyn_cast<StoreInst>(I);
@@ -5504,28 +5560,35 @@
   // will be scalarized.
   auto *Ptr = getPointerOperand(I);
   if (LI && isUniform(Ptr))
-    return true;
+    return false;
 
-  // If the pointer operand is non-consecutive and neither a gather nor a
-  // scatter operation is legal, the memory instruction will be scalarized.
-  if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I))
-    return true;
+  if (!isConsecutivePtr(Ptr))
+    return false;
 
   // If the instruction is a store located in a predicated block, it will be
   // scalarized.
   if (isScalarWithPredication(I))
-    return true;
+    return false;
 
   // If the instruction's allocated size doesn't equal it's type size, it
   // requires padding and will be scalarized.
   auto &DL = I->getModule()->getDataLayout();
   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
   if (hasIrregularType(ScalarTy, DL, VF))
-    return true;
+    return false;
 
-  // Otherwise, the memory instruction should be vectorized if the rest of the
-  // loop is.
-  return false;
+  return true;
+}
+
+bool LoopVectorizationLegality::memoryInstructionMustBeScalarized(
+    Instruction *I, unsigned VF) {
+
+  // If the memory instruction is in an interleaved group, it will be
+  // vectorized and its pointer will remain uniform. An access for which a
+  // gather or scatter is legal is not scalarized either.
+  if (isAccessInterleaved(I) || isLegalGatherOrScatter(I))
+    return false;
+
+  return !memoryInstructionCanBeWidened(I, VF);
 }
 
 void LoopVectorizationLegality::collectLoopUniforms() {
@@ -6843,6 +6906,79 @@
          Legal->hasStride(I->getOperand(1));
 }
 
+unsigned
+LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
+                                                        unsigned VF) {
+  StoreInst *SI = dyn_cast<StoreInst>(I);
+  LoadInst *LI = dyn_cast<LoadInst>(I);
+  Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType());
+  auto SE = PSE.getSE();
+
+  unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
+  unsigned AS =
+      SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace();
+  Value *Ptr = getPointerOperand(I);
+  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+
+  // True if the memory instruction's address computation is complex.
+  bool IsComplexComputation =
+      isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
+
+  // Get the cost of the scalar memory instruction and address computation.
+  unsigned Cost =
+      VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
+  Cost += VF *
+          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
+                              Alignment, AS);
+
+  // Get the overhead of the extractelement and insertelement instructions
+  // we might create due to scalarization.
+  Cost += getScalarizationOverhead(I, VF, TTI);
+
+  // If we have a predicated store, it may not be executed for each vector
+  // lane. Scale the cost by the probability of executing the predicated
+  // block.
+  if (Legal->isScalarWithPredication(I))
+    Cost /= getReciprocalPredBlockProb();
+
+  return Cost;
+}
+
+unsigned
+LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
+                                                   unsigned VF) {
+  StoreInst *SI = dyn_cast<StoreInst>(I);
+  LoadInst *LI = dyn_cast<LoadInst>(I);
+  Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType());
+  Type *VectorTy = ToVectorTy(ValTy, VF);
+  unsigned AS =
+      SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace();
+
+  auto Group = Legal->getInterleavedAccessGroup(I);
+  assert(Group && "Fail to get an interleaved access group.");
+
+  unsigned InterleaveFactor = Group->getFactor();
+  Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
+
+  // Holds the indices of existing members in an interleaved load group.
+  // An interleaved store group doesn't need this as it doesn't allow gaps.
+  SmallVector<unsigned, 4> Indices;
+  if (LI) {
+    for (unsigned i = 0; i < InterleaveFactor; i++)
+      if (Group->getMember(i))
+        Indices.push_back(i);
+  }
+
+  // Calculate the cost of the whole interleaved group.
+  unsigned Cost = TTI.getInterleavedMemoryOpCost(
+      I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
+      Group->getAlignment(), AS);
+
+  if (Group->isReverse())
+    Cost += Group->getNumMembers() *
+            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+  return Cost;
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
@@ -7011,107 +7147,100 @@
 
     if (LI && Legal->isUniform(Ptr)) {
       // Scalar load + broadcast
+      Legal->setCostModelDecision(I, VF,
+                                  LoopVectorizationLegality::LV_SCALARIZE);
       unsigned Cost = TTI.getAddressComputationCost(ValTy->getScalarType());
       Cost += TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                   Alignment, AS);
       return Cost +
-             TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, ValTy);
+             TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
     }
 
-    // For an interleaved access, calculate the total cost of the whole
-    // interleave group.
-    if (Legal->isAccessInterleaved(I)) {
-      auto Group = Legal->getInterleavedAccessGroup(I);
-      assert(Group && "Fail to get an interleaved access group.");
+    // We assume that widening is the best solution when possible.
+    if (Legal->memoryInstructionCanBeWidened(I, VF)) {
+      int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+      assert(ConsecutiveStride != 0 &&
+             "Can't widen non-consecutive memory instruction");
+      bool Reverse = ConsecutiveStride < 0;
 
-      // Only calculate the cost once at the insert position.
-      if (Group->getInsertPos() != I)
-        return 0;
-
-      unsigned InterleaveFactor = Group->getFactor();
-      Type *WideVecTy =
-          VectorType::get(VectorTy->getVectorElementType(),
-                          VectorTy->getVectorNumElements() * InterleaveFactor);
-
-      // Holds the indices of existing members in an interleaved load group.
-      // An interleaved store group doesn't need this as it doesn't allow gaps.
-      SmallVector<unsigned, 4> Indices;
-      if (LI) {
-        for (unsigned i = 0; i < InterleaveFactor; i++)
-          if (Group->getMember(i))
-            Indices.push_back(i);
-      }
-
-      // Calculate the cost of the whole interleaved group.
-      unsigned Cost = TTI.getInterleavedMemoryOpCost(
-          I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
-          Group->getAlignment(), AS);
-
-      if (Group->isReverse())
+      unsigned Cost = TTI.getAddressComputationCost(VectorTy);
+      if (Legal->isMaskRequired(I))
         Cost +=
-            Group->getNumMembers() *
-            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+            TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+      else
+        Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
 
-      // FIXME: The interleaved load group with a huge gap could be even more
-      // expensive than scalar operations. Then we could ignore such group and
-      // use scalar operations instead.
+      if (Reverse)
+        Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
+                                   0);
+      Legal->setCostModelDecision(I, VF, LoopVectorizationLegality::LV_WIDEN);
       return Cost;
     }
 
-    // Check if the memory instruction will be scalarized.
-    if (Legal->memoryInstructionMustBeScalarized(I, VF)) {
-      unsigned Cost = 0;
-      Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
-
-      // True if the memory instruction's address computation is complex.
-      bool IsComplexComputation =
-          isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
-
-      // Get the cost of the scalar memory instruction and address computation.
-      Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
-      Cost += VF *
-              TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
-                                  Alignment, AS);
-
-      // Get the overhead of the extractelement and insertelement instructions
-      // we might create due to scalarization.
-      Cost += getScalarizationOverhead(I, VF, TTI);
-
-      // If we have a predicated store, it may not be executed for each vector
-      // lane. Scale the cost by the probability of executing the predicated
-      // block.
-      if (Legal->isScalarWithPredication(I))
-        Cost /= getReciprocalPredBlockProb();
-
-      return Cost;
-    }
+    // For an interleaved access, calculate the total cost of the whole
+    // interleave group.
+    unsigned InterleaveGrpCost = 0;
+    unsigned InterleaveInstCost = (unsigned)(-1);
+    unsigned GatherScatterCost = (unsigned)(-1);
+    if (Legal->isAccessInterleaved(I)) {
+      auto Group = Legal->getInterleavedAccessGroup(I);
+      assert(Group && "Fail to get an interleaved access group.");
 
-    // Determine if the pointer operand of the access is either consecutive or
-    // reverse consecutive.
-    int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
-    bool Reverse = ConsecutiveStride < 0;
-
-    // Determine if either a gather or scatter operation is legal.
-    bool UseGatherOrScatter =
-        !ConsecutiveStride && Legal->isLegalGatherOrScatter(I);
-
-    unsigned Cost = TTI.getAddressComputationCost(VectorTy);
-    if (UseGatherOrScatter) {
-      assert(ConsecutiveStride == 0 &&
-             "Gather/Scatter are not used for consecutive stride");
-      return Cost +
-             TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
-                                        Legal->isMaskRequired(I), Alignment);
+      LoopVectorizationLegality::InstWidening Decision =
+          Legal->getCostModelDecision(I, VF);
+      if (Decision == LoopVectorizationLegality::LV_NONE) {
+        // Only calculate the cost once for the group.
+        InterleaveGrpCost = getInterleaveGroupCost(I, VF);
+
+        if (Group->getNumMembers() > 1)
+          DEBUG(dbgs() << "LV: Found an estimated cost of "
+                       << InterleaveGrpCost << " for VF " << VF
+                       << " For interleaving group: " << *I << '\n');
+
+        if (InterleaveGrpCost / Group->getNumMembers() <= 1) {
+          // The interleaving cost is good enough.
+          Legal->setCostModelDecision(Group, VF,
+                                      LoopVectorizationLegality::LV_INTERLEAVE);
+          return InterleaveGrpCost;
+        }
+        // We are going to check all other options.
+        InterleaveInstCost = InterleaveGrpCost / Group->getNumMembers();
+      } else if (Decision == LoopVectorizationLegality::LV_INTERLEAVE)
+        // This instruction belongs to an interleaving group and will be
+        // vectorized inside this group.
+        return 0;
+      // If we reach this point, the instruction belongs to an interleaving
+      // group, but interleaving is not profitable for the current VF, so we
+      // need to choose another option.
+    }
+
+    if (Legal->isLegalGatherOrScatter(I)) {
+      GatherScatterCost = TTI.getAddressComputationCost(VectorTy) +
+          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+                                     Legal->isMaskRequired(I), Alignment);
+    }
+    unsigned ScalarizationCost = getMemInstScalarizationCost(I, VF);
+
+    // Choose the cheapest option for the current VF and return its cost.
+    // Record the decision so it can be reused during vectorization.
+    unsigned Cost;
+    LoopVectorizationLegality::InstWidening Decision;
+    if (InterleaveInstCost <= GatherScatterCost &&
+        InterleaveInstCost <= ScalarizationCost) {
+      Decision = LoopVectorizationLegality::LV_INTERLEAVE;
+      Cost = InterleaveGrpCost;
+    } else if (GatherScatterCost < InterleaveInstCost &&
+               GatherScatterCost < ScalarizationCost) {
+      Decision = LoopVectorizationLegality::LV_GATHER_SCATTER;
+      Cost = GatherScatterCost;
+    } else {
+      Cost = ScalarizationCost;
+      Decision = LoopVectorizationLegality::LV_SCALARIZE;
+    }
 
-    // Wide load/stores.
-    if (Legal->isMaskRequired(I))
-      Cost +=
-          TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+    if (auto Group = Legal->getInterleavedAccessGroup(I))
+      Legal->setCostModelDecision(Group, VF, Decision);
     else
-      Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
-
-    if (Reverse)
-      Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+      Legal->setCostModelDecision(I, VF, Decision);
 
     return Cost;
   }
   case Instruction::ZExt:
Index: ../test/Analysis/CostModel/X86/interleave-load-i32.ll
===================================================================
--- ../test/Analysis/CostModel/X86/interleave-load-i32.ll
+++ ../test/Analysis/CostModel/X86/interleave-load-i32.ll
@@ -10,11 +10,10 @@
 ; Function Attrs: nounwind uwtable
 define void @load_i32_interleave4() {
 ;CHECK-LABEL: load_i32_interleave4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load
-;CHECK: Found an estimated cost of 5 for VF 2 For instruction: %0 = load
-;CHECK: Found an estimated cost of 5 for VF 4 For instruction: %0 = load
-;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %0 = load
-;CHECK: Found an estimated cost of 22 for VF 16 For instruction: %0 = load
+;CHECK: Found an estimated cost of 5 for VF 2 For interleaving group: {{.*}} = load
+;CHECK: Found an estimated cost of 5 for VF 4 For interleaving group: {{.*}} = load
+;CHECK: Found an estimated cost of 8 for VF 8 For interleaving group: {{.*}} = load
+;CHECK: Found an estimated cost of 22 for VF 16 For interleaving group: {{.*}} = load
 entry:
   br label %for.body
 
@@ -46,11 +45,10 @@
 
 define void @load_i32_interleave5() {
 ;CHECK-LABEL: load_i32_interleave5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load
-;CHECK: Found an estimated cost of 6 for VF 2 For instruction: %0 = load
-;CHECK: Found an estimated cost of 9 for VF 4 For instruction: %0 = load
-;CHECK: Found an estimated cost of 18 for VF 8 For instruction: %0 = load
-;CHECK: Found an estimated cost of 35 for VF 16 For instruction: %0 = load
+;CHECK: Found an estimated cost of 6 for VF 2 For interleaving group: {{.*}} = load
+;CHECK: Found an estimated cost of 9 for VF 4 For interleaving group: {{.*}} = load
+;CHECK: Found an estimated cost of 18 for VF 8 For interleaving group: {{.*}} = load
+;CHECK: Found an estimated cost of 35 for VF 16 For interleaving group: {{.*}} = load
 entry:
   br label %for.body
Index: ../test/Analysis/CostModel/X86/interleave-store-i32.ll
===================================================================
--- ../test/Analysis/CostModel/X86/interleave-store-i32.ll
+++ ../test/Analysis/CostModel/X86/interleave-store-i32.ll
@@ -10,11 +10,10 @@
 ; Function Attrs: nounwind uwtable
 define void @store_i32_interleave4() {
 ;CHECK-LABEL: store_i32_interleave4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add16
-;CHECK: Found an estimated cost of 5 for VF 2 For instruction: store i32 %add16
-;CHECK: Found an estimated cost of 5 for VF 4 For instruction: store i32 %add16
-;CHECK: Found an estimated cost of 11 for VF 8 For instruction: store i32 %add16
-;CHECK: Found an estimated cost of 22 for VF 16 For instruction: store i32 %add16
+;CHECK: Found an estimated cost of 5 for VF 2 For interleaving group: store
+;CHECK: Found an estimated cost of 5 for VF 4 For interleaving group: store
+;CHECK: Found an estimated cost of 11 for VF 8 For interleaving group: store
+;CHECK: Found an estimated cost of 22 for VF 16 For interleaving group: store
 entry:
   br label %for.body
 
@@ -46,11 +45,10 @@
 
 define void @store_i32_interleave5() {
 ;CHECK-LABEL: store_i32_interleave5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add22
-;CHECK: Found an estimated cost of 7 for VF 2 For instruction: store i32 %add22
-;CHECK: Found an estimated cost of 14 for VF 4 For instruction: store i32 %add22
-;CHECK: Found an estimated cost of 21 for VF 8 For instruction: store i32 %add22
-;CHECK: Found an estimated cost of 35 for VF 16 For instruction: store i32 %add22
+;CHECK: Found an estimated cost of 7 for VF 2 For interleaving group: store
+;CHECK: Found an estimated cost of 14 for VF 4 For interleaving group: store
+;CHECK: Found an estimated cost of 21 for VF 8 For interleaving group: store
+;CHECK: Found an estimated cost of 35 for VF 16 For interleaving group: store
 entry:
   br label %for.body
Index: ../test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
===================================================================
--- ../test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
+++ ../test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
@@ -0,0 +1,41 @@
+; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test checks that the "gather" operation is chosen since its cost is
+; lower than that of the interleaving pattern.
+;
+;unsigned long A[SIZE];
+;unsigned long B[SIZE];
+;
+;void foo() {
+;  for (int i=0; i<SIZE; i+=8) {
+;    B[i] = A[i] + 5;
+;  }
+;}
+
+@A = global [10240 x i64] zeroinitializer, align 16
+@B = global [10240 x i64] zeroinitializer, align 16
+
+; CHECK-LABEL: foo
+; CHECK: llvm.masked.gather
+define void @foo() {
+  br label %1
+
+; <label>:1:                                      ; preds = %0, %1
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %indvars.iv
+  %3 = load i64, i64* %2, align 16
+  %4 = add i64 %3, 5
+  %5 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+  store i64 %4, i64* %5, align 16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
+  %6 = icmp slt i64 %indvars.iv.next, 1024
+  br i1 %6, label %1, label %7
+
+; <label>:7:                                      ; preds = %1
+  ret void
+}
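
Note on the selection policy: the new code in getInstructionCost picks the
cheapest of three ways to handle a memory access that cannot simply be widened
(interleave, gather/scatter, scalarize) and records the choice so that
vectorizeMemoryInstruction can replay it. The standalone C++ sketch below
restates that policy outside of LLVM; the names MemAccessCosts and
chooseWidening are illustrative, not part of the patch.

#include <cstdint>
#include <limits>

// Illustrative stand-in for the patch's InstWidening values that remain
// candidates at this point (LV_WIDEN has already been handled earlier).
enum class Widening { Interleave, GatherScatter, Scalarize };

struct MemAccessCosts {
  static constexpr uint64_t Infinite = std::numeric_limits<uint64_t>::max();
  uint64_t InterleaveGrp = 0;         // Total cost of the whole group.
  uint64_t InterleaveInst = Infinite; // Group cost / number of members.
  uint64_t GatherScatter = Infinite;  // Infinite if gather/scatter illegal.
  uint64_t Scalarization = 0;         // Always-computable fallback.
};

// Mirrors the if/else-if/else chain added above: ties go to interleaving,
// gather/scatter must beat both other options strictly, and scalarization
// is the fallback.
Widening chooseWidening(const MemAccessCosts &C, uint64_t &Cost) {
  if (C.InterleaveInst <= C.GatherScatter &&
      C.InterleaveInst <= C.Scalarization) {
    Cost = C.InterleaveGrp; // The whole group's cost is charged once, here.
    return Widening::Interleave;
  }
  if (C.GatherScatter < C.InterleaveInst &&
      C.GatherScatter < C.Scalarization) {
    Cost = C.GatherScatter;
    return Widening::GatherScatter;
  }
  Cost = C.Scalarization;
  return Widening::Scalarize;
}

Because the per-instruction interleave cost defaults to "infinite" when the
access is not in a profitable group, and the gather/scatter cost defaults to
"infinite" when the target cannot gather/scatter, the else branch
(scalarization) is always reachable as a safe default.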