Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1334,6 +1334,10 @@ return ScalarsPerVF->second.count(I); } +#ifndef NDEBUG + bool hasSingleCopyAfterVectorization(Instruction *I, ElementCount VF); +#endif + /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { @@ -7235,16 +7239,44 @@ } } +#ifndef NDEBUG +bool LoopVectorizationCostModel::hasSingleCopyAfterVectorization( + Instruction *I, ElementCount VF) { + if (VF.isVector()) { + auto Scalarized = InstsToScalarize.find(VF); + assert(Scalarized != InstsToScalarize.end() && + "VF not yet analyzed for scalarization profitability"); + return !Scalarized->second.count(I) && + llvm::all_of(I->users(), [&](User *U) { + auto *UI = cast(U); + return !Scalarized->second.count(UI); + }); + } + return true; +} +#endif + InstructionCost LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy) { Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); - VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); auto SE = PSE.getSE(); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + if (isScalarAfterVectorization(I, VF)) { + VectorTy = RetTy; + // With the exception of GEPs, after scalarization there should only be one + // copy of the instruction generated in the loop. This is because the VF is + // either 1, or any instructions that need scalarizing have already been + // dealt with by the the time we get here. As a result, it means we don't + // have to multiply the instruction cost by VF. + assert(I->getOpcode() == Instruction::GetElementPtr || + hasSingleCopyAfterVectorization(I, VF)); + } else + VectorTy = ToVectorTy(RetTy, VF); + // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { case Instruction::GetElementPtr: @@ -7372,21 +7404,16 @@ Op2VK = TargetTransformInfo::OK_UniformValue; SmallVector Operands(I->operand_values()); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; - return N * TTI.getArithmeticInstrCost( - I->getOpcode(), VectorTy, CostKind, - TargetTransformInfo::OK_AnyValue, - Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); + return TTI.getArithmeticInstrCost( + I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, + Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { assert(!VF.isScalable() && "VF is assumed to be non scalable."); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; - return N * TTI.getArithmeticInstrCost( - I->getOpcode(), VectorTy, CostKind, - TargetTransformInfo::OK_AnyValue, - TargetTransformInfo::OK_AnyValue, - TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, - I->getOperand(0), I); + return TTI.getArithmeticInstrCost( + I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, + TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None, I->getOperand(0), I); } case Instruction::Select: { SelectInst *SI = cast(I); @@ -7510,14 +7537,7 @@ } } - unsigned N; - if (isScalarAfterVectorization(I, VF)) { - assert(!VF.isScalable() && "VF is assumed to be non scalable"); - N = VF.getKnownMinValue(); - } else - N = 1; - return N * - TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); + return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } case Instruction::Call: { bool NeedToScalarize; @@ -7534,8 +7554,8 @@ default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( - Instruction::Mul, VectorTy, CostKind) + + return VF.getFixedValue() * TTI.getArithmeticInstrCost(Instruction::Mul, + VectorTy, CostKind) + getScalarizationOverhead(I, VF); } // end of switch. } Index: llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll @@ -6,7 +6,7 @@ ; CHECK-LABEL: all_scalar ; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2 -; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2 +; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2 ; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions ; define void @all_scalar(i64* %a, i64 %n) {