Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -568,6 +568,10 @@
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                             unsigned VF) const;
 
+  /// Return true if the target will expand (scalarize) the vector Opcode, so
+  /// that it is preferable to also scalarize a memory access connected to it.
+  bool preferScalarizedMemFor(bool Stores, unsigned Opcode, Type *VecTy) const;
+
   /// If target has efficient vector element load/store instructions, it can
   /// return true here so that insertion/extraction costs are not added to
   /// the scalarization cost of a load/store.
@@ -1078,6 +1082,8 @@
   getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0;
   virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                                     unsigned VF) = 0;
+  virtual bool
+  preferScalarizedMemFor(bool Stores, unsigned Opcode, Type *VecTy) = 0;
   virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual const MemCmpExpansionOptions *enableMemCmpExpansion(
@@ -1346,6 +1352,11 @@
     return Impl.getOperandsScalarizationOverhead(Args, VF);
   }
 
+  bool preferScalarizedMemFor(bool Stores, unsigned Opcode,
+                              Type *VecTy) override {
+    return Impl.preferScalarizedMemFor(Stores, Opcode, VecTy);
+  }
+
   bool supportsEfficientVectorElementLoadStore() override {
     return Impl.supportsEfficientVectorElementLoadStore();
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -303,6 +303,10 @@
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                             unsigned VF) { return 0; }
 
+  bool preferScalarizedMemFor(bool Stores, unsigned Opcode, Type *VecTy) {
+    return false;
+  }
+
   bool supportsEfficientVectorElementLoadStore() { return false; }
 
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -251,6 +251,11 @@
   return TTIImpl->getOperandsScalarizationOverhead(Args, VF);
 }
 
+bool TargetTransformInfo::
+preferScalarizedMemFor(bool Stores, unsigned Opcode, Type *VecTy) const {
+  return TTIImpl->preferScalarizedMemFor(Stores, Opcode, VecTy);
+}
+
 bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
   return TTIImpl->supportsEfficientVectorElementLoadStore();
 }
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.h
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -70,6 +70,7 @@
   bool supportsEfficientVectorElementLoadStore() { return true; }
   bool enableInterleavedAccessVectorization() { return true; }
 
+  bool preferScalarizedMemFor(bool Stores, unsigned Opcode, Type *VecTy);
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -347,6 +347,27 @@
   return ((WideBits % 128U) ?
           ((WideBits / 128U) + 1) : (WideBits / 128U));
 }
 
+bool SystemZTTIImpl::
+preferScalarizedMemFor(bool Stores, unsigned Opcode, Type *VecTy) {
+  assert(VecTy->isVectorTy());
+  if (Stores && getScalarSizeInBits(VecTy) != 64)
+    return false;
+  if (!Stores && VecTy->isFPOrFPVectorTy())
+    return false;
+
+  // These opcodes may appear to map to expanded vector DAG nodes here, but
+  // in fact they are not expanded.
+  if ((Opcode == Instruction::Select) || (Opcode == Instruction::SExt) ||
+      (Opcode == Instruction::ZExt) || (Opcode == Instruction::Trunc))
+    return false;
+  const TargetLoweringBase *TLI = getTLI();
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  if (!ISD)
+    return false;
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);
+  return TLI->isOperationExpand(ISD, LT.second);
+}
+
 int SystemZTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty,
     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1105,6 +1105,8 @@
     return isScalarWithPredication(I);
   }
 
+  bool isScalarWithoutPredication(Instruction *I, unsigned VF);
+
   /// Returns true if \p I is a memory instruction with consecutive memory
   /// access that can be widened.
   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
@@ -1166,14 +1168,16 @@
 
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
+  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF,
+                                         bool InPassedVF = true);
 
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
-  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
+  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy,
+                              bool InPassedVF);
 
   /// Calculate vectorization cost of memory instruction \p I.
-  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
+  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF, bool InPassedVF = true);
 
   /// The cost computation for scalarized memory instruction.
   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
@@ -1263,6 +1267,9 @@
   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                               unsigned VF);
 
+  int computeInstDiscount(Instruction *Inst, ScalarCostsTy &ScalarCosts,
+                          unsigned VF);
+
   /// Collect the instructions that are uniform after vectorization. An
   /// instruction is uniform if we represent it with a single scalar value in
   /// the vectorized loop corresponding to each vector iteration. Examples of
@@ -4378,6 +4385,21 @@
   return false;
 }
 
+bool LoopVectorizationCostModel::isScalarWithoutPredication(Instruction *I,
+                                                            unsigned VF) {
+  if (isa<LoadInst>(I) || isa<StoreInst>(I))
+    return false;
+
+  // A TTI query could be used to ask the target whether the instruction will
+  // be expanded during instruction selection, but that should be equivalent
+  // to finding a vector cost greater than the sum of the scalar costs.
+  assert(VF > 1 && "Should only be called with VF > 1.");
+  unsigned ScalarCost =
+      getInstructionCost(I, 1, false/*InPassedVF*/).first * VF;
+  unsigned VectorCost = getInstructionCost(I, VF).first;
+  return (ScalarCost < VectorCost);
+}
+
 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
                                                                unsigned VF) {
   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
@@ -5253,6 +5275,21 @@
         PredicatedBBsAfterVectorization.insert(BB);
       }
   }
+
+  // Find sequences of non-predicated instructions that have a lower cost if
+  // scalarized. These are typically instructions for which the target has no
+  // native vector support.
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    if (blockNeedsPredication(BB))
+      continue;
+    for (Instruction &I : *BB)
+      if (!isScalarAfterVectorization(&I, VF) && !ScalarCostsVF.count(&I) &&
+          isScalarWithoutPredication(&I, VF)) {
+        ScalarCostsTy ScalarCosts;
+        if (computeInstDiscount(&I, ScalarCosts, VF) >= 0)
+          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+      }
+  }
 }
 
 int LoopVectorizationCostModel::computePredInstDiscount(
@@ -5370,6 +5407,71 @@
   return Discount;
 }
 
+int LoopVectorizationCostModel::computeInstDiscount(
+    Instruction *Inst, DenseMap<Instruction *, unsigned> &ScalarCosts,
+    unsigned VF) {
+
+  // Initialize the discount to zero, meaning that the scalar version and the
+  // vector version cost the same.
+  int Discount = 0;
+
+  auto needsExtract = [&](Instruction *I) -> bool {
+    if (isa<LoadInst>(I))
+      return getWideningDecision(I, VF) != CM_Scalarize;
+    return (TheLoop->contains(I) && !isScalarAfterVectorization(I, VF) &&
+            !ScalarCosts.count(I));
+  };
+
+  // Holds instructions to analyze. Those instructions are the ones that
+  // would be scalarized if we find that the scalar version costs less.
+  SmallVector<Instruction *, 8> Worklist;
+  Worklist.push_back(Inst);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+    unsigned VectorCost = getInstructionCost(I, VF).first;
+    unsigned ScalarCost =
+        VF * getInstructionCost(I, 1, false/*InPassedVF*/).first;
+
+    // Operands: add an extract cost for each operand that remains a vector.
+    for (Use &U : I->operands())
+      if (auto *J = dyn_cast<Instruction>(U.get())) {
+        assert(VectorType::isValidElementType(J->getType()) &&
+               "Instruction has non-scalar type");
+        if (needsExtract(J))
+          ScalarCost += TTI.getScalarizationOverhead(
+              ToVectorTy(J->getType(), VF), false, true /*Extract*/);
+      }
+
+    // Users: check whether the result is needed as a vector by any user.
+    SmallVector<Instruction *, 8> TmpWorklist;
+    bool VectorUse = false;
+    for (User *U : I->users()) {
+      Instruction *UI = cast<Instruction>(U);
+      if (isScalarWithoutPredication(UI, VF))
+        TmpWorklist.push_back(UI);
+      else if (isa<StoreInst>(UI) &&
+               getWideningDecision(UI, VF) == CM_Scalarize)
+        continue;
+      else {
+        VectorUse = true;
+        break;
+      }
+    }
+    if (!VectorUse)
+      Worklist.append(TmpWorklist.begin(), TmpWorklist.end());
+    else
+      ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
+                                                 true /*Insert*/, false);
+
+    // Compute the discount. A non-negative discount means the vector version
+    // of the instruction costs more, and scalarizing would be beneficial.
+    Discount += VectorCost - ScalarCost;
+    ScalarCosts[I] = ScalarCost;
+  }
+
+  return Discount;
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::expectedCost(unsigned VF) {
   VectorizationCostTy Cost;
@@ -5587,7 +5689,8 @@
 
 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
-                                                              unsigned VF) {
+                                                              unsigned VF,
+                                                              bool InPassedVF) {
   // Calculate scalar cost only. Vectorization cost should be ready at this
   // moment.
   if (VF == 1) {
@@ -5596,13 +5699,15 @@
     unsigned AS = getLoadStoreAddressSpace(I);
 
     return TTI.getAddressComputationCost(ValTy) +
-           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
+           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
+                               (InPassedVF ? I : nullptr));
   }
   return getWideningCost(I, VF);
 }
 
 LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
+LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF,
+                                               bool InPassedVF) {
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
   if (isUniformAfterVectorization(I, VF))
@@ -5620,7 +5725,7 @@
   }
 
   Type *VectorTy;
-  unsigned C = getInstructionCost(I, VF, VectorTy);
+  unsigned C = getInstructionCost(I, VF, VectorTy, InPassedVF);
 
   bool TypeNotScalarized =
       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
@@ -5659,16 +5764,61 @@
         continue;
       }
 
-      // We assume that widening is the best solution when possible.
-      if (memoryInstructionCanBeWidened(&I, VF)) {
-        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
+      // Find cases where scalarization is cheaper than widening plus the
+      // scalarization overhead.
+      const unsigned WidenCost = memoryInstructionCanBeWidened(&I, VF)
+                                     ? getConsecutiveMemOpCost(&I, VF)
+                                     : UINT_MAX;
+      bool LoadToScalarized = isa<LoadInst>(&I);
+      if (LoadToScalarized)
+        for (User *U : I.users())
+          if (!TTI.preferScalarizedMemFor(false/*Stores*/,
+                                          cast<Instruction>(U)->getOpcode(),
+                                          ToVectorTy(I.getType(), VF))) {
+            LoadToScalarized = false;
+            break;
+          }
+      bool StoreOfScalarized = false;
+      if (isa<StoreInst>(&I))
+        if (auto *OpAsIns = dyn_cast<Instruction>(I.getOperand(0)))
+          if (TTI.preferScalarizedMemFor(true/*Stores*/,
+                                         OpAsIns->getOpcode(),
+                                         ToVectorTy(OpAsIns->getType(), VF)))
+            StoreOfScalarized = true;
+      if (LoadToScalarized || StoreOfScalarized) {
+        // Assume that scalarization is best unless the cost functions give a
+        // smaller sum for widening + overhead.
+        unsigned WidenPlusOverheadCost = WidenCost;
+        if (WidenCost < UINT_MAX) {
+          if (LoadToScalarized) {
+            WidenPlusOverheadCost += TTI.getScalarizationOverhead(
+                ToVectorTy(I.getType(), VF), false, true /*Extract*/);
+          } else if (StoreOfScalarized) {
+            WidenPlusOverheadCost += TTI.getScalarizationOverhead(
+                ToVectorTy(I.getOperand(0)->getType(), VF), true /*Insert*/,
+                false);
+          }
+        }
+        unsigned ScalarizationCost =
+            VF * getInstructionCost(&I, 1, false/*InPassedVF*/).first;
+        if (ScalarizationCost < WidenPlusOverheadCost) {
+          // The cost does not include scalarization overhead, since the
+          // user(s)/def will also be scalarized, as computeInstDiscount()
+          // will recognize later.
+          setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
+          continue;
+        }
+      }
+
+      if (WidenCost < UINT_MAX) {
+        // Assume now that widening is the best solution when possible.
         int ConsecutiveStride =
-          Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
+            Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
                "Expected consecutive stride.");
         InstWidening Decision = ConsecutiveStride == 1 ?
                                     CM_Widen : CM_Widen_Reverse;
-        setWideningDecision(&I, VF, Decision, Cost);
+        setWideningDecision(&I, VF, Decision, WidenCost);
         continue;
       }
 
@@ -5781,7 +5931,8 @@
 
 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                         unsigned VF,
-                                                        Type *&VectorTy) {
+                                                        Type *&VectorTy,
+                                                        bool InPassedVF) {
   Type *RetTy = I->getType();
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -5915,7 +6066,8 @@
     if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);
 
-    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
+    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
+                                  (InPassedVF ? I : nullptr));
   }
   case Instruction::ICmp:
   case Instruction::FCmp: {
@@ -5924,7 +6076,8 @@
     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
     VectorTy = ToVectorTy(ValTy, VF);
-    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
+    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
+                                  (InPassedVF ? I : nullptr));
   }
   case Instruction::Store:
   case Instruction::Load: {
@@ -5937,7 +6090,7 @@
       Width = 1;
     }
     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
-    return getMemoryInstructionCost(I, VF);
+    return getMemoryInstructionCost(I, VF, InPassedVF);
   }
   case Instruction::ZExt:
   case Instruction::SExt:
@@ -5983,7 +6136,8 @@
     }
 
     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
-    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
+    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy,
+                                    (InPassedVF ? I : nullptr));
  }
  case Instruction::Call: {
    bool NeedToScalarize;
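
Illustrative note, not part of the patch: the sketch below shows how another
target could opt in to the new TTI hook, mirroring the SystemZ override above.
The class name HypoTTIImpl and the restriction to integer division are
assumptions made only for this example; by default the hook stays disabled for
every other target.

// Hypothetical target override (illustration only). Prefer scalarized
// loads/stores that are connected to a vector operation the legalizer
// would expand anyway.
bool HypoTTIImpl::preferScalarizedMemFor(bool Stores, unsigned Opcode,
                                         Type *VecTy) {
  assert(VecTy->isVectorTy() && "Expected a vector type");
  (void)Stores; // This sketch does not distinguish loads from stores.
  // Assumption for the example: only consider integer division, which this
  // hypothetical target has no native vector instructions for.
  if (Opcode != Instruction::UDiv && Opcode != Instruction::SDiv)
    return false;
  // Defer to the legalizer: if the operation on the legalized vector type
  // would be expanded (scalarized), prefer a scalarized memory access too.
  const TargetLoweringBase *TLI = getTLI();
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  if (!ISD)
    return false;
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);
  return TLI->isOperationExpand(ISD, LT.second);
}

With such an override in place, setCostBasedWideningDecision() above compares
the fully scalarized memory cost against widening plus the insert/extract
overhead before committing to CM_Scalarize.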