diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -138,6 +138,15 @@
                                     const Function *Callee,
                                     SmallPtrSetImpl<Argument *> &Args) const;
   bool hasActiveVectorLength(Type *DataType, Align Alignment) const;
+  InstructionCost getVPMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
+                                    unsigned AddressSpace,
+                                    TTI::TargetCostKind CostKind,
+                                    const Instruction *I = nullptr);
+
+private:
+  // Estimated cost (in cycles) of a full pipeline flush on Power9.
+  static const InstructionCost::CostType P9PipelineFlushEstimate = 80;
+
   /// @}
 };
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -1339,6 +1339,100 @@
 }
 
 bool PPCTTIImpl::hasActiveVectorLength(Type *DataType, Align Alignment) const {
-  // TODO
-  return false;
+  // Loads/stores with length instructions use bits 0-7 of the GPR operand and
+  // therefore cannot be used in 32-bit mode.
+  if ((!ST->hasP9Vector() && !ST->hasP10Vector()) || !ST->isPPC64())
+    return false;
+  if (auto *VecTy = dyn_cast<FixedVectorType>(DataType)) {
+    unsigned VecWidth = VecTy->getPrimitiveSizeInBits();
+    return VecWidth == 128;
+  }
+  Type *ScalarTy = DataType->getScalarType();
+
+  if (ScalarTy->isPointerTy())
+    return true;
+
+  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+    return true;
+
+  if (!ScalarTy->isIntegerTy())
+    return false;
+
+  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+  return IntWidth == 8 || IntWidth == 16 || IntWidth == 32 || IntWidth == 64;
+}
+
+InstructionCost PPCTTIImpl::getVPMemoryOpCost(unsigned Opcode, Type *Src,
+                                              Align Alignment,
+                                              unsigned AddressSpace,
+                                              TTI::TargetCostKind CostKind,
+                                              const Instruction *I) {
+  InstructionCost Cost = BaseT::getVPMemoryOpCost(Opcode, Src, Alignment,
+                                                  AddressSpace, CostKind, I);
+  if (TLI->getValueType(DL, Src, true) == MVT::Other)
+    return Cost;
+  // TODO: Handle other cost kinds.
+  if (CostKind != TTI::TCK_RecipThroughput)
+    return Cost;
+
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+         "Invalid Opcode");
+  bool IsLoad = (Opcode == Instruction::Load);
+
+  auto *SrcVTy = dyn_cast<FixedVectorType>(Src);
+  assert(SrcVTy && "Expected a vector type for VP memory operations");
+
+  if (hasActiveVectorLength(Src, Alignment)) {
+    std::pair<InstructionCost, MVT> LT =
+        TLI->getTypeLegalizationCost(DL, SrcVTy);
+    InstructionCost Cost = vectorCostAdjustment(LT.first, Opcode, Src, nullptr);
+
+    // On P9, but not on P10, a misaligned op causes a pipeline flush;
+    // otherwise the VSX masked memops cost the same as unmasked ones.
+    if (Alignment >= 16 || ST->getCPUDirective() != PPC::DIR_PWR9)
+      return Cost;
+
+    // Assume the average case: ops aligned to fewer than 16 bytes (128 bits)
+    // flush the full pipeline about half the time. The cost when this
+    // happens is about 80 cycles.
+    return P9PipelineFlushEstimate / 2;
+  }
+
+  // Usually we should not get to this point, but the following is an attempt
+  // to model the cost of legalization. Currently we can only lower intrinsics
+  // with an evl but no mask, on Power 9/10. Otherwise, we must scalarize: we
+  // need to extract (from the mask) the most/least significant byte of each
+  // halfword aligned with a vector element, and do an access predicated on
+  // its 0th bit.
+  // We make the simplifying assumption that byte-extraction costs are
+  // stride-invariant, so we model the extraction as scalarizing a load of
+  // <NumElems x i8>.
+
+  // VSX masks have a lane per bit, but the predication here is per halfword.
+  unsigned NumElems = SrcVTy->getNumElements();
+  auto *MaskI8Ty = Type::getInt8Ty(SrcVTy->getContext());
+  InstructionCost MaskSplitCost = getScalarizationOverhead(
+      FixedVectorType::get(MaskI8Ty, NumElems), false, true);
+  const InstructionCost ScalarCompareInstrCost =
+      getCmpSelInstrCost(Instruction::ICmp, MaskI8Ty, nullptr,
+                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
+  assert(ScalarCompareInstrCost.isValid() &&
+         "Expected valid instruction cost");
+  int ScalarCompareCost = *ScalarCompareInstrCost.getValue();
+
+  const InstructionCost BranchInstrCost =
+      getCFInstrCost(Instruction::Br, CostKind);
+  assert(BranchInstrCost.isValid() && "Expected valid instruction cost");
+  int BranchCost = *BranchInstrCost.getValue();
+  int MaskCmpCost = NumElems * (BranchCost + ScalarCompareCost);
+
+  InstructionCost ValueSplitCost =
+      getScalarizationOverhead(SrcVTy, IsLoad, !IsLoad);
+  const InstructionCost ScalarMemOpInstrCost =
+      NumElems * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+                                        Alignment, AddressSpace, CostKind);
+  assert(ScalarMemOpInstrCost.isValid() && "Expected valid instruction cost");
+  int ScalarMemOpCost = *ScalarMemOpInstrCost.getValue();
+  return ScalarMemOpCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
 }
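For reference, a minimal standalone sketch (not part of the patch) of the scalarization-fallback arithmetic above, using made-up placeholder per-unit costs; in the patch the actual values come from the TTI hooks (getScalarizationOverhead, getCmpSelInstrCost, getCFInstrCost, getMemoryOpCost).

// Hypothetical illustration of the fallback formula:
//   total = ScalarMemOpCost + ValueSplitCost + MaskSplitCost
//           + NumElems * (BranchCost + ScalarCompareCost)
// All numbers below are placeholders, not measured PowerPC costs.
#include <cstdio>

int main() {
  const int NumElems = 4;          // e.g. a <4 x i32> VP load
  const int MaskSplitCost = 4;     // extract one i8 per element from the mask
  const int ValueSplitCost = 4;    // insert/extract the data elements
  const int ScalarMemOpCost = 4;   // one scalar memory op per element
  const int ScalarCompareCost = 1; // icmp on each extracted mask byte
  const int BranchCost = 1;        // conditional branch per element

  const int MaskCmpCost = NumElems * (BranchCost + ScalarCompareCost);
  const int Total =
      ScalarMemOpCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
  std::printf("estimated scalarized VP memop cost: %d\n", Total);
  return 0;
}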