diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -139,6 +139,14 @@
   bool areFunctionArgsABICompatible(const Function *Caller,
                                     const Function *Callee,
                                     SmallPtrSetImpl<Argument *> &Args) const;
+  InstructionCost getVPMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
+                                    unsigned AddressSpace,
+                                    TTI::TargetCostKind CostKind,
+                                    const Instruction *I = nullptr);
+private:
+  // The following constant is used for estimating costs on power9.
+  static const InstructionCost::CostType P9PipelineFlushEstimate = 80;
+
   /// @}
 };
 
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -1337,3 +1337,99 @@
 
   return false;
 }
+
+/// Estimate the cost of a vector-predicated load or store of \p Src.
+///
+/// The base-class estimate is returned unchanged when the value type of
+/// \p Src is MVT::Other or when \p CostKind is not TCK_RecipThroughput.
+/// Otherwise the operation is costed either as a fully scalarized access
+/// (when it cannot be left as-is for native lowering) or as a legal VSX
+/// access with a Power9 misalignment penalty.
+///
+/// \p I may be null or a non-VP instruction; in that case the scalarized
+/// estimate is used because the legalization strategy cannot be queried.
+InstructionCost PPCTTIImpl::getVPMemoryOpCost(unsigned Opcode, Type *Src,
+                                              Align Alignment,
+                                              unsigned AddressSpace,
+                                              TTI::TargetCostKind CostKind,
+                                              const Instruction *I) {
+  InstructionCost Cost = BaseT::getVPMemoryOpCost(Opcode, Src, Alignment,
+                                                  AddressSpace, CostKind, I);
+  if (TLI->getValueType(DL, Src, true) == MVT::Other)
+    return Cost;
+  // TODO: Handle other cost kinds.
+  if (CostKind != TTI::TCK_RecipThroughput)
+    return Cost;
+
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+         "Invalid Opcode");
+
+  // Src is expected to be a fixed vector type; cast<> asserts that, so no
+  // separate null check is needed.
+  auto *SrcVTy = cast<FixedVectorType>(Src);
+
+  // I defaults to nullptr and is not guaranteed to be a VP intrinsic. Without
+  // one the legalization strategy cannot be queried, so conservatively assume
+  // the operation has to be scalarized.
+  const auto *VPI = dyn_cast_or_null<VPIntrinsic>(I);
+  if (!VPI || !getVPLegalizationStrategy(*VPI).shouldDoNothing()) {
+    const bool IsLoad = (Opcode == Instruction::Load);
+
+    // VSX masks have lanes per bit, but the predication is per halfword.
+    const unsigned NumElems = SrcVTy->getNumElements();
+    auto *SrcScalarTy = SrcVTy->getScalarType();
+    auto *MaskI8Ty = Type::getInt8Ty(SrcVTy->getContext());
+    auto *MaskTy = FixedVectorType::get(MaskI8Ty, NumElems);
+
+    // Currently we can only lower intrinsics with evl but no mask, on Power
+    // 9/10. Otherwise, we must scalarize. We need to extract (from the mask)
+    // the most/least significant byte of all halfwords aligned with vector
+    // elements, and do an access predicated on its 0th bit. We make the
+    // simplifying assumption that byte-extraction costs are stride-invariant,
+    // so we model the extraction as scalarizing a load of <NumElems x i8>.
+    const InstructionCost MaskSplitCost =
+        getScalarizationOverhead(MaskTy, false, true);
+
+    // Keep the arithmetic in InstructionCost so that an invalid component
+    // propagates to the result instead of being unwrapped into an int.
+    const InstructionCost ScalarCompareCost =
+        getCmpSelInstrCost(Instruction::ICmp, MaskI8Ty, nullptr,
+                           CmpInst::BAD_ICMP_PREDICATE, CostKind);
+    assert(ScalarCompareCost.isValid() && "Expected valid instruction cost");
+
+    const InstructionCost BranchCost =
+        getCFInstrCost(Instruction::Br, CostKind);
+    assert(BranchCost.isValid() && "Expected valid instruction cost");
+
+    // One mask-bit compare and one branch per element.
+    const InstructionCost MaskCmpCost =
+        NumElems * (BranchCost + ScalarCompareCost);
+
+    // Loads insert the scalar results into a vector; stores extract them.
+    const InstructionCost ValueSplitCost =
+        getScalarizationOverhead(SrcVTy, IsLoad, !IsLoad);
+    const InstructionCost ScalarMemOpCost =
+        NumElems * BaseT::getMemoryOpCost(Opcode, SrcScalarTy, Alignment,
+                                          AddressSpace, CostKind);
+    assert(ScalarMemOpCost.isValid() && "Expected valid instruction cost");
+
+    return ScalarMemOpCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
+  }
+
+  std::pair<InstructionCost, MVT> LT =
+      TLI->getTypeLegalizationCost(DL, SrcVTy);
+
+  // If the op is guaranteed to be aligned to 16 bytes (128 bits), then VSX
+  // masked memops cost the same as unmasked memops.
+  if (Alignment >= 16)
+    return LT.first;
+
+  // On P9 but not on P10, if the op is misaligned then it will cause a
+  // pipeline flush. We assume the average case: that ops with alignment below
+  // 16 bytes will flush a full pipeline about half the time. The cost when
+  // this happens is about 80 cycles (P9PipelineFlushEstimate).
+  if (ST->getCPUDirective() == PPC::DIR_PWR9)
+    return P9PipelineFlushEstimate / 2;
+
+  return LT.first;
+}