Index: llvm/lib/Target/PowerPC/PPC.td =================================================================== --- llvm/lib/Target/PowerPC/PPC.td +++ llvm/lib/Target/PowerPC/PPC.td @@ -191,6 +191,13 @@ "Enable POWER9 vector instructions", [FeatureISA3_0, FeatureP8Vector, FeatureP9Altivec]>; +// A separate feature for this even though it is equivalent to P9Vector +// because this is a feature of the implementation rather than the architecture +// and may go away with future CPU's. +def FeatureVectorsUseTwoUnits : SubtargetFeature<"vectors-use-two-units", + "VectorsUseTwoUnits", + "true", + "Vectors use two units">; // Since new processors generally contain a superset of features of those that // came before them, the idea is to make implementations of new processors @@ -223,7 +230,8 @@ list<SubtargetFeature> Power8FeatureList = !listconcat(Power7FeatureList, Power8SpecificFeatures); list<SubtargetFeature> Power9SpecificFeatures = - [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0]; + [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0, + FeatureVectorsUseTwoUnits]; list<SubtargetFeature> Power9FeatureList = !listconcat(Power8FeatureList, Power9SpecificFeatures); } Index: llvm/lib/Target/PowerPC/PPCSubtarget.h =================================================================== --- llvm/lib/Target/PowerPC/PPCSubtarget.h +++ llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -136,6 +136,7 @@ bool IsISA3_0; bool UseLongCalls; bool SecurePlt; + bool VectorsUseTwoUnits; POPCNTDKind HasPOPCNTD; @@ -260,6 +261,7 @@ bool isPPC4xx() const { return IsPPC4xx; } bool isPPC6xx() const { return IsPPC6xx; } bool isSecurePlt() const {return SecurePlt; } + bool vectorsUseTwoUnits() const {return VectorsUseTwoUnits; } bool isE500() const { return IsE500; } bool isFeatureMFTB() const { return FeatureMFTB; } bool isDeprecatedDST() const { return DeprecatedDST; } Index: llvm/lib/Target/PowerPC/PPCSubtarget.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ 
llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -108,6 +108,7 @@ IsISA3_0 = false; UseLongCalls = false; SecurePlt = false; + VectorsUseTwoUnits = false; HasPOPCNTD = POPCNTD_Unavailable; } Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -328,11 +328,16 @@ unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) { - assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, + Opd1PropInfo, Opd2PropInfo); + + if (Ty->isVectorTy() && ST->vectorsUseTwoUnits()) + Cost *= 2; - // Fallback to the default implementation. - return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, - Opd1PropInfo, Opd2PropInfo); + return Cost; } int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, @@ -345,19 +350,48 @@ // instruction). We need one such shuffle instruction for each actual // register (this is not true for arbitrary shuffles, but is true for the // structured types of shuffles covered by TTI::ShuffleKind). 
- return LT.first; + int Cost = LT.first; + + if (ST->vectorsUseTwoUnits() && Tp->isVectorTy()) + Cost *= 2; + + return Cost; } int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I) { - assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src); + + if (Dst->isVectorTy() && ST->vectorsUseTwoUnits()) { + std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src); + std::pair<int, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst); - return BaseT::getCastInstrCost(Opcode, Dst, Src); + // The base class will call back if the vector is split. We only want + // to double the cost once, so avoid split cases. + if (SrcLT.first == 1 && SrcLT.second.isVector() && + DstLT.first == 1 && DstLT.second.isVector() && + !TLI->isOperationExpand(ISD, DstLT.second)) + Cost *= 2; + } + + return Cost; } int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I) { - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + int Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); + + if (ValTy->isVectorTy() && ST->vectorsUseTwoUnits()) + Cost *= 2; + + return Cost; } int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { @@ -366,18 +400,24 @@ int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + int Cost = BaseT::getVectorInstrCost(Opcode, Val, Index); + + if (Val->isVectorTy() && ST->vectorsUseTwoUnits()) + Cost *= 2; + if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) { - // Double-precision scalars are already located in index #0. - if (Index == 0) + if (ISD == ISD::EXTRACT_VECTOR_ELT && Index == (ST->isLittleEndian() ? 1 : 0)) + // Double-precision scalars are already located in index #0 (or #1 if LE). 
return 0; - return BaseT::getVectorInstrCost(Opcode, Val, Index); + return Cost; + } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) { // Floating point scalars are already located in index #0. if (Index == 0) return 0; - return BaseT::getVectorInstrCost(Opcode, Val, Index); + return Cost; } // Estimated cost of a load-hit-store delay. This was obtained @@ -394,9 +434,9 @@ // these need to be estimated as very costly. if (ISD == ISD::EXTRACT_VECTOR_ELT || ISD == ISD::INSERT_VECTOR_ELT) - return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index); + return LHSPenalty + Cost; - return BaseT::getVectorInstrCost(Opcode, Val, Index); + return Cost; } int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, @@ -408,6 +448,9 @@ int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); + if (Src->isVectorTy() && ST->vectorsUseTwoUnits()) + Cost *= 2; + bool IsAltivecType = ST->hasAltivec() && (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 || LT.second == MVT::v4i32 || LT.second == MVT::v4f32); Index: llvm/test/Transforms/SLPVectorizer/PowerPC/short-to-double.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SLPVectorizer/PowerPC/short-to-double.ll @@ -0,0 +1,39 @@ +; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr9 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s --check-prefix=CHECK-P9 +; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr8 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s --check-prefix=CHECK-P8 + +%struct._pp = type { i16, i16, i16, i16 } + +; Function Attrs: norecurse nounwind readonly +define [5 x double] @foo(double %k, i64 %n, %struct._pp* nocapture readonly %p) local_unnamed_addr #0 { +entry: + %cmp17 = icmp sgt i64 %n, 0 + br i1 %cmp17, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + %retval.sroa.0.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ] + 
%retval.sroa.4.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add10, %for.body ] + %.fca.0.insert = insertvalue [5 x double] undef, double %retval.sroa.0.0.lcssa, 0 + %.fca.1.insert = insertvalue [5 x double] %.fca.0.insert, double %retval.sroa.4.0.lcssa, 1 + ret [5 x double] %.fca.1.insert + +for.body: ; preds = %entry, %for.body + %i.020 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %retval.sroa.4.019 = phi double [ %add10, %for.body ], [ 0.000000e+00, %entry ] + %retval.sroa.0.018 = phi double [ %add, %for.body ], [ 0.000000e+00, %entry ] + %r1 = getelementptr inbounds %struct._pp, %struct._pp* %p, i64 %i.020, i32 2 + %0 = load i16, i16* %r1, align 2 + %conv2 = uitofp i16 %0 to double + %mul = fmul double %conv2, %k + %add = fadd double %retval.sroa.0.018, %mul + %g5 = getelementptr inbounds %struct._pp, %struct._pp* %p, i64 %i.020, i32 1 + %1 = load i16, i16* %g5, align 2 + %conv7 = uitofp i16 %1 to double + %mul8 = fmul double %conv7, %k + %add10 = fadd double %retval.sroa.4.019, %mul8 + %inc = add nuw nsw i64 %i.020, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-P8: load <2 x i16> +; CHECK-P9-NOT: load <2 x i16>