Index: llvm/trunk/lib/Target/PowerPC/PPC.td =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPC.td +++ llvm/trunk/lib/Target/PowerPC/PPC.td @@ -190,6 +190,13 @@ "Enable POWER9 vector instructions", [FeatureISA3_0, FeatureP8Vector, FeatureP9Altivec]>; +// A separate feature for this even though it is equivalent to P9Vector +// because this is a feature of the implementation rather than the architecture +// and may go away with future CPUs. +def FeatureVectorsUseTwoUnits : SubtargetFeature<"vectors-use-two-units", + "VectorsUseTwoUnits", + "true", + "Vectors use two units">; // Since new processors generally contain a superset of features of those that // came before them, the idea is to make implementations of new processors @@ -222,7 +229,8 @@ list<SubtargetFeature> Power8FeatureList = !listconcat(Power7FeatureList, Power8SpecificFeatures); list<SubtargetFeature> Power9SpecificFeatures = - [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0]; + [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0, + FeatureVectorsUseTwoUnits]; list<SubtargetFeature> Power9FeatureList = !listconcat(Power8FeatureList, Power9SpecificFeatures); } Index: llvm/trunk/lib/Target/PowerPC/PPCSubtarget.h =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPCSubtarget.h +++ llvm/trunk/lib/Target/PowerPC/PPCSubtarget.h @@ -135,6 +135,7 @@ bool IsISA3_0; bool UseLongCalls; bool SecurePlt; + bool VectorsUseTwoUnits; POPCNTDKind HasPOPCNTD; @@ -259,6 +260,7 @@ bool isPPC4xx() const { return IsPPC4xx; } bool isPPC6xx() const { return IsPPC6xx; } bool isSecurePlt() const {return SecurePlt; } + bool vectorsUseTwoUnits() const {return VectorsUseTwoUnits; } bool isE500() const { return IsE500; } bool isFeatureMFTB() const { return FeatureMFTB; } bool isDeprecatedDST() const { return DeprecatedDST; } Index: llvm/trunk/lib/Target/PowerPC/PPCSubtarget.cpp =================================================================== ---
llvm/trunk/lib/Target/PowerPC/PPCSubtarget.cpp +++ llvm/trunk/lib/Target/PowerPC/PPCSubtarget.cpp @@ -107,6 +107,7 @@ IsISA3_0 = false; UseLongCalls = false; SecurePlt = false; + VectorsUseTwoUnits = false; HasPOPCNTD = POPCNTD_Unavailable; } Index: llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -70,6 +70,7 @@ unsigned getCacheLineSize(); unsigned getPrefetchDistance(); unsigned getMaxInterleaveFactor(unsigned VF); + int vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1, Type *Ty2); int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, Index: llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -323,6 +323,32 @@ return 2; } +// Adjust the cost of vector instructions on targets on which there is overlap +// between the vector and scalar units, thereby reducing the overall throughput +// of vector code wrt. scalar code. +int PPCTTIImpl::vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1, + Type *Ty2) { + if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy()) + return Cost; + + std::pair<int, MVT> LT1 = TLI->getTypeLegalizationCost(DL, Ty1); + // If type legalization involves splitting the vector, we don't want to + // double the cost at every step - only the last step.
+ if (LT1.first != 1 || !LT1.second.isVector()) + return Cost; + int ISD = TLI->InstructionOpcodeToISD(Opcode); + if (TLI->isOperationExpand(ISD, LT1.second)) + return Cost; + + if (Ty2) { + std::pair<int, MVT> LT2 = TLI->getTypeLegalizationCost(DL, Ty2); + if (LT2.first != 1 || !LT2.second.isVector()) + return Cost; + } + + return Cost * 2; +} + int PPCTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, @@ -330,8 +356,9 @@ assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); // Fallback to the default implementation. - return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, - Opd1PropInfo, Opd2PropInfo); + int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, + Opd1PropInfo, Opd2PropInfo); + return vectorCostAdjustment(Cost, Opcode, Ty, nullptr); } int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, @@ -344,19 +371,22 @@ // instruction). We need one such shuffle instruction for each actual // register (this is not true for arbitrary shuffles, but is true for the // structured types of shuffles covered by TTI::ShuffleKind).
- return LT.first; + return vectorCostAdjustment(LT.first, Instruction::ShuffleVector, Tp, + nullptr); } int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); - return BaseT::getCastInstrCost(Opcode, Dst, Src); + int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src); + return vectorCostAdjustment(Cost, Opcode, Dst, Src); } int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I) { - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); + int Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); + return vectorCostAdjustment(Cost, Opcode, ValTy, nullptr); } int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { @@ -365,18 +395,22 @@ int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + int Cost = BaseT::getVectorInstrCost(Opcode, Val, Index); + Cost = vectorCostAdjustment(Cost, Opcode, Val, nullptr); + if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) { - // Double-precision scalars are already located in index #0. - if (Index == 0) + // Double-precision scalars are already located in index #0 (or #1 if LE). + if (ISD == ISD::EXTRACT_VECTOR_ELT && Index == (ST->isLittleEndian() ? 1 : 0)) return 0; - return BaseT::getVectorInstrCost(Opcode, Val, Index); + return Cost; + } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) { // Floating point scalars are already located in index #0. if (Index == 0) return 0; - return BaseT::getVectorInstrCost(Opcode, Val, Index); + return Cost; } // Estimated cost of a load-hit-store delay. This was obtained @@ -393,9 +427,9 @@ // these need to be estimated as very costly.
if (ISD == ISD::EXTRACT_VECTOR_ELT || ISD == ISD::INSERT_VECTOR_ELT) - return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index); + return LHSPenalty + Cost; - return BaseT::getVectorInstrCost(Opcode, Val, Index); + return Cost; } int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, @@ -406,6 +440,7 @@ "Invalid Opcode"); int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); + Cost = vectorCostAdjustment(Cost, Opcode, Src, nullptr); bool IsAltivecType = ST->hasAltivec() && (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 || Index: llvm/trunk/test/Analysis/CostModel/PowerPC/p9.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/PowerPC/p9.ll +++ llvm/trunk/test/Analysis/CostModel/PowerPC/p9.ll @@ -0,0 +1,68 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck %s +; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 -mattr=+vsx | FileCheck --check-prefix=CHECK-P9 %s +; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -mattr=+vsx | FileCheck --check-prefix=CHECK-LE %s + +define void @testi16(i16 %arg1, i16 %arg2, i16* %arg3) { + + %s1 = add i16 %arg1, %arg2 + %s2 = zext i16 %arg1 to i32 + %s3 = load i16, i16* %arg3 + store i16 %arg2, i16* %arg3 + %c = icmp eq i16 %arg1, %arg2 + + ret void + ; CHECK: cost of 1 {{.*}} add + ; CHECK: cost of 1 {{.*}} zext + ; CHECK: cost of 1 {{.*}} load + ; CHECK: cost of 1 {{.*}} store + ; CHECK: cost of 1 {{.*}} icmp + ; CHECK-P9: cost of 1 {{.*}} add + ; CHECK-P9: cost of 1 {{.*}} zext + ; CHECK-P9: cost of 1 {{.*}} load + ; CHECK-P9: cost of 1 {{.*}} store + ; CHECK-P9: cost of 1 {{.*}} icmp +} + +define void @test4xi16(<4 x i16> %arg1, <4 x i16> %arg2) { + + %v1 = add <4 x i16> %arg1, %arg2 + %v2 = zext <4 x i16> %arg1 to <4 x i32> + %v3 = shufflevector <4 x i16> %arg1, <4 x i16> undef, <4 x 
i32> zeroinitializer + %c = icmp eq <4 x i16> %arg1, %arg2 + + ret void + ; CHECK: cost of 1 {{.*}} add + ; CHECK: cost of 1 {{.*}} zext + ; CHECK: cost of 1 {{.*}} shufflevector + ; CHECK: cost of 1 {{.*}} icmp + ; CHECK-P9: cost of 2 {{.*}} add + ; CHECK-P9: cost of 2 {{.*}} zext + ; CHECK-P9: cost of 2 {{.*}} shufflevector + ; CHECK-P9: cost of 2 {{.*}} icmp +} + +define void @test4xi32(<4 x i32> %arg1, <4 x i32> %arg2, <4 x i32>* %arg3) { + + %v1 = load <4 x i32>, <4 x i32>* %arg3 + store <4 x i32> %arg2, <4 x i32>* %arg3 + + ret void + ; CHECK: cost of 1 {{.*}} load + ; CHECK: cost of 1 {{.*}} store + ; CHECK-P9: cost of 2 {{.*}} load + ; CHECK-P9: cost of 2 {{.*}} store +} + +define void @test2xdouble(<2 x double> %arg1) { + %v1 = extractelement <2 x double> %arg1, i32 0 + %v2 = extractelement <2 x double> %arg1, i32 1 + + ret void + ; CHECK: cost of 0 {{.*}} extractelement + ; CHECK: cost of 1 {{.*}} extractelement + ; CHECK-P9: cost of 0 {{.*}} extractelement + ; CHECK-P9: cost of 2 {{.*}} extractelement + ; CHECK-LE-LABEL: test2xdouble + ; CHECK-LE: cost of 2 {{.*}} extractelement + ; CHECK-LE: cost of 0 {{.*}} extractelement +} Index: llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/short-to-double.ll =================================================================== --- llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/short-to-double.ll +++ llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/short-to-double.ll @@ -0,0 +1,39 @@ +; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr9 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s --check-prefix=CHECK-P9 +; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr8 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s --check-prefix=CHECK-P8 + +%struct._pp = type { i16, i16, i16, i16 } + +; Function Attrs: norecurse nounwind readonly +define [5 x double] @foo(double %k, i64 %n, %struct._pp* nocapture readonly %p) local_unnamed_addr #0 { +entry: + %cmp17 = icmp sgt i64 %n, 0 + br i1 %cmp17, label %for.body, label 
%for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + %retval.sroa.0.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ] + %retval.sroa.4.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add10, %for.body ] + %.fca.0.insert = insertvalue [5 x double] undef, double %retval.sroa.0.0.lcssa, 0 + %.fca.1.insert = insertvalue [5 x double] %.fca.0.insert, double %retval.sroa.4.0.lcssa, 1 + ret [5 x double] %.fca.1.insert + +for.body: ; preds = %entry, %for.body + %i.020 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %retval.sroa.4.019 = phi double [ %add10, %for.body ], [ 0.000000e+00, %entry ] + %retval.sroa.0.018 = phi double [ %add, %for.body ], [ 0.000000e+00, %entry ] + %r1 = getelementptr inbounds %struct._pp, %struct._pp* %p, i64 %i.020, i32 2 + %0 = load i16, i16* %r1, align 2 + %conv2 = uitofp i16 %0 to double + %mul = fmul double %conv2, %k + %add = fadd double %retval.sroa.0.018, %mul + %g5 = getelementptr inbounds %struct._pp, %struct._pp* %p, i64 %i.020, i32 1 + %1 = load i16, i16* %g5, align 2 + %conv7 = uitofp i16 %1 to double + %mul8 = fmul double %conv7, %k + %add10 = fadd double %retval.sroa.4.019, %mul8 + %inc = add nuw nsw i64 %i.020, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-P8: load <2 x i16> +; CHECK-P9-NOT: load <2 x i16>