Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -559,6 +559,7 @@
     if (Subtarget.hasVSX()) {
       setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
       if (Subtarget.hasP8Vector()) {
         setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
         setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
@@ -572,6 +573,7 @@
         setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
         setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
         setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
+        setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
       }

       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
Index: lib/Target/PowerPC/PPCInstrVSX.td
===================================================================
--- lib/Target/PowerPC/PPCInstrVSX.td
+++ lib/Target/PowerPC/PPCInstrVSX.td
@@ -850,6 +850,10 @@
           (f64 (EXTRACT_SUBREG $S, sub_64))>;
 def : Pat<(f64 (extractelt v2f64:$S, 1)),
           (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
+def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 0)),
+          (v2f64 (XXPERMDI (COPY_TO_REGCLASS $B, VSRC), $A, 1))>;
+def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 1)),
+          (v2f64 (XXPERMDI $A, (COPY_TO_REGCLASS $B, VSRC), 0))>;
 }

 let Predicates = [IsLittleEndian] in {
@@ -861,6 +865,10 @@
           (f64 (EXTRACT_SUBREG $S, sub_64))>;
 def : Pat<(f64 (extractelt v2f64:$S, 0)),
           (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
 def : Pat<(f64 (extractelt v2f64:$S, 1)),
           (f64 (EXTRACT_SUBREG $S, sub_64))>;
+def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 0)),
+          (v2f64 (XXPERMDI $A, (COPY_TO_REGCLASS $B, VSRC), 0))>;
+def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 1)),
+          (v2f64 (XXPERMDI (COPY_TO_REGCLASS $B, VSRC), $A, 1))>;
 }

 // Additional fnmsub patterns: -a*c + b == -(a*c - b)
@@ -1657,6 +1665,12 @@
             (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>;
   def : Pat<(v2i64 (scalar_to_vector i64:$A)),
             (v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>;
+  def : Pat<(v2i64 (insertelt v2i64:$V, i64:$A, 0)),
+            (v2i64 (XXPERMDI
+                      (COPY_TO_REGCLASS MovesToVSR.BE_DWORD_0, VSRC), $V, 1))>;
+  def : Pat<(v2i64 (insertelt v2i64:$V, i64:$A, 1)),
+            (v2i64 (XXPERMDI
+                      $V, (COPY_TO_REGCLASS MovesToVSR.BE_DWORD_0, VSRC), 0))>;
   def : Pat<(i32 (vector_extract v16i8:$S, 0)),
             (i32 VectorExtractions.LE_BYTE_15)>;
   def : Pat<(i32 (vector_extract v16i8:$S, 1)),
@@ -1764,6 +1778,12 @@
             (v4i32 MovesToVSR.LE_WORD_0)>;
   def : Pat<(v2i64 (scalar_to_vector i64:$A)),
             (v2i64 MovesToVSR.LE_DWORD_0)>;
+  def : Pat<(v2i64 (insertelt v2i64:$V, i64:$A, 0)),
+            (v2i64 (XXPERMDI
+                      $V, (COPY_TO_REGCLASS MovesToVSR.BE_DWORD_0, VSRC), 0))>;
+  def : Pat<(v2i64 (insertelt v2i64:$V, i64:$A, 1)),
+            (v2i64 (XXPERMDI
+                      (COPY_TO_REGCLASS MovesToVSR.BE_DWORD_0, VSRC), $V, 1))>;
   def : Pat<(i32 (vector_extract v16i8:$S, 0)),
             (i32 VectorExtractions.LE_BYTE_0)>;
   def : Pat<(i32 (vector_extract v16i8:$S, 1)),
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -315,16 +315,11 @@
 int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
   assert(Val->isVectorTy() && "This must be a vector type");
+  int DirectMoveCost = 5;

   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD &&
          "Invalid opcode");

-  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
-    // Double-precision scalars are already located in index #0.
-    if (Index == 0)
-      return 0;
-
-    return BaseT::getVectorInstrCost(Opcode, Val, Index);
-  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
+  if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
     // Floating point scalars are already located in index #0.
     if (Index == 0)
       return 0;
@@ -332,6 +327,75 @@
     return BaseT::getVectorInstrCost(Opcode, Val, Index);
   }

+  // Handle vector insertions and extractions differently because extractions
+  // are inherently more efficient on Power hardware prior to Power9.
+  if (ISD == ISD::EXTRACT_VECTOR_ELT) {
+    // Doubles are already at index 1/0 (LE/BE).
+    if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
+      if (Index == (ST->isLittleEndian() ? 1 : 0))
+        return 0;
+      else // Otherwise it needs a swap.
+        return 1;
+    }
+
+    // Integers are moved out of the following indices (LE/BE):
+    //   8-bit  -> 8/7
+    //   16-bit -> 3/4
+    //   32-bit -> 2/1
+    //   64-bit -> 1/0
+    auto isAtNaturalIndex = [=](int BitWidth, bool IsLE) -> bool {
+      if (IsLE)
+        return (BitWidth == 64 && Index == 1) ||
+               (BitWidth == 32 && Index == 2) ||
+               (BitWidth == 16 && Index == 3) ||
+               (BitWidth == 8 && Index == 8);
+      return (BitWidth == 64 && Index == 0) ||
+             (BitWidth == 32 && Index == 1) ||
+             (BitWidth == 16 && Index == 4) ||
+             (BitWidth == 8 && Index == 7);
+    };
+
+    if (Val->getScalarType()->isIntegerTy() && ST->hasDirectMove() &&
+        isAtNaturalIndex(Val->getScalarSizeInBits(), ST->isLittleEndian()))
+      return DirectMoveCost;
+    else if (ST->hasDirectMove() && Index != -1U)
+      // Otherwise they need repositioning in the vector register.
+      return DirectMoveCost + 1;
+  } else if (ISD == ISD::INSERT_VECTOR_ELT) {
+    // Doubles only need a variant of XXPERMDI to be inserted (when the index
+    // is constant).
+    if (ST->hasVSX() && Val->getScalarType()->isDoubleTy() && Index != -1U)
+      return 1;
+
+    // On Power9 we do not need to permute the vectors when inserting a 32-bit
+    // value. Floating point gets converted and inserted; integers are moved
+    // and inserted.
+    if (ST->hasP9Vector() && Val->getScalarType()->isFloatTy() && Index != -1U)
+      return 2; // Convert + insert.
+    if (ST->hasP9Vector() && Val->getScalarType()->isIntegerTy(32) &&
+        Index != -1U)
+      return DirectMoveCost + 1; // Move + insert.
+
+    // 64-bit integers need a direct move and a variant of XXPERMDI to be
+    // inserted (when the index is constant).
+    if (ST->hasDirectMove() && Val->getScalarType()->isIntegerTy(64) &&
+        Index != -1U)
+      return DirectMoveCost + 1;
+
+    // Smaller integers are inserted by moving into the VSR, loading a permute
+    // mask and permuting the vectors.
+    if (ST->hasDirectMove() && Val->getScalarType()->isIntegerTy() &&
+        Index != -1U)
+      return DirectMoveCost + 3;
+  } else
+    return BaseT::getVectorInstrCost(Opcode, Val, Index);
+
+  // Other inserts/extracts will incur an LHS penalty.
+
   // Estimated cost of a load-hit-store delay.  This was obtained
   // experimentally as a minimum needed to prevent unprofitable
   // vectorization for the paq8p benchmark.  It may need to be
It may need to be Index: test/Analysis/CostModel/PowerPC/insert_extract.ll =================================================================== --- test/Analysis/CostModel/PowerPC/insert_extract.ll +++ test/Analysis/CostModel/PowerPC/insert_extract.ll @@ -9,7 +9,7 @@ } define i32 @extract(<4 x i32> %arg) { - ; CHECK: cost of 3 {{.*}} extractelement + ; CHECK: cost of 5 {{.*}} extractelement %x = extractelement <4 x i32> %arg, i32 0 ret i32 %x } Index: test/CodeGen/PowerPC/swaps-le-5.ll =================================================================== --- test/CodeGen/PowerPC/swaps-le-5.ll +++ test/CodeGen/PowerPC/swaps-le-5.ll @@ -15,11 +15,12 @@ } ; CHECK-LABEL: @bar0 -; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] -; CHECK-DAG: xxspltd [[REG2:[0-9]+]] -; CHECK: xxpermdi [[REG3:[0-9]+]], [[REG2]], [[REG1]], 1 -; CHECK: stxvd2x [[REG3]] +; CHECK-DAG: xxswapd [[REG1:[0-9]+]], 1 +; CHECK-DAG: lxvd2x [[REG2:[0-9]+]] +; CHECK-NOT: xxswapd +; CHECK:xxmrgld [[REG3:[0-9]+]], [[REG1]], [[REG2]] ; CHECK-NOT: xxswapd +; CHECK: stxvd2x [[REG3]] define void @bar1(double %y) { entry: @@ -30,9 +31,9 @@ } ; CHECK-LABEL: @bar1 -; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] -; CHECK-DAG: xxspltd [[REG2:[0-9]+]] -; CHECK: xxmrghd [[REG3:[0-9]+]], [[REG1]], [[REG2]] +; CHECK-DAG: xxswapd [[REG1:[0-9]+]], 1 +; CHECK-DAG: lxvd2x [[REG2:[0-9]+]] +; CHECK: xxpermdi [[REG3:[0-9]+]], [[REG2]], [[REG1]], 1 ; CHECK: stxvd2x [[REG3]] ; CHECK-NOT: xxswapd Index: test/CodeGen/PowerPC/swaps-le-6.ll =================================================================== --- test/CodeGen/PowerPC/swaps-le-6.ll +++ test/CodeGen/PowerPC/swaps-le-6.ll @@ -20,8 +20,9 @@ ; CHECK-LABEL: @bar0 ; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] ; CHECK-DAG: lxsdx [[REG2:[0-9]+]] -; CHECK: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 -; CHECK: xxpermdi [[REG5:[0-9]+]], [[REG4]], [[REG1]], 1 +; CHECK: xxswapd [[REG4:[0-9]+]], [[REG2]] +; CHECK: xxmrgld [[REG5:[0-9]+]], [[REG4]], [[REG1]] +; CHECK-NOT: xxswapd ; CHECK: stxvd2x [[REG5]] define void @bar1() { @@ -36,7 +37,8 @@ ; CHECK-LABEL: @bar1 ; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] ; CHECK-DAG: lxsdx [[REG2:[0-9]+]] -; CHECK: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 -; CHECK: xxmrghd [[REG5:[0-9]+]], [[REG1]], [[REG4]] +; CHECK: xxswapd [[REG4:[0-9]+]], [[REG2]] +; CHECK: xxpermdi [[REG5:[0-9]+]], [[REG1]], [[REG4]], 1 +; CHECK-NOT: xxswapd ; CHECK: stxvd2x [[REG5]] Index: test/CodeGen/PowerPC/vsx_insert_extract_le.ll =================================================================== --- test/CodeGen/PowerPC/vsx_insert_extract_le.ll +++ test/CodeGen/PowerPC/vsx_insert_extract_le.ll @@ -10,8 +10,7 @@ ; CHECK: lxvd2x 0, 0, 3 ; CHECK: lxsdx 1, 0, 4 ; CHECK: xxswapd 0, 0 -; CHECK: xxspltd 1, 1, 0 -; CHECK: xxpermdi 34, 0, 1, 1 +; CHECK: xxmrghd 34, 0, 1 } define <2 x double> @testi1(<2 x double>* %p1, double* %p2) { @@ -24,8 +23,7 @@ ; CHECK: lxvd2x 0, 0, 3 ; CHECK: lxsdx 1, 0, 4 ; CHECK: xxswapd 0, 0 -; CHECK: xxspltd 1, 1, 0 -; CHECK: xxmrgld 34, 1, 0 +; CHECK: xxpermdi 34, 1, 0, 1 } define double @teste0(<2 x double>* %p1) {