diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3410,6 +3410,10 @@
                                        bool LegalOperations, bool ForCodeSize,
                                        unsigned Depth = 0) const;
 
+  /// Return true if splitting a vector store into scalar stores is profitable.
+  virtual bool isCheapToSplitStore(StoreSDNode *N, unsigned NumSplit,
+                                   SelectionDAG &DAG) const;
+
   //===--------------------------------------------------------------------===//
   // Lowering methods - These methods must be implemented by targets so that
   // the SelectionDAGBuilder code knows how to lower these.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -547,6 +547,7 @@
     SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
     SDValue MatchLoadCombine(SDNode *N);
     SDValue MatchStoreCombine(StoreSDNode *N);
+    SDValue MatchVectorStoreSplit(StoreSDNode *N);
     SDValue ReduceLoadWidth(SDNode *N);
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
     SDValue splitMergedValStore(StoreSDNode *ST);
@@ -6612,6 +6613,211 @@
   return NewStore;
 }
 
+/// getVectorUpdatePath - Detect a load-insert/shuffle-store chain. Returns
+/// the head LOAD if such a pattern is found; otherwise returns a null
+/// SDValue. Path records each node on the path together with the operand
+/// index to visit next.
+static SDValue
+getVectorUpdatePath(SDValue Current,
+                    SmallVectorImpl<std::pair<SDValue, unsigned>> &Path) {
+  // A value that is directly a LOAD has no updates to split off.
+  if (Current.getOpcode() == ISD::LOAD)
+    return SDValue();
+
+  const unsigned MaxDepth = 12;
+  unsigned Depth = 0;
+
+  // Use DFS to find the desired LOAD. This buffer keeps the nodes we have
+  // not visited yet; if the current path fails, backtrack and pick another.
+  SmallVector<SDValue, 8> SearchBuffer;
+
+  while (Current) {
+    if (Depth >= MaxDepth)
+      return SDValue();
+
+    if (Current.getOpcode() == ISD::LOAD)
+      return Current;
+    else if (Current.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+      // INSERT_VECTOR_ELT nodes have only one vector operand to follow.
+      ++Depth;
+      Path.push_back(std::make_pair(Current, 0));
+      Current = Current.getOperand(0);
+    } else if (Current.getOpcode() == ISD::VECTOR_SHUFFLE) {
+      // For VECTOR_SHUFFLE, follow the first non-constant operand and push
+      // the other one into the search buffer.
+      ++Depth;
+      SDValue Op1 = Current.getOperand(0);
+      SDValue Op2 = Current.getOperand(1);
+
+      // Since we want a LOAD at the root of the path, a constant vector
+      // (BUILD_VECTOR) operand of the shuffle cannot be what we want.
+      if (Op1.getOpcode() == ISD::BUILD_VECTOR) {
+        Path.push_back(std::make_pair(Current, 1));
+        Current = Op2;
+      } else {
+        Path.push_back(std::make_pair(Current, 0));
+        Current = Op1;
+        if (Op2.getOpcode() != ISD::BUILD_VECTOR)
+          SearchBuffer.push_back(Op2);
+      }
+    } else {
+      if (SearchBuffer.empty())
+        return SDValue();
+
+      // Backtrack along the path until the node on top of the search buffer
+      // is the unvisited operand of a shuffle on the path, then resume there.
+      while (!Path.empty()) {
+        std::pair<SDValue, unsigned> Top = Path.pop_back_val();
+        if (Top.first.getOpcode() != ISD::VECTOR_SHUFFLE)
+          continue;
+        unsigned NewPath = !Top.second;
+        if (Top.first.getOperand(NewPath) == SearchBuffer.back()) {
+          SearchBuffer.pop_back();
+          Path.push_back(std::make_pair(Top.first, NewPath));
+          Current = Top.first.getOperand(NewPath);
+          break;
+        }
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+/// getVectorUpdates - Update a state table of the elements produced by a
+/// load-insert/shuffle-store chain. The table lets us simplify the vector
+/// store code.
+static bool
+getVectorUpdates(SmallVectorImpl<std::pair<SDValue, int>> &States,
+                 SmallVectorImpl<std::pair<SDValue, unsigned>> &Worklist,
+                 const SDValue &Source) {
+  bool Changed = false;
+  auto VecLen = Source.getValueType().getVectorNumElements();
+  SDValue LastItem;
+
+  // Each entry in States describes one element of the stored vector:
+  // - first is a null SDValue if the element is undef.
+  // - first is a scalar value if the element was inserted as a scalar.
+  // - first is a vector and second the lane index if the element is taken
+  //   from a vector.
+
+  while (!Worklist.empty()) {
+    auto Tail = Worklist.pop_back_val();
+
+    // INSERT_VECTOR_ELT changes only one element, so update just that entry.
+    if (Tail.first.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+      auto *IndexNode = dyn_cast<ConstantSDNode>(Tail.first.getOperand(2));
+      if (IndexNode == nullptr)
+        return false;
+      auto IndexValue = IndexNode->getZExtValue();
+      if (IndexValue >= VecLen)
+        return false;
+      States[IndexValue].first = Tail.first.getOperand(1);
+      Changed = true;
+    } else if (Tail.first.getOpcode() == ISD::VECTOR_SHUFFLE) {
+      // Elements may get exchanged, so take a copy of the table to detect
+      // whether anything actually changed.
+      const ShuffleVectorSDNode *SVN =
+          cast<ShuffleVectorSDNode>(Tail.first.getNode());
+      SmallVector<std::pair<SDValue, int>, 16> OriginalStates(States.begin(),
+                                                              States.end());
+      // We don't support shuffles that change the vector length.
+      auto MaskSize = SVN->getMask().size();
+      if (MaskSize != VecLen)
+        return false;
+
+      for (unsigned i = 0; i < MaskSize; i++) {
+        int Mask = SVN->getMaskElt(i);
+        if (Mask < 0)
+          States[i].first = SDValue();
+        else {
+          SDValue ChosenVec = SVN->getOperand(Mask >= (int)VecLen);
+          States[i].second = (Mask < (int)VecLen) ? Mask : Mask - VecLen;
+
+          // If this lane is taken from the vector we just processed, keep
+          // the existing entry; otherwise record the new source vector.
+          if (ChosenVec != LastItem)
+            States[i].first = ChosenVec;
+        }
+
+        if (OriginalStates[i] != States[i])
+          Changed = true;
+      }
+    }
+
+    LastItem = Tail.first;
+  }
+
+  return Changed;
+}
+
+/// Match a pattern where only a few elements of a vector are updated but the
+/// whole vector is stored. Recognize the chain that updates the vector and
+/// split the store into individual element stores.
+SDValue DAGCombiner::MatchVectorStoreSplit(StoreSDNode *N) {
+  SmallVector<std::pair<SDValue, unsigned>, 16> Buf;
+  SmallVector<std::pair<SDValue, int>, 16> States;
+
+  EVT VecType = N->getValue().getValueType();
+  EVT EleVT = VecType.getScalarType();
+
+  // Scalable vectors are not supported yet.
+  if (!N->isSimple() || VecType.isScalableVector() || !VecType.isVector())
+    return SDValue();
+
+  if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), EleVT) ||
+      !hasOperation(N->getOpcode(), EleVT))
+    return SDValue();
+
+  SDValue SourceVecLoad = getVectorUpdatePath(N->getValue(), Buf);
+  LoadSDNode *LoadNode = dyn_cast_or_null<LoadSDNode>(SourceVecLoad.getNode());
+  if (!LoadNode || !LoadNode->isSimple())
+    return SDValue();
+
+  // Prepare a table initialized with the source vector and its lane indices,
+  // so we can find which elements are updated along the chain.
+  auto VecLen = SourceVecLoad.getValueType().getVectorNumElements();
+  for (unsigned i = 0; i < VecLen; ++i)
+    States.push_back(std::make_pair(SourceVecLoad, i));
+
+  if (!getVectorUpdates(States, Buf, SourceVecLoad))
+    return SDValue();
+
+  // Walk the table to collect the indices of the updated elements.
+  SmallVector<unsigned, 16> UpdatedElementsIdx;
+  for (unsigned i = 0; i < States.size(); ++i) {
+    SDValue &State = States[i].first;
+    if (!State)
+      continue;
+
+    // Extracting from another non-constant vector just to insert the element
+    // again is not profitable, so give up in that case.
+    if (State.getValueType().isVector()) {
+      if (State.getOpcode() == ISD::BUILD_VECTOR) {
+        State = State.getOperand(States[i].second);
+        UpdatedElementsIdx.push_back(i);
+      } else if (State != SourceVecLoad || States[i].second != (int)i)
+        return SDValue();
+    } else
+      UpdatedElementsIdx.push_back(i);
+  }
+
+  if (TLI.isCheapToSplitStore(N, UpdatedElementsIdx.size(), DAG)) {
+    SDLoc Loc = SDLoc(N);
+    SDValue Index, Chain;
+
+    // Emit one scalar store per updated element, addressed relative to the
+    // vector's base pointer.
+    for (unsigned i = 0; i < UpdatedElementsIdx.size(); ++i) {
+      Index = TLI.getVectorElementPointer(
+          DAG, N->getBasePtr(), VecType,
+          DAG.getConstant(UpdatedElementsIdx[i], Loc, MVT::i32));
+      Chain = DAG.getStore(Chain ? Chain : N->getChain(), Loc,
+                           States[UpdatedElementsIdx[i]].first, Index,
+                           N->getPointerInfo());
+    }
+    return Chain;
+  }
+
+  return SDValue();
+}
+
 /// Match a pattern where a wide type scalar value is loaded by several narrow
 /// loads and combined by shifts and ors. Fold it into a single load or a load
 /// and a BSWAP if the targets supports it.
@@ -16192,6 +16398,9 @@
   if (SDValue Store = MatchStoreCombine(ST))
     return Store;
 
+  if (SDValue Split = MatchVectorStoreSplit(ST))
+    return Split;
+
   if (ST->isUnindexed()) {
     // Walk up chain skipping non-aliasing memory nodes, on this store and any
     // adjacent stores.
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5580,6 +5580,11 @@
   llvm_unreachable("Unknown code");
 }
 
+bool TargetLowering::isCheapToSplitStore(StoreSDNode *N, unsigned NumSplit,
+                                         SelectionDAG &DAG) const {
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Legalization Utilities
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -646,6 +646,9 @@
       return true;
     }
 
+    bool isCheapToSplitStore(StoreSDNode *N, unsigned NumSplit,
+                             SelectionDAG &DAG) const override;
+
     bool isCtlzFast() const override {
       return true;
     }
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1598,6 +1598,17 @@
   return true;
 }
 
+bool PPCTargetLowering::isCheapToSplitStore(StoreSDNode *N,
+                                            unsigned NumSplit,
+                                            SelectionDAG &DAG) const {
+  EVT VecType = N->getValue().getValueType();
+
+  if (!VecType.isVector() || NumSplit >= VecType.getVectorNumElements())
+    return false;
+
+  return true;
+}
+
 /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
 /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
 /// The ShuffleKind distinguishes between big-endian merges with two
diff --git a/llvm/test/CodeGen/PowerPC/swaps-le-5.ll b/llvm/test/CodeGen/PowerPC/swaps-le-5.ll
--- a/llvm/test/CodeGen/PowerPC/swaps-le-5.ll
+++ b/llvm/test/CodeGen/PowerPC/swaps-le-5.ll
@@ -10,7 +10,8 @@
 entry:
   %0 = load <2 x double>, <2 x double>* @x, align 16
   %vecins = insertelement <2 x double> %0, double %y, i32 0
-  store <2 x double> %vecins, <2 x double>* @z, align 16
+  ; Add volatile to keep the vector store from being split.
+  store volatile <2 x double> %vecins, <2 x double>* @z, align 16
   ret void
 }
 
@@ -25,7 +26,7 @@
 entry:
   %0 = load <2 x double>, <2 x double>* @x, align 16
   %vecins = insertelement <2 x double> %0, double %y, i32 1
-  store <2 x double> %vecins, <2 x double>* @z, align 16
+  store volatile <2 x double> %vecins, <2 x double>* @z, align 16
   ret void
 }
 
@@ -57,7 +58,7 @@
   %0 = load <2 x double>, <2 x double>* @z, align 16
   %1 = load <2 x double>, <2 x double>* @x, align 16
   %vecins = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 0, i32 2>
-  store <2 x double> %vecins, <2 x double>* @z, align 16
+  store volatile <2 x double> %vecins, <2 x double>* @z, align 16
   ret void
 }
 
diff --git a/llvm/test/CodeGen/PowerPC/swaps-le-6.ll b/llvm/test/CodeGen/PowerPC/swaps-le-6.ll
--- a/llvm/test/CodeGen/PowerPC/swaps-le-6.ll
+++ b/llvm/test/CodeGen/PowerPC/swaps-le-6.ll
@@ -60,7 +60,8 @@
   %0 = load <2 x double>, <2 x double>* @x, align 16
   %1 = load double, double* @y, align 8
   %vecins = insertelement <2 x double> %0, double %1, i32 0
-  store <2 x double> %vecins, <2 x double>* @z, align 16
+  ; Add volatile to keep the vector store from being split.
+  store volatile <2 x double> %vecins, <2 x double>* @z, align 16
   ret void
 }
 
@@ -105,7 +106,7 @@
   %0 = load <2 x double>, <2 x double>* @x, align 16
   %1 = load double, double* @y, align 8
   %vecins = insertelement <2 x double> %0, double %1, i32 1
-  store <2 x double> %vecins, <2 x double>* @z, align 16
+  store volatile <2 x double> %vecins, <2 x double>* @z, align 16
   ret void
 }
 
diff --git a/llvm/test/CodeGen/PowerPC/vector-store-split.ll b/llvm/test/CodeGen/PowerPC/vector-store-split.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-store-split.ll
@@ -0,0 +1,83 @@
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=LE,CHECK
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec < %s | FileCheck %s --check-prefixes=BE,CHECK
+
+; CHECK-LABEL: insert_store:
+; CHECK: stw 4, 12(3)
+; CHECK-NEXT: blr
+define void @insert_store(<4 x i32>* %q, i32 zeroext %s) {
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %q, align 16
+  %vecins = insertelement <4 x i32> %0, i32 %s, i32 3
+  store <4 x i32> %vecins, <4 x i32>* %q, align 16
+  ret void
+}
+
+; CHECK-LABEL: volatile_check:
+; LE: vperm
+; BE: vsldoi
+; CHECK: stvx
+; CHECK: blr
+define void @volatile_check(<4 x i32>* %q, <4 x i32>* %p, i32 zeroext %s) {
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %q, align 16
+  %vecins = insertelement <4 x i32> %0, i32 %s, i32 3
+  store volatile <4 x i32> %vecins, <4 x i32>* %q, align 16
+  %1 = load volatile <4 x i32>, <4 x i32>* %p, align 16
+  %vecins1 = insertelement <4 x i32> %1, i32 %s, i32 3
+  store <4 x i32> %vecins1, <4 x i32>* %p, align 16
+  ret void
+}
+
+; CHECK-LABEL: shuffle_store:
+; CHECK: li 4, 100
+; LE-NEXT: li 5, 300
+; LE-NEXT: stw 4, 12(3)
+; LE-NEXT: stw 5, 4(3)
+; BE-NEXT: stw 4, 12(3)
+; BE-NEXT: li 4, 300
+; BE-NEXT: stw 4, 4(3)
+; CHECK-NEXT: blr
+define void @shuffle_store(<4 x i32>* %q) {
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %q, align 16
+  %vecins3 = shufflevector <4 x i32> %0, <4 x i32> <i32 100, i32 200, i32 300, i32 400>, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+  store <4 x i32> %vecins3, <4 x i32>* %q, align 16
+  ret void
+}
+
+; CHECK-LABEL: chain_shuffle:
+; CHECK: lwz 4, 0(4)
+; CHECK-NEXT: li 5, 2
+; CHECK-NEXT: stw 5, 4(3)
+; CHECK-NEXT: stw 4, 8(3)
+; CHECK-NEXT: blr
+define void @chain_shuffle(<4 x i32>* %q, i32* %m, i32 zeroext %s) {
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %q, align 16
+  %1 = load i32, i32* %m, align 4
+  %vecins0 = insertelement <4 x i32> %0, i32 %s, i32 1
+  %vecins1 = shufflevector <4 x i32> %vecins0, <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  %vecins2 = insertelement <4 x i32> %vecins1, i32 %1, i32 2
+  store <4 x i32> %vecins2, <4 x i32>* %q, align 16
+  ret void
+}
+
+; CHECK-LABEL: exchange_elements:
+; LE: mtvsrd
+; LE: vperm
+; LE: xxswapd
+; LE: lvx
+; LE: vperm
+; LE: stvx
+; BE-COUNT-3: lvx
+; BE-COUNT-4: vsldoi
+; BE: stvx
+define void @exchange_elements(<4 x i32>* %a, <4 x i32>* %b) {
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %a, align 16
+  %1 = load <4 x i32>, <4 x i32>* %b, align 16
+  %vecins0 = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %vecins1 = insertelement <4 x i32> %vecins0, i32 5, i32 0
+  store <4 x i32> %vecins1, <4 x i32>* %a, align 16
+  ret void
+}
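
Note for reviewers: the default TargetLowering::isCheapToSplitStore added above returns false, so targets that do not override it keep their current behavior. Below is a minimal sketch of how another target could opt in; the "Foo" target name and the half-the-vector-length threshold are hypothetical and only illustrate the intended use of the hook, they are not part of this patch.

  // Hypothetical override: allow the combine only when the number of scalar
  // stores stays at or below half of the vector's element count.
  bool FooTargetLowering::isCheapToSplitStore(StoreSDNode *N, unsigned NumSplit,
                                              SelectionDAG &DAG) const {
    EVT VecType = N->getValue().getValueType();
    if (!VecType.isVector() || NumSplit == 0)
      return false;
    return NumSplit <= VecType.getVectorNumElements() / 2;
  }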