diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -10040,56 +10040,87 @@
   // perfect shuffle table to emit an optimal matching sequence.
   ArrayRef<int> PermMask = SVOp->getMask();
 
-  unsigned PFIndexes[4];
-  bool isFourElementShuffle = true;
-  for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
-    unsigned EltNo = 8;   // Start out undef.
-    for (unsigned j = 0; j != 4; ++j) {  // Intra-element byte.
-      if (PermMask[i*4+j] < 0)
-        continue;   // Undef, ignore it.
-
-      unsigned ByteSource = PermMask[i*4+j];
-      if ((ByteSource & 3) != j) {
-        isFourElementShuffle = false;
-        break;
+  // Record all shuffle masks in the DAG. The first integer in value part is
+  // occurance of the mask, and the second means whether it occured multiple
+  // times, because the integer will decrement and all entries should be removed
+  // after visiting current DAG.
+  static DenseMap<ArrayRef<int>, int> MaskMap;
+  static const SDNode *CurDAGRoot = DAG.getRoot().getNode();
+
+  if (CurDAGRoot != DAG.getRoot().getNode()) {
+    CurDAGRoot = DAG.getRoot().getNode();
+    MaskMap.clear();
+  }
+
+  if (MaskMap.find(PermMask) == MaskMap.end()) {
+    for (const SDNode &Node : DAG.allnodes()) {
+      if (const auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(&Node)) {
+        ArrayRef<int> TheMask = Shuffle->getMask();
+        if (MaskMap.find(TheMask) == MaskMap.end())
+          MaskMap.insert(std::make_pair(TheMask, 0));
+        MaskMap[TheMask] += 1;
       }
+    }
+  }
 
-      if (EltNo == 8) {
-        EltNo = ByteSource/4;
-      } else if (EltNo != ByteSource/4) {
-        isFourElementShuffle = false;
-        break;
+  // When there're multiple shuffles sharing the same mask, we think it's
+  // cheaper to use load+vperm.
+  if (MaskMap[PermMask] == 1) {
+    unsigned PFIndexes[4];
+    bool isFourElementShuffle = true;
+    for (unsigned i = 0; i != 4 && isFourElementShuffle;
+         ++i) {                           // Element number
+      unsigned EltNo = 8;                 // Start out undef.
+      for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.
+        if (PermMask[i * 4 + j] < 0)
+          continue; // Undef, ignore it.
+
+        unsigned ByteSource = PermMask[i * 4 + j];
+        if ((ByteSource & 3) != j) {
+          isFourElementShuffle = false;
+          break;
+        }
+
+        if (EltNo == 8) {
+          EltNo = ByteSource / 4;
+        } else if (EltNo != ByteSource / 4) {
+          isFourElementShuffle = false;
+          break;
+        }
       }
+      PFIndexes[i] = EltNo;
+    }
+
+    // Remove entry of this mask from map since it's unique in current DAG.
+    MaskMap.erase(PermMask);
+
+    // If this shuffle can be expressed as a shuffle of 4-byte elements, use the
+    // perfect shuffle vector to determine if it is cost effective to do this as
+    // discrete instructions, or whether we should use a vperm.
+    // For now, we skip this for little endian until such time as we have a
+    // little-endian perfect shuffle table.
+    if (isFourElementShuffle && !isLittleEndian) {
+      // Compute the index in the perfect shuffle table.
+      unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
+                              PFIndexes[2] * 9 + PFIndexes[3];
+
+      unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+      unsigned Cost = (PFEntry >> 30);
+
+      // Determining when to avoid vperm is tricky.  Many things affect the cost
+      // of vperm, particularly how many times the perm mask needs to be
+      // computed. For example, if the perm mask can be hoisted out of a loop or
+      // is already used (perhaps because there are multiple permutes with the
+      // same shuffle mask?) the vperm has a cost of 1.  OTOH, hoisting the
+      // permute mask out of the loop requires an extra register.
+      //
+      // As a compromise, we only emit discrete instructions if the shuffle can
+      // be generated in 3 or fewer operations.  When we have loop information
+      // available, if this block is within a loop, we should avoid using vperm
+      // for 3-operation perms and use a constant pool load instead.
+      if (Cost < 3)
+        return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
     }
-    PFIndexes[i] = EltNo;
-  }
-
-  // If this shuffle can be expressed as a shuffle of 4-byte elements, use the
-  // perfect shuffle vector to determine if it is cost effective to do this as
-  // discrete instructions, or whether we should use a vperm.
-  // For now, we skip this for little endian until such time as we have a
-  // little-endian perfect shuffle table.
-  if (isFourElementShuffle && !isLittleEndian) {
-    // Compute the index in the perfect shuffle table.
-    unsigned PFTableIndex =
-      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
-
-    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
-    unsigned Cost  = (PFEntry >> 30);
-
-    // Determining when to avoid vperm is tricky.  Many things affect the cost
-    // of vperm, particularly how many times the perm mask needs to be computed.
-    // For example, if the perm mask can be hoisted out of a loop or is already
-    // used (perhaps because there are multiple permutes with the same shuffle
-    // mask?) the vperm has a cost of 1.  OTOH, hoisting the permute mask out of
-    // the loop requires an extra register.
-    //
-    // As a compromise, we only emit discrete instructions if the shuffle can be
-    // generated in 3 or fewer operations.  When we have loop information
-    // available, if this block is within a loop, we should avoid using vperm
-    // for 3-operation perms and use a constant pool load instead.
-    if (Cost < 3)
-      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
   }
 
   // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
diff --git a/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll b/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll
--- a/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll
+++ b/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll
@@ -38,12 +38,11 @@
 define <4 x float> @shuffle3(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, <16 x i8> %v4) {
 ; BE-LABEL: shuffle3:
 ; BE:       # %bb.0:
-; BE-NEXT:    vmrglw 0, 2, 3
-; BE-NEXT:    vmrghw 2, 2, 3
-; BE-NEXT:    vmrglw 3, 4, 5
-; BE-NEXT:    vmrghw 4, 4, 5
-; BE-NEXT:    vmrghw 2, 2, 0
-; BE-NEXT:    vmrghw 3, 4, 3
+; BE-NEXT:    addis 3, 2, .LCPI2_0@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI2_0@toc@l
+; BE-NEXT:    lxv 32, 0(3)
+; BE-NEXT:    vperm 2, 2, 3, 0
+; BE-NEXT:    vperm 3, 4, 5, 0
 ; BE-NEXT:    xvaddsp 34, 34, 35
 ; BE-NEXT:    blr
 ;