Index: llvm/lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -620,8 +620,8 @@
   SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
-                                 unsigned UnpackHigh) const;
+  SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
   bool canTreatAsByteVector(EVT VT) const;
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -4448,6 +4448,92 @@
   return Op;
 }
 
+static bool isZeroOrUndefVector(SDValue N) {
+  if (N->getOpcode() == ISD::SPLAT_VECTOR)
+    if (auto *Op = dyn_cast<ConstantSDNode>(N->getOperand(0)))
+      return Op->getZExtValue() == 0;
+  return N.isUndef() || ISD::isBuildVectorAllZeros(N.getNode());
+}
+
+// Keeps track of the bytes that would result after applying one or several
+// unpacks.
+struct UnpackInfo {
+  enum { ZEXT_VAL = -2 };
+  SmallVector<int, SystemZ::VectorBytes> UnpackedBytes;
+  SmallVector<unsigned, 3> AppliedUnpacks;
+  SmallVector<int, SystemZ::VectorBytes> Mask;
+  bool NeedsPermute;
+  SDValue SourceOp;
+
+  UnpackInfo() : NeedsPermute(false) {
+    for (unsigned El = 0; El < SystemZ::VectorBytes; El++)
+      UnpackedBytes.push_back(El);
+  }
+
+  // Apply one unpack on UnpackedBytes and insert ZEXT_VAL on the bytes that
+  // become zero.
+  void applyUnpack(unsigned FromSize) {
+    SmallVector<int, SystemZ::VectorBytes> Tmp;
+    unsigned El = 0;
+    while (El < SystemZ::VectorBytes / 2) {
+      for (unsigned i = 0; i < FromSize; i++)
+        Tmp.push_back(ZEXT_VAL);
+      for (unsigned i = 0; i < FromSize; i++)
+        Tmp.push_back(UnpackedBytes[El++]);
+    }
+    UnpackedBytes = Tmp;
+    AppliedUnpacks.push_back(FromSize);
+  }
+
+  // Return true if the applied unpacks can be used to produce the result
+  // required by Bytes of Ops[OpIdx]. If the used source operand needs
+  // rearrangement before unpacking, NeedsPermute is set and Mask holds the
+  // required order. ZeroOpIdx is the index of the known zero vector in Ops.
+  bool tryApply(const SmallVectorImpl<int> &Bytes, SDValue *Ops,
+                unsigned ZeroOpIdx) {
+    unsigned OpIdx = ZeroOpIdx == 0 ? 1 : 0;
+    Mask.assign(SystemZ::VectorBytes, -1);
+    NeedsPermute = false;
+    for (unsigned i = 0; i < SystemZ::VectorBytes; ++i) {
+      if (Bytes[i] == -1)
+        continue;
+      unsigned OpNo = Bytes[i] / SystemZ::VectorBytes;
+      unsigned Byte = Bytes[i] % SystemZ::VectorBytes;
+      if (OpNo == ZeroOpIdx && UnpackedBytes[i] == ZEXT_VAL)
+        continue;
+      if (OpNo != OpIdx || UnpackedBytes[i] == ZEXT_VAL)
+        return false;
+      Mask[UnpackedBytes[i]] = Byte;
+    }
+    for (unsigned i = 0; i < SystemZ::VectorBytes; ++i)
+      if (Mask[i] != -1 && Mask[i] != int(i)) {
+        NeedsPermute = true;
+        break;
+      }
+    SourceOp = Ops[OpIdx];
+    return true;
+  }
+
+  uint32_t getCost() {
+    if (AppliedUnpacks.empty() && !NeedsPermute)
+      return UINT32_MAX;
+    return AppliedUnpacks.size() + (NeedsPermute ? 4 : 0);
+  }
+};
+
+#ifndef NDEBUG
+static void dumpBytes(const SmallVectorImpl<int> &Bytes, std::string Msg) {
+  dbgs() << Msg.c_str() << " { ";
+  for (unsigned i = 0; i < Bytes.size(); i++) {
+    if (Bytes[i] == UnpackInfo::ZEXT_VAL)
+      dbgs() << "Z";
+    else
+      dbgs() << Bytes[i];
+    dbgs() << ", ";
+  }
+  dbgs() << "}\n";
+}
+#endif
+
 // Bytes is a VPERM-like permute vector, except that -1 is used for
 // undefined bytes.  Implement it on operands Ops[0] and Ops[1] using
 // VSLDB or VPERM.
@@ -4475,6 +4561,104 @@
   return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1],
                      Op2);
 }
 
+// EXPERIMENTAL: This can be used to tune the threshold for when unpacking
+// is used instead of a vperm. It is compared to the total number of
+// instructions needed, which may also include shuffling of the source
+// operand before unpacking.
+static cl::opt<unsigned> MAXUNPACKOPS("max-unpackops", cl::init(4));
+
+// Detect cases where one of the source operands can be used while the other
+// bytes are zero or undefined.
+static SDValue tryShuffleWithZeroVec(SelectionDAG &DAG, const SDLoc &DL,
+                                     SDValue *Ops,
+                                     const SmallVectorImpl<int> &Bytes) {
+  // Don't use unpacks if a single VSLDB could be used.
+  unsigned StartIndex, OpNo0, OpNo1;
+  if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1))
+    return SDValue();
+
+  // Try unpack(s).
+  unsigned ZeroOpIdx = (isZeroOrUndefVector(Ops[0]) ? 0 :
+                        (isZeroOrUndefVector(Ops[1]) ? 1 : UINT32_MAX));
+  if (ZeroOpIdx == UINT32_MAX)
+    return SDValue();
+  UnpackInfo Best;
+  for (unsigned Start = 1; Start <= 4; Start *= 2) {
+    UnpackInfo UPI;
+    for (unsigned From = Start; From <= 4; From *= 2) {
+      UPI.applyUnpack(From);
+      if (UPI.tryApply(Bytes, Ops, ZeroOpIdx) && Best.getCost() > UPI.getCost())
+        Best = UPI;
+    }
+  }
+  if (Best.getCost() == UINT32_MAX ||
+      Best.AppliedUnpacks.size() > MAXUNPACKOPS)
+    return SDValue();
+
+  LLVM_DEBUG(dbgs() << "\nUnpacking:\n";
+             Ops[0].dump();
+             Ops[1].dump();
+             dumpBytes(Bytes, "Given this 'Bytes' mask:"););
+  LLVM_DEBUG(dbgs() << Best.AppliedUnpacks.size()
+                    << " unpack(s) needed of t"
+                    << Best.SourceOp.getNode()->PersistentId
+                    << " to produce ";
+             dumpBytes(Best.UnpackedBytes, ""););
+  if (Best.NeedsPermute) {
+    LLVM_DEBUG(dumpBytes(Best.Mask, "Shuffling before unpacking with mask:"););
+    unsigned OpNo0, OpNo1;
+    if (Best.SourceOp->use_empty() &&
+        Best.SourceOp->getOpcode() == SystemZISD::PERMUTE) {
+      // If the source is a PERMUTE, modify it instead of emitting another one.
+      SDValue OrigMask = Best.SourceOp->getOperand(2);
+      SDValue IndexNodes[SystemZ::VectorBytes];
+      for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+        int MaskEl = Best.Mask[I];
+        if (MaskEl < 0 || OrigMask.getOperand(MaskEl).isUndef())
+          IndexNodes[I] = DAG.getUNDEF(MVT::i32);
+        else {
+          auto *COp = cast<ConstantSDNode>(OrigMask.getOperand(MaskEl));
+          IndexNodes[I] = DAG.getConstant(COp->getZExtValue(), DL, MVT::i32);
+        }
+      }
+      SDValue Op2 = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
+      Best.SourceOp = DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8,
+                                  Best.SourceOp.getOperand(0),
+                                  Best.SourceOp.getOperand(1), Op2);
+    } else if (const Permute *P = matchPermute(Best.Mask, OpNo0, OpNo1)) {
+      if (Best.AppliedUnpacks.size() + 1 > MAXUNPACKOPS)
+        return SDValue();
+      Best.SourceOp = getPermuteNode(DAG, DL, *P, Best.SourceOp,
+                                     Best.SourceOp);
+    } else {
+      if (isShlDoublePermute(Best.Mask, StartIndex, OpNo0, OpNo1)) {
+        if (Best.AppliedUnpacks.size() + 1 > MAXUNPACKOPS)
+          return SDValue();
+      } else {
+        // In order to unpack, a permute of the source is needed. If it's not
+        // a VSLDB then only do this in the cases where a single unpack is
+        // needed after the VPERM.
+        if (Best.AppliedUnpacks.size() > 1)
+          return SDValue();
+      }
+      SDValue SubOps[] = { Best.SourceOp, Best.SourceOp };
+      Best.SourceOp = getGeneralPermuteNode(DAG, DL, SubOps, Best.Mask);
+    }
+  }
+
+  // Build the unpacks.
+  unsigned FromBits = Best.AppliedUnpacks.front() * 8;
+  EVT FromVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
+                                SystemZ::VectorBits / FromBits);
+  SDValue PackedOp = DAG.getNode(ISD::BITCAST, DL, FromVT, Best.SourceOp);
+  for (unsigned FromSize : Best.AppliedUnpacks) {
+    unsigned ToBits = FromSize * 8 * 2;
+    EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ToBits),
+                                 SystemZ::VectorBits / ToBits);
+    PackedOp = DAG.getNode(SystemZISD::UNPACKL_HIGH, DL, OutVT, PackedOp);
+  }
+  return PackedOp;
+}
+
 namespace {
 // Describes a general N-operand vector shuffle.
 struct GeneralShuffle {
@@ -4579,6 +4763,39 @@
   if (Ops.size() == 1)
     Ops.push_back(DAG.getUNDEF(MVT::v16i8));
 
+  // Put a zero vector last to attempt unpacking as the last operation if
+  // there are more than two operands.
+  uint32_t ZeroVecOpNo = UINT32_MAX;
+  if (Ops.size() > 2) {
+    // Find the zero vector.
+    for (unsigned I = 0; I < Ops.size(); I++)
+      if (ISD::isBuildVectorAllZeros(Ops[I].getNode()) ||
+          (Ops[I]->getOpcode() == ISD::SPLAT_VECTOR &&
+           isa<ConstantSDNode>(Ops[I]->getOperand(0)) &&
+           cast<ConstantSDNode>(Ops[I]->getOperand(0))->getZExtValue() == 0)) {
+        ZeroVecOpNo = I;
+        break;
+      }
+    // Move it to the last position without rearranging the others.
+    unsigned LastOpNo = Ops.size() - 1;
+    if (ZeroVecOpNo != UINT32_MAX && ZeroVecOpNo != LastOpNo) {
+      SDValue ZeroOp = Ops[ZeroVecOpNo];
+      for (unsigned I = ZeroVecOpNo; I < Ops.size() - 1; I++)
+        Ops[I] = Ops[I + 1];
+      Ops[LastOpNo] = ZeroOp;
+      for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+        if (Bytes[I] >= 0) {
+          unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
+          if (OpNo > ZeroVecOpNo)
+            Bytes[I] -= SystemZ::VectorBytes;
+          else if (OpNo == ZeroVecOpNo)
+            Bytes[I] += SystemZ::VectorBytes * (LastOpNo - ZeroVecOpNo);
+        }
+      }
+      ZeroVecOpNo = LastOpNo;
+    }
+  }
+
   // Create a tree of shuffles, deferring root node until after the loop.
   // Try to redistribute the undefined elements of non-root nodes so that
   // the non-root shuffles match something like a pack or merge, then adjust
@@ -4589,8 +4806,14 @@
   // In the best case this redistribution will lead to the whole tree
   // using packs and merges.  It should rarely be a loss in other cases.
   unsigned Stride = 1;
-  for (; Stride * 2 < Ops.size(); Stride *= 2) {
+  // The zero vector needs to be handled last. Do this by skipping it below
+  // while doing one more iteration over the rest, leaving only one other
+  // result.
+  unsigned ItersFactor = (ZeroVecOpNo == UINT32_MAX ? 2 : 1);
+  for (; Stride * ItersFactor < Ops.size(); Stride *= 2) {
     for (unsigned I = 0; I < Ops.size() - Stride; I += Stride * 2) {
+      if (I + Stride == ZeroVecOpNo)
+        continue;
       SDValue SubOps[] = { Ops[I], Ops[I + Stride] };
 
       // Create a mask for just these two operands.
@@ -4629,21 +4852,25 @@
   }
 
   // Now we just have 2 inputs.  Put the second operand in Ops[1].
-  if (Stride > 1) {
-    Ops[1] = Ops[Stride];
+  unsigned RHSOpNo = ZeroVecOpNo == UINT32_MAX ? Stride : ZeroVecOpNo;
+  if (RHSOpNo > 1) {
+    Ops[1] = Ops[RHSOpNo];
     for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
       if (Bytes[I] >= int(SystemZ::VectorBytes))
-        Bytes[I] -= (Stride - 1) * SystemZ::VectorBytes;
+        Bytes[I] -= (RHSOpNo - 1) * SystemZ::VectorBytes;
   }
-
   // Look for an instruction that can do the permute without resorting
   // to VPERM.
   unsigned OpNo0, OpNo1;
   SDValue Op;
   if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
     Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
-  else
-    Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
+  else {
+    Op = tryShuffleWithZeroVec(DAG, DL, &Ops[0], Bytes);
+    if (!Op.getNode())
+      Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
+  }
+
   return DAG.getNode(ISD::BITCAST, DL, VT, Op);
 }
 
@@ -5041,9 +5268,8 @@
   return DAG.getNode(ISD::BITCAST, DL, VT, Res);
 }
 
-SDValue
-SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
-                                              unsigned UnpackHigh) const {
+SDValue SystemZTargetLowering::
+lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
   SDValue PackedOp = Op.getOperand(0);
   EVT OutVT = Op.getValueType();
   EVT InVT = PackedOp.getValueType();
@@ -5053,11 +5279,39 @@
     FromBits *= 2;
     EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
                                  SystemZ::VectorBits / FromBits);
-    PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp);
+    PackedOp =
+      DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(PackedOp), OutVT, PackedOp);
   } while (FromBits != ToBits);
   return PackedOp;
 }
 
+// Lower a ZERO_EXTEND_VECTOR_INREG to a vector shuffle with a zero vector.
+SDValue SystemZTargetLowering::
+lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
+  SDValue PackedOp = Op.getOperand(0);
+  SDLoc DL(Op);
+  EVT OutVT = Op.getValueType();
+  EVT InVT = PackedOp.getValueType();
+  unsigned InNumElts = InVT.getVectorNumElements();
+  unsigned OutNumElts = OutVT.getVectorNumElements();
+  unsigned NumInPerOut = InNumElts / OutNumElts;
+
+  SDValue ZeroVec =
+    DAG.getSplatVector(InVT, DL,
+                       DAG.getConstant(0, DL, InVT.getScalarType()));
+
+  SmallVector<int, 16> Mask(InNumElts);
+  unsigned ZeroVecElt = InNumElts;
+  for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) {
+    unsigned MaskElt = PackedElt * NumInPerOut;
+    unsigned End = MaskElt + NumInPerOut - 1;
+    for (; MaskElt < End; MaskElt++)
+      Mask[MaskElt] = ZeroVecElt++;
+    Mask[MaskElt] = PackedElt;
+  }
+  SDValue Shuf = DAG.getVectorShuffle(InVT, DL, PackedOp, ZeroVec, Mask);
+  return DAG.getNode(ISD::BITCAST, DL, OutVT, Shuf);
+}
+
 SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
                                           unsigned ByScalar) const {
   // Look for cases where a vector shift can use the *_BY_SCALAR form.
@@ -5223,9 +5477,9 @@
   case ISD::EXTRACT_VECTOR_ELT:
     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::SIGN_EXTEND_VECTOR_INREG:
-    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
+    return lowerSIGN_EXTEND_VECTOR_INREG(Op, DAG);
   case ISD::ZERO_EXTEND_VECTOR_INREG:
-    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
+    return lowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
   case ISD::SHL:
     return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
   case ISD::SRL:
Index: llvm/test/CodeGen/SystemZ/vec-move-24.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/vec-move-24.ll
@@ -0,0 +1,128 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+;
+; Test that vperm is not used when it is not necessary.
+
+; Zero extension of a vector compare.
+define <4 x i32> @fun0(<4 x i8> %arg0, <4 x i8> %arg1) nounwind {
+; CHECK-LABEL: fun0:
+; CHECK-NOT: vperm
+  %tmp = icmp uge <4 x i8> %arg0, %arg1
+  %tmp1 = zext <4 x i1> %tmp to <4 x i32>
+  ret <4 x i32> %tmp1
+}
+
+; Don't do unpacks + vperm if vsldb can be used.
+define <4 x i16> @fun1() {
+; CHECK-LABEL: fun1:
+; CHECK: vlrepg
+; CHECK-NEXT: vsldb
+; CHECK-NEXT: veslh
+; CHECK-NEXT: br
+  %tmp = load <8 x i8>, <8 x i8>* undef
+  %tmp3 = shufflevector <8 x i8> %tmp, <8 x i8> undef, <4 x i32>
+  %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
+  %tmp5 = shl nuw <4 x i16> %tmp4,
+  ret <4 x i16> %tmp5
+}
+
+; Some cases of zero extensions by shuffling with a zero vector.
+define <16 x i8> @fun2(<16 x i8>* %Src) nounwind {
+; CHECK-LABEL: fun2:
+; CHECK-NOT: vperm
+  %tmp = load <16 x i8>, <16 x i8>* %Src
+  %tmp2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %tmp, <16 x i32>
+  ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @fun3(<8 x i16>* %Src) nounwind {
+; CHECK-LABEL: fun3:
+; CHECK-NOT: vperm
+  %tmp = load <8 x i16>, <8 x i16>* %Src
+  %tmp2 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp, <8 x i32>
+  ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @fun4(<4 x i32>* %Src) nounwind {
+; CHECK-LABEL: fun4:
+; CHECK-NOT: vperm
+  %tmp = load <4 x i32>, <4 x i32>* %Src
+  %tmp2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp, <4 x i32>
+  ret <4 x i32> %tmp2
+}
+
+; This does not require a vperm, but only a single unpack.
+define void @fun5(i8 %Src, <32 x i8>* %Dst) nounwind {
+; CHECK-LABEL: fun5:
+; CHECK-NOT: vperm
+  %I0 = insertelement <16 x i8> undef, i8 %Src, i32 0
+  %I1 = insertelement <16 x i8> %I0, i8 %Src, i32 1
+  %I2 = insertelement <16 x i8> %I1, i8 %Src, i32 2
+  %I3 = insertelement <16 x i8> %I2, i8 %Src, i32 3
+  %I4 = insertelement <16 x i8> %I3, i8 %Src, i32 4
+  %I5 = insertelement <16 x i8> %I4, i8 %Src, i32 5
+  %I6 = insertelement <16 x i8> %I5, i8 %Src, i32 6
+  %I7 = insertelement <16 x i8> %I6, i8 %Src, i32 7
+  %I8 = insertelement <16 x i8> %I7, i8 %Src, i32 8
+  %I9 = insertelement <16 x i8> %I8, i8 %Src, i32 9
+  %I10 = insertelement <16 x i8> %I9, i8 %Src, i32 10
+  %I11 = insertelement <16 x i8> %I10, i8 %Src, i32 11
+  %I12 = insertelement <16 x i8> %I11, i8 %Src, i32 12
+  %I13 = insertelement <16 x i8> %I12, i8 %Src, i32 13
+  %I14 = insertelement <16 x i8> %I13, i8 %Src, i32 14
+  %I15 = insertelement <16 x i8> %I14, i8 %Src, i32 15
+
+  %tmp = shufflevector <16 x i8> zeroinitializer,
+                       <16 x i8> %I15,
+                       <32 x i32>
+  %tmp9 = shufflevector <32 x i8> undef,
+                        <32 x i8> %tmp,
+                        <32 x i32>
+
+  store <32 x i8> %tmp9, <32 x i8>* %Dst
+  ret void
+}
+
+; Unpacking after a vsldb, and not a vperm.
+define void @fun6() {
+; CHECK-LABEL: fun6:
+; CHECK-NOT: vperm
+  %tmp = load <16 x i8>, <16 x i8>* undef, align 1
+  %tmp2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %tmp, <16 x i32>
+  store <16 x i8> %tmp2, <16 x i8>* undef
+  ret void
+}
+
+; This should only require one vperm.
+define void @fun7(<4 x i16>* %Src, <4 x i16>* %Src2, <4 x i32>* %Dst) {
+; CHECK-LABEL: fun7:
+; CHECK: vperm
+; CHECK-NOT: vperm
+  %l = load <4 x i16>, <4 x i16>* %Src
+  %l2 = load <4 x i16>, <4 x i16>* %Src2
+  %sh = shufflevector <4 x i16> %l, <4 x i16> %l2, <4 x i32>
+  %z = zext <4 x i16> %sh to <4 x i32>
+  store <4 x i32> %z, <4 x i32>* %Dst
+  ret void
+}
+
+; Should only require two vperms.
+define void @fun8() {
+; CHECK-LABEL: fun8:
+; CHECK: vperm
+; CHECK: vperm
+; CHECK-NOT: vperm
+  %tmp = load <48 x i8>, <48 x i8>* null, align 1
+  %tmp6 = shufflevector <48 x i8> %tmp, <48 x i8> undef, <8 x i32>
+  %tmp7 = zext <8 x i8> %tmp6 to <8 x i16>
+  %tmp8 = shufflevector <8 x i16> %tmp7, <8 x i16> undef, <16 x i32>
+  %tmp9 = shufflevector <16 x i16> %tmp8, <16 x i16> undef, <32 x i32>
+  %tmp10 = shufflevector <32 x i16> %tmp9, <32 x i16> undef, <40 x i32>
+  store <40 x i16> %tmp10, <40 x i16>* undef, align 2
+  ret void
+}
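
For reference, the byte bookkeeping performed by UnpackInfo::applyUnpack in the patch can be reproduced with a small standalone program. This is an explanatory sketch only, not part of the patch: VectorBytes stands in for SystemZ::VectorBytes (16 bytes per vector register) and Z for UnpackInfo::ZEXT_VAL, a byte known to be zero. It shows how two successive unpack-logical-high steps (byte-sized, then halfword-sized elements) turn the identity byte order into the pattern of an i8-to-i32 zero extension of the four high source bytes, which is the kind of pattern tryApply matches against the 'Bytes' shuffle mask.

// Standalone sketch (not part of the patch) of UnpackInfo::applyUnpack.
// Assumed names: VectorBytes mirrors SystemZ::VectorBytes; Z mirrors
// UnpackInfo::ZEXT_VAL.
#include <iostream>
#include <vector>

static const unsigned VectorBytes = 16; // Bytes per vector register.
static const int Z = -2;                // A byte known to be zero.

// Model one "unpack logical high": each FromSize-byte element of the high
// half of the input is widened by prepending FromSize zero bytes.
static std::vector<int> applyUnpack(const std::vector<int> &In,
                                    unsigned FromSize) {
  std::vector<int> Out;
  unsigned El = 0;
  while (El < VectorBytes / 2) {
    for (unsigned i = 0; i < FromSize; i++)
      Out.push_back(Z);
    for (unsigned i = 0; i < FromSize; i++)
      Out.push_back(In[El++]);
  }
  return Out;
}

static void dump(const std::vector<int> &Bytes) {
  std::cout << "{ ";
  for (int B : Bytes) {
    if (B == Z)
      std::cout << "Z ";
    else
      std::cout << B << " ";
  }
  std::cout << "}\n";
}

int main() {
  // Start from the identity byte order 0..15, as the UnpackInfo
  // constructor does.
  std::vector<int> Bytes;
  for (unsigned i = 0; i < VectorBytes; i++)
    Bytes.push_back(i);

  // Byte unpack: prints { Z 0 Z 1 Z 2 Z 3 Z 4 Z 5 Z 6 Z 7 }
  Bytes = applyUnpack(Bytes, 1);
  dump(Bytes);

  // Halfword unpack on top of it:
  // prints { Z Z Z 0 Z Z Z 1 Z Z Z 2 Z Z Z 3 }, i.e. the byte layout of an
  // i8->i32 zero extension of the four high source bytes (big-endian).
  Bytes = applyUnpack(Bytes, 2);
  dump(Bytes);
  return 0;
}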