Index: llvm/lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -620,8 +620,8 @@
   SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
-                                 unsigned UnpackHigh) const;
+  SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
   bool canTreatAsByteVector(EVT VT) const;
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -4448,6 +4448,85 @@
   return Op;
 }
 
+static bool isZeroOrUndefVector(SDValue N) {
+  if (N->getOpcode() == ISD::SPLAT_VECTOR)
+    if (auto *Op = dyn_cast<ConstantSDNode>(N->getOperand(0)))
+      return Op->getZExtValue() == 0;
+  return N.isUndef() || ISD::isBuildVectorAllZeros(N.getNode());
+}
+
+// Return the index of the zero/undef vector, or UINT32_MAX if not found.
+static uint32_t findZeroOrUndefVectorIdx(SDValue *Ops, unsigned Num) {
+  for (unsigned I = 0; I < Num; I++)
+    if (isZeroOrUndefVector(Ops[I]))
+      return I;
+  return UINT32_MAX;
+}
+
+// Keeps track of the bytes that would result after applying one or several
+// unpacks.
+struct UnpackInfo {
+  enum { ZEXT_VAL = -2 };
+  bool NeedsPermute;
+  SmallVector<int, SystemZ::VectorBytes> Mask;
+  SDValue SourceOp;
+  UnpackInfo() : NeedsPermute(false) {}
+
+  // Return true if an unpack can be used to produce the result required by
+  // Bytes. If the used source operand needs rearrangement before unpacking,
+  // NeedsPermute is set and Mask holds the required order. ZeroOpIdx is the
+  // index of the known zero vector in Ops.
+  bool tryApply(unsigned FromEltSize, const SmallVectorImpl<int> &Bytes,
+                SDValue *Ops, unsigned ZeroOpIdx) {
+    // Model an unpack of FromEltSize in UnpackedBytes, marking the bytes
+    // that become zero with ZEXT_VAL.
+    SmallVector<int, SystemZ::VectorBytes> UnpackedBytes;
+    unsigned El = 0;
+    while (El < SystemZ::VectorBytes / 2) {
+      for (unsigned i = 0; i < FromEltSize; i++)
+        UnpackedBytes.push_back(ZEXT_VAL);
+      for (unsigned i = 0; i < FromEltSize; i++)
+        UnpackedBytes.push_back(El++);
+    }
+
+    unsigned OpIdx = ZeroOpIdx == 0 ? 1 : 0;
+    Mask.assign(SystemZ::VectorBytes, -1);
+    NeedsPermute = false;
+    for (unsigned i = 0; i < SystemZ::VectorBytes; ++i) {
+      if (Bytes[i] == -1)
+        continue;
+      unsigned OpNo = Bytes[i] / SystemZ::VectorBytes;
+      unsigned Byte = Bytes[i] % SystemZ::VectorBytes;
+      if (OpNo == ZeroOpIdx && UnpackedBytes[i] == ZEXT_VAL)
+        continue;
+      if (OpNo != OpIdx || UnpackedBytes[i] == ZEXT_VAL)
+        return false;
+      Mask[UnpackedBytes[i]] = Byte;
+    }
+    for (unsigned i = 0; i < SystemZ::VectorBytes; ++i)
+      if (Mask[i] != -1 && Mask[i] != int(i)) {
+        NeedsPermute = true;
+        break;
+      }
+    SourceOp = Ops[OpIdx];
+    return true;
+  }
+};
+
+#ifndef NDEBUG
+static void dumpBytes(const SmallVectorImpl<int> &Bytes, std::string Msg) {
+  dbgs() << Msg.c_str() << " { ";
+  for (unsigned i = 0; i < Bytes.size(); i++) {
+    if (Bytes[i] == UnpackInfo::ZEXT_VAL)
+      dbgs() << "Z";
+    else
+      dbgs() << Bytes[i];
+    dbgs() << ", ";
+  }
+  dbgs() << "}\n";
+}
+#endif
+
 // Bytes is a VPERM-like permute vector, except that -1 is used for
 // undefined bytes.  Implement it on operands Ops[0] and Ops[1] using
 // VSLDB or VPERM.
@@ -4475,6 +4554,60 @@
   return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1], Op2);
 }
 
+// Detect cases where one of the source operands is a zero or undef vector
+// and try to implement the shuffle with an unpack.
+static SDValue tryShuffleWithZeroVec(SelectionDAG &DAG,
+                                     const SDLoc &DL, SDValue *Ops,
+                                     const SmallVectorImpl<int> &Bytes) {
+  // Try a single unpack, but prefer VPERM over multiple unpacks.
+  unsigned ZeroOpIdx = findZeroOrUndefVectorIdx(Ops, 2);
+  if (ZeroOpIdx == UINT32_MAX)
+    return SDValue();
+  UnpackInfo UPI;
+  unsigned FromEltSize = 1;
+  for (; FromEltSize <= 4; FromEltSize *= 2)
+    if (UPI.tryApply(FromEltSize, Bytes, Ops, ZeroOpIdx))
+      break;
+  if (FromEltSize > 4)
+    return SDValue();
+
+  if (UPI.NeedsPermute) {
+    // If the source vector is a PERMUTE that is not yet used anywhere else,
+    // modify its mask as needed.
+    if (UPI.SourceOp->getOpcode() != SystemZISD::PERMUTE ||
+        !UPI.SourceOp->use_empty())
+      return SDValue();
+    LLVM_DEBUG(dumpBytes(UPI.Mask, "Shuffling with mask before unpacking:"));
+    SDValue OrigMask = UPI.SourceOp->getOperand(2);
+    SDValue IndexNodes[SystemZ::VectorBytes];
+    for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+      int MaskEl = UPI.Mask[I];
+      if (MaskEl < 0 || OrigMask.getOperand(MaskEl).isUndef())
+        IndexNodes[I] = DAG.getUNDEF(MVT::i32);
+      else {
+        auto *COp = cast<ConstantSDNode>(OrigMask.getOperand(MaskEl));
+        IndexNodes[I] = DAG.getConstant(COp->getZExtValue(), DL, MVT::i32);
+      }
+    }
+    SDValue Op2 = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
+    UPI.SourceOp = DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8,
+                               UPI.SourceOp.getOperand(0),
+                               UPI.SourceOp.getOperand(1), Op2);
+  }
+
+  // Build the unpack.
+  LLVM_DEBUG(dbgs() << "\nUnpacking with element size " << FromEltSize
+                    << ":\n";
+             Ops[0].dump();
+             Ops[1].dump();
+             dumpBytes(Bytes, "Given this 'Bytes' mask:"));
+  unsigned FromBits = FromEltSize * 8;
+  EVT FromVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
+                                SystemZ::VectorBits / FromBits);
+  SDValue PackedOp = DAG.getNode(ISD::BITCAST, DL, FromVT, UPI.SourceOp);
+  unsigned ToBits = FromBits * 2;
+  EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ToBits),
+                               SystemZ::VectorBits / ToBits);
+  return DAG.getNode(SystemZISD::UNPACKL_HIGH, DL, OutVT, PackedOp);
+}
+
 namespace {
 // Describes a general N-operand vector shuffle.
 struct GeneralShuffle {
@@ -4579,6 +4712,29 @@
   if (Ops.size() == 1)
     Ops.push_back(DAG.getUNDEF(MVT::v16i8));
 
+  // Put a zero vector last so that unpacking can be tried as the last
+  // operation if there are more than two operands.
+  uint32_t ZeroVecOpNo = UINT32_MAX;
+  if (Ops.size() > 2) {
+    ZeroVecOpNo = findZeroOrUndefVectorIdx(&Ops[0], Ops.size());
+    // Move the zero vector to the last position without rearranging the others.
+    unsigned LastOpNo = Ops.size() - 1;
+    if (ZeroVecOpNo != UINT32_MAX && ZeroVecOpNo != LastOpNo) {
+      SDValue ZeroOp = Ops[ZeroVecOpNo];
+      Ops.erase(&Ops[ZeroVecOpNo]);
+      Ops.push_back(ZeroOp);
+      for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
+        if (Bytes[I] >= 0) {
+          unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
+          if (OpNo > ZeroVecOpNo)
+            Bytes[I] -= SystemZ::VectorBytes;
+          else if (OpNo == ZeroVecOpNo)
+            Bytes[I] += SystemZ::VectorBytes * (LastOpNo - ZeroVecOpNo);
+        }
+      ZeroVecOpNo = LastOpNo;
+    }
+  }
+
   // Create a tree of shuffles, deferring root node until after the loop.
   // Try to redistribute the undefined elements of non-root nodes so that
   // the non-root shuffles match something like a pack or merge, then adjust
@@ -4589,8 +4745,14 @@
   // In the best case this redistribution will lead to the whole tree
   // using packs and merges.  It should rarely be a loss in other cases.
   unsigned Stride = 1;
-  for (; Stride * 2 < Ops.size(); Stride *= 2) {
+  // The zero vector must be handled last. Skip it in the loop below while
+  // doing one more iteration over the remaining operands, so that only one
+  // other result is left.
+  unsigned ItersFactor = (ZeroVecOpNo == UINT32_MAX ? 2 : 1);
+  for (; Stride * ItersFactor < Ops.size(); Stride *= 2) {
     for (unsigned I = 0; I < Ops.size() - Stride; I += Stride * 2) {
+      if (I + Stride == ZeroVecOpNo)
+        continue;
       SDValue SubOps[] = { Ops[I], Ops[I + Stride] };
 
       // Create a mask for just these two operands.
@@ -4629,21 +4791,25 @@
   }
 
   // Now we just have 2 inputs.  Put the second operand in Ops[1].
-  if (Stride > 1) {
-    Ops[1] = Ops[Stride];
+  unsigned RHSOpNo = ZeroVecOpNo == UINT32_MAX ? Stride : ZeroVecOpNo;
+  if (RHSOpNo > 1) {
+    Ops[1] = Ops[RHSOpNo];
     for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
       if (Bytes[I] >= int(SystemZ::VectorBytes))
-        Bytes[I] -= (Stride - 1) * SystemZ::VectorBytes;
+        Bytes[I] -= (RHSOpNo - 1) * SystemZ::VectorBytes;
   }
-
   // Look for an instruction that can do the permute without resorting
   // to VPERM.
   unsigned OpNo0, OpNo1;
   SDValue Op;
   if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
     Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
-  else
-    Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
+  else {
+    Op = tryShuffleWithZeroVec(DAG, DL, &Ops[0], Bytes);
+    if (!Op.getNode())
+      Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
+  }
+
   return DAG.getNode(ISD::BITCAST, DL, VT, Op);
 }
 
@@ -5041,9 +5207,8 @@
   return DAG.getNode(ISD::BITCAST, DL, VT, Res);
 }
 
-SDValue
-SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
-                                              unsigned UnpackHigh) const {
+SDValue SystemZTargetLowering::
+lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
   SDValue PackedOp = Op.getOperand(0);
   EVT OutVT = Op.getValueType();
   EVT InVT = PackedOp.getValueType();
@@ -5053,11 +5218,39 @@
     FromBits *= 2;
     EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
                                  SystemZ::VectorBits / FromBits);
-    PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp);
+    PackedOp =
+      DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(PackedOp), OutVT, PackedOp);
   } while (FromBits != ToBits);
   return PackedOp;
 }
 
+// Lower a ZERO_EXTEND_VECTOR_INREG to a vector shuffle with a zero vector.
+SDValue SystemZTargetLowering::
+lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
+  SDValue PackedOp = Op.getOperand(0);
+  SDLoc DL(Op);
+  EVT OutVT = Op.getValueType();
+  EVT InVT = PackedOp.getValueType();
+  unsigned InNumElts = InVT.getVectorNumElements();
+  unsigned OutNumElts = OutVT.getVectorNumElements();
+  unsigned NumInPerOut = InNumElts / OutNumElts;
+
+  SDValue ZeroVec =
+    DAG.getSplatVector(InVT, DL, DAG.getConstant(0, DL, InVT.getScalarType()));
+
+  SmallVector<int, 16> Mask(InNumElts);
+  unsigned ZeroVecElt = InNumElts;
+  for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) {
+    unsigned MaskElt = PackedElt * NumInPerOut;
+    unsigned End = MaskElt + NumInPerOut - 1;
+    for (; MaskElt < End; MaskElt++)
+      Mask[MaskElt] = ZeroVecElt++;
+    Mask[MaskElt] = PackedElt;
+  }
+  SDValue Shuf = DAG.getVectorShuffle(InVT, DL, PackedOp, ZeroVec, Mask);
+  return DAG.getNode(ISD::BITCAST, DL, OutVT, Shuf);
+}
+
 SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
                                           unsigned ByScalar) const {
   // Look for cases where a vector shift can use the *_BY_SCALAR form.
@@ -5223,9 +5416,9 @@
   case ISD::EXTRACT_VECTOR_ELT:
     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::SIGN_EXTEND_VECTOR_INREG:
-    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
+    return lowerSIGN_EXTEND_VECTOR_INREG(Op, DAG);
   case ISD::ZERO_EXTEND_VECTOR_INREG:
-    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
+    return lowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
   case ISD::SHL:
     return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
   case ISD::SRL:
Index: llvm/test/CodeGen/SystemZ/vec-move-16.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/vec-move-16.ll
+++ llvm/test/CodeGen/SystemZ/vec-move-16.ll
@@ -40,9 +40,11 @@
 ; Test a v4i8->v4i32 extension.
 define <4 x i32> @f5(<4 x i8> *%ptr) {
 ; CHECK-LABEL: f5:
+; CHECK: larl %r1, .LCPI4_0
 ; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
-; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
-; CHECK: vuplhh %v24, [[REG2]]
+; CHECK: vl %v1, 0(%r1), 3
+; CHECK: vgbm %v2, 0
+; CHECK: vperm %v24, %v2, [[REG1]], %v1
 ; CHECK: br %r14
   %val = load <4 x i8>, <4 x i8> *%ptr
   %ret = zext <4 x i8> %val to <4 x i32>
@@ -71,10 +73,11 @@
 ; Test a v2i8->v2i64 extension.
 define <2 x i64> @f8(<2 x i8> *%ptr) {
 ; CHECK-LABEL: f8:
-; CHECK: vlreph [[REG1:%v[0-9]+]], 0(%r2)
-; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
-; CHECK: vuplhh [[REG3:%v[0-9]+]], [[REG2]]
-; CHECK: vuplhf %v24, [[REG3]]
+; CHECK: larl %r1, .LCPI7_0
+; CHECK: vlreph [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vl %v1, 0(%r1), 3
+; CHECK: vgbm %v2, 0
+; CHECK: vperm %v24, %v2, [[REG1]], %v1
 ; CHECK: br %r14
   %val = load <2 x i8>, <2 x i8> *%ptr
   %ret = zext <2 x i8> %val to <2 x i64>
@@ -84,9 +87,11 @@
 ; Test a v2i16->v2i64 extension.
 define <2 x i64> @f9(<2 x i16> *%ptr) {
 ; CHECK-LABEL: f9:
-; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
-; CHECK: vuplhh [[REG2:%v[0-9]+]], [[REG1]]
-; CHECK: vuplhf %v24, [[REG2]]
+; CHECK: larl %r1, .LCPI8_0
+; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vl %v1, 0(%r1), 3
+; CHECK: vgbm %v2, 0
+; CHECK: vperm %v24, %v2, [[REG1]], %v1
 ; CHECK: br %r14
   %val = load <2 x i16>, <2 x i16> *%ptr
   %ret = zext <2 x i16> %val to <2 x i64>
Index: llvm/test/CodeGen/SystemZ/vec-move-23.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/vec-move-23.ll
+++ llvm/test/CodeGen/SystemZ/vec-move-23.ll
@@ -68,9 +68,10 @@
 
 define void @fun4(<2 x i8> %Src, <2 x double>* %Dst) {
 ; CHECK-LABEL: fun4:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v0, %v0
-; CHECK-NEXT: vuplhf %v0, %v0
+; CHECK: larl %r1, .LCPI4_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v0, %v1, %v24, %v0
 ; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
 ; CHECK-NEXT: vst %v0, 0(%r2), 3
 ; CHECK-NEXT: br %r14
@@ -81,8 +82,10 @@
 
 define void @fun5(<2 x i16> %Src, <2 x double>* %Dst) {
 ; CHECK-LABEL: fun5:
-; CHECK: vuplhh %v0, %v24
-; CHECK-NEXT: vuplhf %v0, %v0
+; CHECK: larl %r1, .LCPI5_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v0, %v1, %v24, %v0
 ; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
 ; CHECK-NEXT: vst %v0, 0(%r2), 3
 ; CHECK-NEXT: br %r14
Index: llvm/test/CodeGen/SystemZ/vec-move-24.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/vec-move-24.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+;
+; Test that vperm is not used if a single unpack is enough.
+
+define <4 x i32> @fun0(<4 x i32>* %Src) nounwind {
+; CHECK-LABEL: fun0:
+; CHECK-NOT: vperm
+  %tmp = load <4 x i32>, <4 x i32>* %Src
+  %tmp2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp,
+                        <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x i32> %tmp2
+}
+
+define void @fun1(i8 %Src, <32 x i8>* %Dst) nounwind {
+; CHECK-LABEL: fun1:
+; CHECK-NOT: vperm
+  %I0 = insertelement <16 x i8> undef, i8 %Src, i32 0
+  %I1 = insertelement <16 x i8> %I0, i8 %Src, i32 1
+  %I2 = insertelement <16 x i8> %I1, i8 %Src, i32 2
+  %I3 = insertelement <16 x i8> %I2, i8 %Src, i32 3
+  %I4 = insertelement <16 x i8> %I3, i8 %Src, i32 4
+  %I5 = insertelement <16 x i8> %I4, i8 %Src, i32 5
+  %I6 = insertelement <16 x i8> %I5, i8 %Src, i32 6
+  %I7 = insertelement <16 x i8> %I6, i8 %Src, i32 7
+  %I8 = insertelement <16 x i8> %I7, i8 %Src, i32 8
+  %I9 = insertelement <16 x i8> %I8, i8 %Src, i32 9
+  %I10 = insertelement <16 x i8> %I9, i8 %Src, i32 10
+  %I11 = insertelement <16 x i8> %I10, i8 %Src, i32 11
+  %I12 = insertelement <16 x i8> %I11, i8 %Src, i32 12
+  %I13 = insertelement <16 x i8> %I12, i8 %Src, i32 13
+  %I14 = insertelement <16 x i8> %I13, i8 %Src, i32 14
+  %I15 = insertelement <16 x i8> %I14, i8 %Src, i32 15
+
+  %tmp = shufflevector <16 x i8> zeroinitializer,
+                       <16 x i8> %I15,
+                       <32 x i32>
+  %tmp9 = shufflevector <32 x i8> undef,
+                        <32 x i8> %tmp,
+                        <32 x i32>
+
+  store <32 x i8> %tmp9, <32 x i8>* %Dst
+  ret void
+}
+
Index: llvm/test/CodeGen/SystemZ/vec-zext.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/vec-zext.ll
+++ llvm/test/CodeGen/SystemZ/vec-zext.ll
@@ -1,5 +1,5 @@
-; Test that vector zexts are done efficently with unpack instructions also in
-; case of fewer elements than allowed, e.g. <2 x i32>.
+; Test that vector zexts are done efficiently also in the case of fewer
+; elements than allowed, e.g. <2 x i32>.
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
 
@@ -14,8 +14,10 @@
 
 define <2 x i32> @fun2(<2 x i8> %val1) {
 ; CHECK-LABEL: fun2:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v24, %v0
+; CHECK: larl %r1, .LCPI1_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v24, %v1, %v24, %v0
 ; CHECK-NEXT: br %r14
   %z = zext <2 x i8> %val1 to <2 x i32>
   ret <2 x i32> %z
@@ -23,9 +25,10 @@
 
 define <2 x i64> @fun3(<2 x i8> %val1) {
 ; CHECK-LABEL: fun3:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v0, %v0
-; CHECK-NEXT: vuplhf %v24, %v0
+; CHECK: larl %r1, .LCPI2_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v24, %v1, %v24, %v0
 ; CHECK-NEXT: br %r14
   %z = zext <2 x i8> %val1 to <2 x i64>
   ret <2 x i64> %z
@@ -41,8 +44,10 @@
 
 define <2 x i64> @fun5(<2 x i16> %val1) {
 ; CHECK-LABEL: fun5:
-; CHECK: vuplhh %v0, %v24
-; CHECK-NEXT: vuplhf %v24, %v0
+; CHECK: larl %r1, .LCPI4_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v24, %v1, %v24, %v0
 ; CHECK-NEXT: br %r14
   %z = zext <2 x i16> %val1 to <2 x i64>
   ret <2 x i64> %z
@@ -66,8 +71,10 @@
 
 define <4 x i32> @fun8(<4 x i8> %val1) {
 ; CHECK-LABEL: fun8:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v24, %v0
+; CHECK: larl %r1, .LCPI7_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v24, %v1, %v24, %v0
 ; CHECK-NEXT: br %r14
   %z = zext <4 x i8> %val1 to <4 x i32>
   ret <4 x i32> %z
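A note for review, not part of the patch itself: the mask loop in lowerZERO_EXTEND_VECTOR_INREG is easy to sanity-check in isolation. The standalone C++ sketch below mirrors that loop for one concrete case (v16i8 shuffled into a v4i32 zero extension; the example shapes and the use of plain std::vector are choices made here for illustration, and it assumes SystemZ's big-endian lane order). Mask indices greater than or equal to InNumElts select elements of the zero vector:

#include <cstdio>
#include <vector>

int main() {
  // v16i8 input shuffled into a v4i32 zero extension of its first 4 bytes.
  unsigned InNumElts = 16, OutNumElts = 4;
  unsigned NumInPerOut = InNumElts / OutNumElts;

  std::vector<int> Mask(InNumElts);
  unsigned ZeroVecElt = InNumElts;   // First element of the zero vector.
  for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) {
    unsigned MaskElt = PackedElt * NumInPerOut;
    unsigned End = MaskElt + NumInPerOut - 1;
    for (; MaskElt < End; MaskElt++)
      Mask[MaskElt] = ZeroVecElt++;  // Leading zero bytes (big-endian).
    Mask[MaskElt] = PackedElt;       // The packed element goes last.
  }
  for (int M : Mask)
    printf("%d ", M);
  printf("\n"); // 16 17 18 0 19 20 21 1 22 23 24 2 25 26 27 3
  return 0;
}

Each i32 lane ends up as three bytes taken from the zero vector followed by one source byte, which is the layout that the vperm-with-vgbm sequences in the updated CHECK lines realize.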
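Likewise illustrative only: the UnpackedBytes pattern that UnpackInfo::tryApply matches against can be printed with the sketch below, which reproduces the while-loop from tryApply under the assumption of a 16-byte vector (the local VectorBytes constant stands in for SystemZ::VectorBytes, and ZEXT_VAL bytes print as 'Z'):

#include <cstdio>

static const unsigned VectorBytes = 16; // Mirrors SystemZ::VectorBytes.

// Print the byte layout produced by an unpack-logical-high of the given
// source element size: each element of the source's high half is preceded
// by FromEltSize zero bytes.
static void printUnpackedBytes(unsigned FromEltSize) {
  int UnpackedBytes[VectorBytes];
  unsigned Idx = 0, El = 0;
  while (El < VectorBytes / 2) {
    for (unsigned i = 0; i < FromEltSize; i++)
      UnpackedBytes[Idx++] = -2; // ZEXT_VAL: a byte known to be zero.
    for (unsigned i = 0; i < FromEltSize; i++)
      UnpackedBytes[Idx++] = El++; // A source byte.
  }
  printf("eltsize %u: { ", FromEltSize);
  for (unsigned i = 0; i < VectorBytes; i++) {
    if (UnpackedBytes[i] == -2)
      printf("Z ");
    else
      printf("%d ", UnpackedBytes[i]);
  }
  printf("}\n");
}

int main() {
  // The three element sizes tried by tryShuffleWithZeroVec, corresponding
  // to vuplhb, vuplhh and vuplhf.
  for (unsigned Size = 1; Size <= 4; Size *= 2)
    printUnpackedBytes(Size);
  return 0;
}

Running it shows why a Bytes mask that interleaves zero-vector bytes with consecutive source bytes can be matched by a single UNPACKL_HIGH instead of a vperm.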