Index: llvm/lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -620,6 +620,7 @@
   SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue tryVLLEZ(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
                                  unsigned UnpackHigh) const;
   SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -5041,8 +5041,90 @@
 }
 
 SDValue
+SystemZTargetLowering::tryVLLEZ(SDValue Op, SelectionDAG &DAG) const {
+  // Replace ZERO_EXTEND_VECTOR_INREG -> VECTOR_SHUFFLE -> LOAD
+  // with
+  // VLE -> VLLEZ
+
+  auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0));
+  if (!SVN || !SVN->getOperand(1).isUndef())
+    return SDValue();
+
+  // Only generate one additional load (VLE).
+  EVT OutVT = Op.getValueType();
+  EVT InVT = SVN->getValueType(0);
+  if (OutVT != MVT::v2i64 || (InVT != MVT::v8i16 && InVT != MVT::v16i8))
+    return SDValue();
+
+  // Find the load by looking through any type conversions.
+  SDValue Src = SVN->getOperand(0);
+  bool Change = true;
+  while (Change) {
+    Change = false;
+    switch (Src->getOpcode()) {
+    case ISD::BITCAST:
+      if (!Src->getOperand(0).getValueType().isVector())
+        break;
+      LLVM_FALLTHROUGH;
+    case ISD::SCALAR_TO_VECTOR:
+      Src = Src->getOperand(0);
+      Change = true;
+      break;
+    default: break;
+    }
+  }
+  auto *Load = dyn_cast<LoadSDNode>(Src);
+  if (!Load || Load->isVolatile())
+    return SDValue();
+
+  // Load the first element and insert it into a zero vector (-> VLLEZ).
+  EVT NarrowEltVT = InVT.getScalarType();
+  unsigned NarrowEltBytes = NarrowEltVT.getSizeInBits() / 8;
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Load);
+  const SDValue &BaseAddr = Load->getBasePtr();
+  unsigned ByteOffset = SVN->getMaskElt(0) * NarrowEltBytes;
+  SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr,
+                                DAG.getIntPtrConstant(ByteOffset, DL));
+  MachineMemOperand *MMO = Load->getMemOperand();
+  SDValue Ld0 = DAG.getLoad(NarrowEltVT, DL, Load->getChain(), Address, MMO);
+  SDValue Zeroes = DAG.getSplatBuildVector(InVT, DL,
+                                           DAG.getConstant(0, DL, MVT::i32));
+  SDValue Ins0 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, InVT, Zeroes, Ld0,
+      DAG.getVectorIdxConstant(InVT.getVectorNumElements() / 2 - 1, DL));
+
+  // Load the other element (-> VLE).
+  ByteOffset = SVN->getMaskElt(1) * NarrowEltBytes;
+  Address = DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr,
+                        DAG.getIntPtrConstant(ByteOffset, DL));
+  SDValue Ld1 = DAG.getLoad(NarrowEltVT, DL, Ld0.getValue(1), Address, MMO);
+  SDValue Ins1 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, InVT, Ins0, Ld1,
+      DAG.getVectorIdxConstant(InVT.getVectorNumElements() - 1, DL));
+
+  // Update chains.
+  SDValue LoadCh = SDValue(Load, 1);
+  if (!LoadCh.use_empty()) {
+    SDValue TF = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                             Ld0.getValue(1), Ld1.getValue(1));
+    DAG.ReplaceAllUsesOfValueWith(LoadCh, TF);
+    SmallVector<SDValue, 2> Ops;
+    Ops.push_back(LoadCh);
+    Ops.push_back(Ld1.getValue(1));
+    DAG.UpdateNodeOperands(TF.getNode(), Ops);
+  }
+
+  return DAG.getNode(ISD::BITCAST, DL, OutVT, Ins1);
+}
+
+SDValue
 SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
                                               unsigned UnpackHigh) const {
+  if (UnpackHigh == SystemZISD::UNPACKL_HIGH) {
+    SDValue Res = tryVLLEZ(Op, DAG);
+    if (Res.getNode())
+      return Res;
+  }
+
   SDValue PackedOp = Op.getOperand(0);
   EVT OutVT = Op.getValueType();
   EVT InVT = PackedOp.getValueType();
Index: llvm/test/CodeGen/SystemZ/vec-move-24.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/vec-move-24.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+;
+; Test that shuffled and zero extended vector loads gets implemented
+; with vllez + vle in the case of a <2 x i64> result.
+
+define void @fun0(<2 x i8>* %Src, <2 x i64>* %Dst) {
+; CHECK-LABEL: fun0:
+; CHECK: vllezb %v0, 1(%r2)
+; CHECK-NEXT: vleb %v0, 0(%r2), 15
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <2 x i8>, <2 x i8>* %Src
+  %sh = shufflevector <2 x i8> %l, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+  %z = zext <2 x i8> %sh to <2 x i64>
+  store <2 x i64> %z, <2 x i64>* %Dst
+  ret void
+}
+
+define void @fun1(<2 x i16>* %Src, <2 x i64>* %Dst) {
+; CHECK-LABEL: fun1:
+; CHECK: vllezh %v0, 2(%r2)
+; CHECK-NEXT: vleh %v0, 0(%r2), 7
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <2 x i16>, <2 x i16>* %Src
+  %sh = shufflevector <2 x i16> %l, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+  %z = zext <2 x i16> %sh to <2 x i64>
+  store <2 x i64> %z, <2 x i64>* %Dst
+  ret void
+}
+
+; Don't use vllez and multiple vle:s.
+define void @fun2(<4 x i16>* %Src, <4 x i32>* %Dst) {
+; CHECK-LABEL: fun2:
+; CHECK: larl %r1, .LCPI2_0
+; CHECK-NEXT: vlrepg %v0, 0(%r2)
+; CHECK-NEXT: vl %v1, 0(%r1), 3
+; CHECK-NEXT: vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT: vuplhh %v0, %v0
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <4 x i16>, <4 x i16>* %Src
+  %sh = shufflevector <4 x i16> %l, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %z = zext <4 x i16> %sh to <4 x i32>
+  store <4 x i32> %z, <4 x i32>* %Dst
+  ret void
+}