Index: llvm/lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -363,6 +363,9 @@
   // Element swapping load/store.  Same operands as regular load/store.
   VLER, VSTER,
 
+  // Zero all bits of vector and load logical element.
+  VLLEZ,
+
   // Prefetch from the second operand using the 4-bit control code in
   // the first operand.  The code is 1 for a load prefetch and 2 for
   // a store prefetch.
@@ -620,6 +623,7 @@
   SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue tryVLLEZ(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
                                  unsigned UnpackHigh) const;
   SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
@@ -642,6 +646,7 @@
   SDValue combineJOIN_DWORDS(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineFP_ROUND(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineFP_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineUINT_TO_FP(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineBSWAP(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineBR_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineSELECT_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -641,6 +641,7 @@
   setTargetDAGCombine(ISD::FP_ROUND);
   setTargetDAGCombine(ISD::STRICT_FP_ROUND);
   setTargetDAGCombine(ISD::FP_EXTEND);
+  setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
   setTargetDAGCombine(ISD::BSWAP);
   setTargetDAGCombine(ISD::SDIV);
@@ -5039,8 +5040,89 @@
 }
 
 SDValue
+SystemZTargetLowering::tryVLLEZ(SDValue Op, SelectionDAG &DAG) const {
+  // Replace ZERO_EXTEND_VECTOR_INREG -> VECTOR_SHUFFLE -> LOAD
+  // with
+  // VLE -> VLLEZ
+
+  auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0));
+  if (!SVN || !SVN->getOperand(1).isUndef())
+    return SDValue();
+
+  // Only generate one additional load (VLE).
+  EVT OutVT = Op.getValueType();
+  EVT InVT = SVN->getValueType(0);
+  if (OutVT != MVT::v2i64 || (InVT != MVT::v8i16 && InVT != MVT::v16i8))
+    return SDValue();
+
+  // Find the load by looking through any type conversions.
+  SDValue Src = SVN->getOperand(0);
+  bool Change = true;
+  while (Change) {
+    Change = false;
+    switch (Src->getOpcode()) {
+    case ISD::BITCAST:
+      if (!Src->getOperand(0).getValueType().isVector())
+        break;
+      LLVM_FALLTHROUGH;
+    case ISD::SCALAR_TO_VECTOR:
+      Src = Src->getOperand(0);
+      Change = true;
+      break;
+    default: break;
+    }
+  }
+  auto *Load = dyn_cast<LoadSDNode>(Src);
+  if (!Load || Load->isVolatile())
+    return SDValue();
+
+  // First do the VLLEZ, which will zero all other bits of the vector.
+  EVT NarrowEltVT = InVT.getScalarType();
+  unsigned NarrowEltBytes = NarrowEltVT.getSizeInBits() / 8;
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Load);
+  const SDValue &BaseAddr = Load->getBasePtr();
+  unsigned ByteOffset = SVN->getMaskElt(0) * NarrowEltBytes;
+  SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr,
+                                DAG.getIntPtrConstant(ByteOffset, DL));
+  SDVTList Tys = DAG.getVTList(InVT, MVT::Other);
+  SDValue Ops[] = { Load->getChain(), Address };
+  MachineMemOperand *MMO = Load->getMemOperand();
+  SDValue VLLEZ = DAG.getMemIntrinsicNode(SystemZISD::VLLEZ, DL, Tys, Ops,
+                                          NarrowEltVT, MMO);
+  // Load the other element.
+  ByteOffset = SVN->getMaskElt(1) * NarrowEltBytes;
+  Address = DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr,
+                        DAG.getIntPtrConstant(ByteOffset, DL));
+  SDValue EltLd = DAG.getLoad(NarrowEltVT, DL, VLLEZ.getValue(1), Address, MMO);
+  SDValue InsIdx = DAG.getVectorIdxConstant(InVT.getVectorNumElements() - 1, DL);
+  SDValue InsVec =
+      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, InVT, VLLEZ, EltLd, InsIdx);
+
+  // Update chains.
+  SDValue LoadCh = SDValue(Load, 1);
+  if (!LoadCh.use_empty()) {
+    SDValue TF = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                             VLLEZ.getValue(1), EltLd.getValue(1));
+    DAG.ReplaceAllUsesOfValueWith(LoadCh, TF);
+    SmallVector<SDValue, 2> Ops;
+    Ops.push_back(LoadCh);
+    Ops.push_back(EltLd.getValue(1));
+    DAG.UpdateNodeOperands(TF.getNode(), Ops);
+  }
+
+  return DAG.getNode(ISD::BITCAST, DL, OutVT, InsVec);
+}
+
+SDValue
 SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
                                               unsigned UnpackHigh) const {
+  if (UnpackHigh == SystemZISD::UNPACKL_HIGH) {
+    SDValue Res = tryVLLEZ(Op, DAG);
+    if (Res.getNode())
+      return Res;
+  }
+
   SDValue PackedOp = Op.getOperand(0);
   EVT OutVT = Op.getValueType();
   EVT InVT = PackedOp.getValueType();
@@ -5447,6 +5529,7 @@
     OPCODE(STRV);
     OPCODE(VLER);
     OPCODE(VSTER);
+    OPCODE(VLLEZ);
     OPCODE(PREFETCH);
   }
   return nullptr;
@@ -6081,6 +6164,29 @@
   return SDValue();
 }
 
+SDValue SystemZTargetLowering::combineUINT_TO_FP(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  if (DCI.Level != BeforeLegalizeTypes)
+    return SDValue();
+  EVT OutVT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Op = N->getOperand(0);
+  unsigned OutScalarBits = OutVT.getScalarSizeInBits();
+  unsigned InScalarBits = Op->getValueType(0).getScalarSizeInBits();
+
+  // Insert a zero_extend before type legalization to avoid scalarization, e.g.:
+  // v2f64 = uint_to_fp v2i16
+  // =>
+  // v2f64 = uint_to_fp (v2i64 zero_extend v2i16)
+  if (OutVT.isVector() && OutScalarBits > InScalarBits) {
+    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(OutVT.getScalarSizeInBits()),
+                                 OutVT.getVectorNumElements());
+    SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), ExtVT, Op);
+    return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), OutVT, ExtOp);
+  }
+  return SDValue();
+}
+
 SDValue SystemZTargetLowering::combineBSWAP(
     SDNode *N, DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -6408,6 +6514,7 @@
   case ISD::FP_ROUND:           return combineFP_ROUND(N, DCI);
   case ISD::STRICT_FP_EXTEND:
   case ISD::FP_EXTEND:          return combineFP_EXTEND(N, DCI);
+  case ISD::UINT_TO_FP:         return combineUINT_TO_FP(N, DCI);
   case ISD::BSWAP:              return combineBSWAP(N, DCI);
   case SystemZISD::BR_CCMASK:   return combineBR_CCMASK(N, DCI);
   case SystemZISD::SELECT_CCMASK: return combineSELECT_CCMASK(N, DCI);
Index: llvm/lib/Target/SystemZ/SystemZOperators.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZOperators.td
+++ llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -292,6 +292,8 @@
                                 [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def z_storeeswap       : SDNode<"SystemZISD::VSTER", SDTStore,
                                 [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def z_vllez            : SDNode<"SystemZISD::VLLEZ", SDTLoad,
+                                [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 
 def z_tdc              : SDNode<"SystemZISD::TDC", SDT_ZTest>;
 
@@ -811,17 +813,25 @@
 // Load a scalar and insert it into the low element of the high i64 of a
 // zeroed vector.
-class z_vllez<ValueType scalartype, SDPatternOperator load, int index>
+class vllez_insertpat<ValueType scalartype, SDPatternOperator load, int index>
   : PatFrag<(ops node:$addr),
             (z_vector_insert immAllZerosV,
                              (scalartype (load node:$addr)), (i32 index))>;
-def z_vllezi8  : z_vllez<i32, anyextloadi8, 7>;
-def z_vllezi16 : z_vllez<i32, anyextloadi16, 3>;
-def z_vllezi32 : z_vllez<i32, load, 1>;
+
+class vllez_patterns<ValueType scalartype, SDPatternOperator load, int index>
+  : PatFrags<(ops node:$addr),
+             [(z_vector_insert immAllZerosV,
+                               (scalartype (load node:$addr)), (i32 index)),
+              (z_vllez node:$addr)]>;
+
+def z_vllezi8  : vllez_patterns<i32, anyextloadi8, 7>;
+def z_vllezi16 : vllez_patterns<i32, anyextloadi16, 3>;
+def z_vllezi32 : vllez_patterns<i32, load, 1>;
 def z_vllezi64 : PatFrags<(ops node:$addr),
                           [(z_vector_insert immAllZerosV,
                                             (i64 (load node:$addr)), (i32 0)),
-                           (z_join_dwords (i64 (load node:$addr)), (i64 0))]>;
+                           (z_join_dwords (i64 (load node:$addr)), (i64 0)),
+                           (z_vllez node:$addr)]>;
 // We use high merges to form a v4f32 from four f32s.  Propagating zero
 // into all elements but index 1 gives this expression.
 def z_vllezf32 : PatFrag<(ops node:$addr),
@@ -840,7 +850,7 @@
                                       immAllZerosV)>;
 
 // Similarly for the high element of a zeroed vector.
-def z_vllezli32 : z_vllez<i32, load, 0>;
+def z_vllezli32 : vllez_insertpat<i32, load, 0>;
 def z_vllezlf32 : PatFrag<(ops node:$addr),
                           (z_merge_high
                            (v2i64
@@ -853,9 +863,9 @@
                             (bitconvert (v4f32 immAllZerosV))))>;
 
 // Byte-swapped variants.
-def z_vllebrzi16 : z_vllez<i32, z_loadbswap16, 3>;
-def z_vllebrzi32 : z_vllez<i32, z_loadbswap32, 1>;
-def z_vllebrzli32 : z_vllez<i32, z_loadbswap32, 0>;
+def z_vllebrzi16 : vllez_insertpat<i32, z_loadbswap16, 3>;
+def z_vllebrzi32 : vllez_insertpat<i32, z_loadbswap32, 1>;
+def z_vllebrzli32 : vllez_insertpat<i32, z_loadbswap32, 0>;
 def z_vllebrzi64 : PatFrags<(ops node:$addr),
                             [(z_vector_insert immAllZerosV,
                                               (i64 (z_loadbswap64 node:$addr)),
Index: llvm/test/CodeGen/SystemZ/vec-move-23.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/vec-move-23.ll
@@ -0,0 +1,140 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s -check-prefixes=CHECK,Z14
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s -check-prefixes=CHECK,Z15
+;
+; Check that uitofp conversions from a narrower type get a vector zero extend.
+;
+; Also test that shuffled and zero-extended vector loads get implemented
+; with vllez + vle in the case of a <2 x i64> result.
+
+define void @fun1(<2 x i16>* %Src, <2 x double>* %Dst) {
+; CHECK-LABEL: fun1:
+; CHECK: vlrepf %v0, 0(%r2)
+; CHECK-NEXT: vuplhh %v0, %v0
+; CHECK-NEXT: vuplhf %v0, %v0
+; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <2 x i16>, <2 x i16>* %Src
+  %c = uitofp <2 x i16> %l to <2 x double>
+  store <2 x double> %c, <2 x double>* %Dst
+  ret void
+}
+
+define void @fun2(<2 x i16>* %Src, <2 x double>* %Dst) {
+; CHECK-LABEL: fun2:
+; CHECK: vllezh %v0, 2(%r2)
+; CHECK-NEXT: vleh %v0, 0(%r2), 7
+; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <2 x i16>, <2 x i16>* %Src
+  %sh = shufflevector <2 x i16> %l, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+  %c = uitofp <2 x i16> %sh to <2 x double>
+  store <2 x double> %c, <2 x double>* %Dst
+  ret void
+}
+
+define void @fun3(<4 x i16>* %Src, <4 x double>* %Dst) {
+; CHECK-LABEL: fun3:
+; CHECK: vllezh %v0, 4(%r2)
+; CHECK-NEXT: vleh %v0, 2(%r2), 7
+; CHECK-NEXT: vllezh %v1, 6(%r2)
+; CHECK-NEXT: vleh %v1, 0(%r2), 7
+; CHECK-NEXT: vcdlgb %v1, %v1, 0, 0
+; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
+; CHECK-NEXT: vst %v0, 16(%r3), 4
+; CHECK-NEXT: vst %v1, 0(%r3), 4
+; CHECK-NEXT: br %r14
+  %l = load <4 x i16>, <4 x i16>* %Src
+  %sh = shufflevector <4 x i16> %l, <4 x i16> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+  %c = uitofp <4 x i16> %sh to <4 x double>
+  store <4 x double> %c, <4 x double>* %Dst
+  ret void
+}
+
+define void @fun4(<4 x i8>* %Src, <4 x double>* %Dst) {
+; CHECK-LABEL: fun4:
+; CHECK: vllezb %v0, 2(%r2)
+; CHECK-NEXT: vleb %v0, 1(%r2), 15
+; CHECK-NEXT: vllezb %v1, 3(%r2)
+; CHECK-NEXT: vleb %v1, 0(%r2), 15
+; CHECK-NEXT: vcdlgb %v1, %v1, 0, 0
+; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
+; CHECK-NEXT: vst %v0, 16(%r3), 4
+; CHECK-NEXT: vst %v1, 0(%r3), 4
+; CHECK-NEXT: br %r14
+  %l = load <4 x i8>, <4 x i8>* %Src
+  %sh = shufflevector <4 x i8> %l, <4 x i8> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+  %c = uitofp <4 x i8> %sh to <4 x double>
+  store <4 x double> %c, <4 x double>* %Dst
+  ret void
+}
+
+define void @fun5(<4 x i16>* %Src, <4 x float>* %Dst) {
+; CHECK-LABEL: fun5:
+; Z14: larl %r1, .LCPI4_0
+; Z14-NEXT: vlrepg %v0, 0(%r2)
+; Z14-NEXT: vl %v1, 0(%r1), 3
+; Z14-NEXT: vperm %v0, %v0, %v0, %v1
+; Z14-NEXT: vuplhh %v0, %v0
+; Z14-NEXT: vlgvf %r0, %v0, 3
+; Z14-NEXT: celfbr %f1, 0, %r0, 0
+; Z14-NEXT: vlgvf %r0, %v0, 2
+; Z14-NEXT: celfbr %f2, 0, %r0, 0
+; Z14-NEXT: vlgvf %r0, %v0, 1
+; Z14-NEXT: vmrhf %v1, %v2, %v1
+; Z14-NEXT: celfbr %f2, 0, %r0, 0
+; Z14-NEXT: vlgvf %r0, %v0, 0
+; Z14-NEXT: celfbr %f0, 0, %r0, 0
+; Z14-NEXT: vmrhf %v0, %v0, %v2
+; Z14-NEXT: vmrhg %v0, %v0, %v1
+; Z14-NEXT: vst %v0, 0(%r3), 3
+; Z14-NEXT: br %r14
+
+; Z15: larl %r1, .LCPI4_0
+; Z15-NEXT: vlrepg %v0, 0(%r2)
+; Z15-NEXT: vl %v1, 0(%r1), 3
+; Z15-NEXT: vperm %v0, %v0, %v0, %v1
+; Z15-NEXT: vuplhh %v0, %v0
+; Z15-NEXT: vcelfb %v0, %v0, 0, 0
+; Z15-NEXT: vst %v0, 0(%r3), 3
+; Z15-NEXT: br %r14
+  %l = load <4 x i16>, <4 x i16>* %Src
+  %sh = shufflevector <4 x i16> %l, <4 x i16> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+  %c = uitofp <4 x i16> %sh to <4 x float>
+  store <4 x float> %c, <4 x float>* %Dst
+  ret void
+}
+
+define void @fun6(<4 x i16>* %Src, <4 x i64>* %Dst) {
+; CHECK-LABEL: fun6:
+; CHECK: vllezh %v0, 6(%r2)
+; CHECK-NEXT: vleh %v0, 0(%r2), 7
+; CHECK-NEXT: vllezh %v1, 4(%r2)
+; CHECK-NEXT: vleh %v1, 2(%r2), 7
+; CHECK-NEXT: vst %v1, 16(%r3), 4
+; CHECK-NEXT: vst %v0, 0(%r3), 4
+; CHECK-NEXT: br %r14
+  %l = load <4 x i16>, <4 x i16>* %Src
+  %sh = shufflevector <4 x i16> %l, <4 x i16> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+  %z = zext <4 x i16> %sh to <4 x i64>
+  store <4 x i64> %z, <4 x i64>* %Dst
+  ret void
+}
+
+; Don't use vllez and multiple vle instructions here.
+define void @fun7(<4 x i16>* %Src, <4 x i32>* %Dst) {
+; CHECK-LABEL: fun7:
+; CHECK: larl %r1, .LCPI6_0
+; CHECK-NEXT: vlrepg %v0, 0(%r2)
+; CHECK-NEXT: vl %v1, 0(%r1), 3
+; CHECK-NEXT: vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT: vuplhh %v0, %v0
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <4 x i16>, <4 x i16>* %Src
+  %sh = shufflevector <4 x i16> %l, <4 x i16> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+  %z = zext <4 x i16> %sh to <4 x i32>
+  store <4 x i32> %z, <4 x i32>* %Dst
+  ret void
+}