Index: llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -304,7 +304,7 @@ uint64_t UpperVal, uint64_t LowerVal); void loadVectorConstant(const SystemZVectorConstantInfo &VCI, - SDNode *Node); + SDNode *Node, EVT VT); // Try to use gather instruction Opcode to implement vector insertion N. bool tryGather(SDNode *N, unsigned Opcode); @@ -1147,13 +1147,12 @@ } void SystemZDAGToDAGISel::loadVectorConstant( - const SystemZVectorConstantInfo &VCI, SDNode *Node) { + const SystemZVectorConstantInfo &VCI, SDNode *Node, EVT VT) { assert((VCI.Opcode == SystemZISD::BYTE_MASK || VCI.Opcode == SystemZISD::REPLICATE || VCI.Opcode == SystemZISD::ROTATE_MASK) && "Bad opcode!"); assert(VCI.VecVT.getSizeInBits() == 128 && "Expected a vector type"); - EVT VT = Node->getValueType(0); SDLoc DL(Node); SmallVector<SDValue, 2> Ops; for (unsigned OpVal : VCI.OpVals) @@ -1166,11 +1165,20 @@ SDValue BitCast = CurDAG->getNode(ISD::BITCAST, DL, VT, Op); ReplaceNode(Node, BitCast.getNode()); SelectCode(BitCast.getNode()); - } else { // float or double + } else if (VT.isFloatingPoint()) { unsigned SubRegIdx = (VT.getSizeInBits() == 32 ? 
SystemZ::subreg_h32 : SystemZ::subreg_h64); ReplaceNode( Node, CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, Op).getNode()); + } else { + unsigned NumBytes = VT.getStoreSize(); + assert((NumBytes == 4 || NumBytes == 8) && "Unexpected vector element size"); + EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), VT, + SystemZ::VectorBytes / NumBytes); + SDValue BitCast = CurDAG->getBitcast(VecVT, Op); + SDValue ValueToUse = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + BitCast, CurDAG->getVectorIdxConstant(0, DL)); + ReplaceNode(Node, ValueToUse.getNode()); } SelectCode(Op.getNode()); } @@ -1503,6 +1511,9 @@ return true; } +// EXPERIMENTAL +static cl::opt<bool> REPLICATE_ONLY("replicate-only", cl::init(false), cl::Hidden); + void SystemZDAGToDAGISel::Select(SDNode *Node) { // If we have a custom node, we already have selected! if (Node->isMachineOpcode()) { @@ -1634,7 +1645,7 @@ auto *BVN = cast<BuildVectorSDNode>(Node); SystemZVectorConstantInfo VCI(BVN); if (VCI.isVectorConstantLegal(*Subtarget)) { - loadVectorConstant(VCI, Node); + loadVectorConstant(VCI, Node, Node->getValueType(0)); return; } break; @@ -1647,7 +1658,7 @@ SystemZVectorConstantInfo VCI(Imm); bool Success = VCI.isVectorConstantLegal(*Subtarget); (void)Success; assert(Success && "Expected legal FP immediate"); - loadVectorConstant(VCI, Node); + loadVectorConstant(VCI, Node, Node->getValueType(0)); return; } @@ -1655,6 +1666,7 @@ if (tryFoldLoadStoreIntoMemOperand(Node)) return; auto *Store = cast<StoreSDNode>(Node); + auto &Op1 = Node->getOperand(1); unsigned ElemBitSize = Store->getValue().getValueSizeInBits(); if (ElemBitSize == 32) { if (tryScatter(Store, SystemZ::VSCEF)) @@ -1663,6 +1675,31 @@ if (tryScatter(Store, SystemZ::VSCEG)) return; } + if (auto *C = dyn_cast<ConstantSDNode>(Op1)) { + EVT MemVT = Store->getMemoryVT(); + unsigned NumMemBytes = MemVT.getStoreSize(); + if (C->getAPIntValue().getBitWidth() <= 64 && + !isInt<16>(C->getSExtValue()) && !C->isAllOnes() && NumMemBytes > 2) { + SmallVector<StoreSDNode *> Stores; + for (auto *U : C->uses()) 
if (StoreSDNode *ST = dyn_cast<StoreSDNode>(U)) + Stores.push_back(ST); + if (Stores.size() == C->use_size()) { + SystemZVectorConstantInfo VCI(C->getZExtValue(), NumMemBytes * 8); + if (VCI.isVectorConstantLegal(*Subtarget) && + (VCI.Opcode == SystemZISD::REPLICATE || !REPLICATE_ONLY)) { + loadVectorConstant(VCI, Op1.getNode(), MemVT); + // Need to select all stores into VSTE before the bitcast is removed. + for (auto *STNode : Stores) + SelectCode(STNode); + auto &StoredVal = Node->getOperand(0); + if (StoredVal->getOpcode() == ISD::BITCAST) + SelectCode(StoredVal.getNode()); + return; + } + } + } + } break; } } Index: llvm/lib/Target/SystemZ/SystemZISelLowering.h =================================================================== --- llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -753,12 +753,13 @@ APInt SplatUndef; // Bits correspoding to undef operands of the BVN. unsigned SplatBitSize = 0; bool isFP128 = false; - + void findSplat(); public: unsigned Opcode = 0; SmallVector<unsigned, 2> OpVals; MVT VecVT; SystemZVectorConstantInfo(APFloat FPImm); + SystemZVectorConstantInfo(uint64_t Imm, unsigned WordBits); SystemZVectorConstantInfo(BuildVectorSDNode *BVN); bool isVectorConstantLegal(const SystemZSubtarget &Subtarget); }; Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp =================================================================== --- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -790,14 +790,9 @@ return tryValue(SplatBitsZ | Middle); } -SystemZVectorConstantInfo::SystemZVectorConstantInfo(APFloat FPImm) { - IntBits = FPImm.bitcastToAPInt().zextOrSelf(128); - isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad()); - SplatBits = FPImm.bitcastToAPInt(); - unsigned Width = SplatBits.getBitWidth(); - IntBits <<= (SystemZ::VectorBits - Width); - +void SystemZVectorConstantInfo::findSplat() { // Find the smallest splat. 
+ unsigned Width = SplatBits.getBitWidth(); while (Width > 8) { unsigned HalfSize = Width / 2; APInt HighValue = SplatBits.lshr(HalfSize).trunc(HalfSize); @@ -814,6 +809,22 @@ SplatBitSize = Width; } +SystemZVectorConstantInfo::SystemZVectorConstantInfo(APFloat FPImm) { + IntBits = FPImm.bitcastToAPInt().zextOrSelf(128); + isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad()); + SplatBits = FPImm.bitcastToAPInt(); + IntBits <<= (SystemZ::VectorBits - SplatBits.getBitWidth()); + findSplat(); +} + +SystemZVectorConstantInfo::SystemZVectorConstantInfo(uint64_t Imm, + unsigned WordBits) { + IntBits = APInt(128, Imm); + IntBits <<= (SystemZ::VectorBits - WordBits); + SplatBits = APInt(WordBits, Imm); + findSplat(); +} + SystemZVectorConstantInfo::SystemZVectorConstantInfo(BuildVectorSDNode *BVN) { assert(BVN->isConstant() && "Expected a constant BUILD_VECTOR"); bool HasAnyUndefs; @@ -6336,6 +6347,83 @@ } } + bool OnlyUsedByStores = true; + for (auto *U : Op1->uses()) { + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(U)) { + EVT CurrMemVT = ST->getMemoryVT().getScalarType(); + if (CurrMemVT.isRound() && CurrMemVT.getStoreSize() <= 16) + continue; + } + OnlyUsedByStores = false; + break; + } + + // Replicate a reg or immediate with VREP instead of scalar mul / immediate + // load. It seems best to do this during the first DAGCombine as it is + // straight-forward to handle the zero-extend node in the initial DAG, and + // also not worry about the keeping the new MemVT legal (e.g. extracting an + // i16 element from a v16i8 vector). + if (Subtarget.hasVector() && OnlyUsedByStores && + DCI.Level == BeforeLegalizeTypes) { + SDValue Word = SDValue(); + EVT WordVT; + + // Return a replicated word produced by MulOp. If found, return the value + // in Word and its type in WordVT. 
+ auto FindReplicatedReg = [&](SDValue MulOp) { + EVT MulVT = MulOp.getValueType(); + if (MulOp->getOpcode() == ISD::MUL && + (MulVT == MVT::i16 || MulVT == MVT::i32 || MulVT == MVT::i64)) { + SDValue LHS = MulOp->getOperand(0); + if (LHS->getOpcode() == ISD::ZERO_EXTEND) + WordVT = LHS->getOperand(0).getValueType(); + else if (LHS->getOpcode() == ISD::AssertZext) + WordVT = cast<VTSDNode>(LHS->getOperand(1))->getVT(); + else + return; + if (auto *C = dyn_cast<ConstantSDNode>(MulOp->getOperand(1))) { + SystemZVectorConstantInfo VCI(C->getZExtValue(), + MulVT.getSizeInBits()); + if (VCI.isVectorConstantLegal(Subtarget) && + VCI.Opcode == SystemZISD::REPLICATE && + VCI.OpVals[0] == 1 && WordVT == VCI.VecVT.getScalarType()) + Word = DAG.getZExtOrTrunc(LHS->getOperand(0), SDLoc(SN), WordVT); + } + } + }; + + if (isa<BuildVectorSDNode>(Op1) && + DAG.isSplatValue(Op1, true/*AllowUndefs*/)) { + if (auto *C = dyn_cast<ConstantSDNode>(Op1->getOperand(0))) { + if (C->getAPIntValue().getBitWidth() <= 64 && + !isInt<16>(C->getSExtValue()) && !C->isAllOnes() && + MemVT.getStoreSize() > 2) { + SystemZVectorConstantInfo VCI(C->getZExtValue(), + C->getValueType(0).getSizeInBits()); + if (VCI.isVectorConstantLegal(Subtarget) && + VCI.Opcode == SystemZISD::REPLICATE) { + Word = DAG.getConstant(VCI.OpVals[0], SDLoc(SN), MVT::i32); + WordVT = VCI.VecVT.getScalarType(); + } + } + } + else + FindReplicatedReg(Op1->getOperand(0)); + } + else + FindReplicatedReg(Op1); + + if (Word != SDValue()) { + assert(MemVT.getSizeInBits() % WordVT.getSizeInBits() == 0 && + "Bad type handling"); + unsigned NumElts = MemVT.getSizeInBits() / WordVT.getSizeInBits(); + EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), WordVT, NumElts); + SDValue SplatVal = DAG.getSplatVector(SplatVT, SDLoc(SN), Word); + return DAG.getStore(SN->getChain(), SDLoc(SN), SplatVal, + SN->getBasePtr(), SN->getMemOperand()); + } + } + return SDValue(); } Index: llvm/test/CodeGen/SystemZ/store-replicated-vals.ll =================================================================== --- 
/dev/null +++ llvm/test/CodeGen/SystemZ/store-replicated-vals.ll @@ -0,0 +1,373 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s + +define void @fun_2x1b(i8* %Src, i16* %Dst) { +; CHECK-LABEL: fun_2x1b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r2) +; CHECK-NEXT: vsteh %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %i = load i8, i8* %Src + %ZE = zext i8 %i to i16 + %Val = mul i16 %ZE, 257 + store i16 %Val, i16* %Dst + ret void +} + +; multiple stores of same value +define void @fun_4x1b(i8* %Src, i32* %Dst, i32* %Dst2) { +; CHECK-LABEL: fun_4x1b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r2) +; CHECK-NEXT: vstef %v0, 0(%r3), 0 +; CHECK-NEXT: vstef %v0, 0(%r4), 0 +; CHECK-NEXT: br %r14 + %i = load i8, i8* %Src + %ZE = zext i8 %i to i32 + %Val = mul i32 %ZE, 16843009 + store i32 %Val, i32* %Dst + store i32 %Val, i32* %Dst2 + ret void +} + +define void @fun_8x1b(i8* %Src, i64* %Dst) { +; CHECK-LABEL: fun_8x1b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %i = load i8, i8* %Src + %ZE = zext i8 %i to i64 + %Val = mul i64 %ZE, 72340172838076673 + store i64 %Val, i64* %Dst + ret void +} + +; A second truncated store of same value. 
+define void @fun_8x1b_4x1b(i8* %Src, i64* %Dst, i32* %Dst2) { +; CHECK-LABEL: fun_8x1b_4x1b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: vstef %v0, 0(%r4), 0 +; CHECK-NEXT: br %r14 + %i = load i8, i8* %Src + %ZE = zext i8 %i to i64 + %Val = mul i64 %ZE, 72340172838076673 + store i64 %Val, i64* %Dst + %TrVal = trunc i64 %Val to i32 + store i32 %TrVal, i32* %Dst2 + ret void +} + +define void @fun_2x2b(i16* %Src, i32* %Dst) { +; CHECK-LABEL: fun_2x2b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlreph %v0, 0(%r2) +; CHECK-NEXT: vstef %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %i = load i16, i16* %Src + %ZE = zext i16 %i to i32 + %Val = mul i32 %ZE, 65537 + store i32 %Val, i32* %Dst + ret void +} + +define void @fun_4x2b(i16* %Src, i64* %Dst) { +; CHECK-LABEL: fun_4x2b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlreph %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %i = load i16, i16* %Src + %ZE = zext i16 %i to i64 + %Val = mul i64 %ZE, 281479271743489 + store i64 %Val, i64* %Dst + ret void +} + +define void @fun_2x4b(i32* %Src, i64* %Dst) { +; CHECK-LABEL: fun_2x4b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepf %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %i = load i32, i32* %Src + %ZE = zext i32 %i to i64 + %Val = mul i64 %ZE, 4294967297 + store i64 %Val, i64* %Dst + ret void +} + +; Multiple stores of a replicated byte +define void @fun_2x8x1b(i8* %Src, <2 x i64>* %Dst, <2 x i64>* %Dst2) { +; CHECK-LABEL: fun_2x8x1b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r2) +; CHECK-NEXT: vst %v0, 0(%r3), 3 +; CHECK-NEXT: vst %v0, 0(%r4), 3 +; CHECK-NEXT: br %r14 + %i = load i8, i8* %Src + %ZE = zext i8 %i to i64 + %Mul = mul i64 %ZE, 72340172838076673 + %tmp = insertelement <2 x i64> undef, i64 %Mul, i32 0 + %Val = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer + store <2 x i64> %Val, <2 x i64>* %Dst + store <2 x i64> %Val, <2 x i64>* %Dst2 + ret void +} + 
+define void @fun_4x2x2b(i16* %Src, <4 x i32>* %Dst) { +; CHECK-LABEL: fun_4x2x2b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlreph %v0, 0(%r2) +; CHECK-NEXT: vst %v0, 0(%r3), 3 +; CHECK-NEXT: br %r14 + %i = load i16, i16* %Src + %ZE = zext i16 %i to i32 + %Mul = mul i32 %ZE, 65537 + %tmp = insertelement <4 x i32> undef, i32 %Mul, i32 0 + %Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer + store <4 x i32> %Val, <4 x i32>* %Dst + ret void +} + +define void @fun_6x2x2b(i16* %Src, <6 x i32>* %Dst) { +; CHECK-LABEL: fun_6x2x2b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlreph %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 16(%r3), 0 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + %i = load i16, i16* %Src + %ZE = zext i16 %i to i32 + %Mul = mul i32 %ZE, 65537 + %tmp = insertelement <6 x i32> undef, i32 %Mul, i32 0 + %Val = shufflevector <6 x i32> %tmp, <6 x i32> undef, <6 x i32> zeroinitializer + store <6 x i32> %Val, <6 x i32>* %Dst + ret void +} + +define void @fun_2x2x4b(i32* %Src, <2 x i64>* %Dst) { +; CHECK-LABEL: fun_2x2x4b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepf %v0, 0(%r2) +; CHECK-NEXT: vst %v0, 0(%r3), 3 +; CHECK-NEXT: br %r14 + %i = load i32, i32* %Src + %ZE = zext i32 %i to i64 + %Mul = mul i64 %ZE, 4294967297 + %tmp = insertelement <2 x i64> undef, i64 %Mul, i32 0 + %Val = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer + store <2 x i64> %Val, <2 x i64>* %Dst + ret void +} + +define void @fun_5x2x4b(i32* %Src, <5 x i64>* %Dst) { +; CHECK-LABEL: fun_5x2x4b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepf %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 32(%r3), 0 +; CHECK-NEXT: vst %v0, 16(%r3), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + %i = load i32, i32* %Src + %ZE = zext i32 %i to i64 + %Mul = mul i64 %ZE, 4294967297 + %tmp = insertelement <5 x i64> undef, i64 %Mul, i32 0 + %Val = shufflevector <5 x i64> %tmp, <5 x i64> undef, <5 x i32> zeroinitializer + store <5 x i64> %Val, <5 x i64>* %Dst + ret void +} + 
+define void @fun_8x2b_arg(i8 %Arg, i64* %Dst) { +; CHECK-LABEL: fun_8x2b_arg: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r2, %r2 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %ZE = zext i8 %Arg to i64 + %Val = mul i64 %ZE, 72340172838076673 + store i64 %Val, i64* %Dst + ret void +} + +; A replication of a non-local value (ISD::AssertZext case). +define void @fun_nonlocalval() { +; CHECK-LABEL: fun_nonlocalval: +; CHECK: # %bb.0: +; CHECK-NEXT: lhi %r0, 0 +; CHECK-NEXT: ciblh %r0, 0, 0(%r14) +; CHECK-NEXT: .LBB13_1: # %bb2 +; CHECK-NEXT: llgf %r0, 0(%r1) +; CHECK-NEXT: vlvgp %v0, %r0, %r0 +; CHECK-NEXT: vrepf %v0, %v0, 1 +; CHECK-NEXT: vst %v0, 0(%r1), 3 +; CHECK-NEXT: br %r14 + %i = load i32, i32* undef, align 4 + br i1 undef, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %i3 = zext i32 %i to i64 + %i4 = mul nuw i64 %i3, 4294967297 + %i5 = insertelement <2 x i64> poison, i64 %i4, i64 0 + %i6 = shufflevector <2 x i64> %i5, <2 x i64> poison, <2 x i32> zeroinitializer + store <2 x i64> %i6, <2 x i64>* undef, align 8 + ret void + +bb7: + ret void +} + +;; Replicated immediates + +; Some cases where scalar instruction is better +define void @fun_8_1i0(i64* %Dst) { +; CHECK-LABEL: fun_8_1i0: +; CHECK: # %bb.0: +; CHECK-NEXT: mvghi 0(%r2), 0 +; CHECK-NEXT: br %r14 + store i64 0, i64* %Dst + ret void +} + +define void @fun_4_1iM1(i32* %Dst) { +; CHECK-LABEL: fun_4_1iM1: +; CHECK: # %bb.0: +; CHECK-NEXT: mvhi 0(%r2), -1 +; CHECK-NEXT: br %r14 + store i32 -1, i32* %Dst + ret void +} + +define void @fun_4_1iAllOnes(i32* %Dst) { +; CHECK-LABEL: fun_4_1iAllOnes: +; CHECK: # %bb.0: +; CHECK-NEXT: mvhi 0(%r2), -1 +; CHECK-NEXT: br %r14 + store i32 4294967295, i32* %Dst + ret void +} + +define void @fun_2i(i16* %Dst) { +; CHECK-LABEL: fun_2i: +; CHECK: # %bb.0: +; CHECK-NEXT: mvhhi 0(%r2), 1 +; CHECK-NEXT: br %r14 + store i16 1, i16* %Dst + ret void +} + +define void @fun_2_2i(i32* %Dst) { +; CHECK-LABEL: fun_2_2i: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vrepih %v0, 1 +; CHECK-NEXT: vstef %v0, 0(%r2), 0 +; CHECK-NEXT: br %r14 + store i32 65537, i32* %Dst + ret void +} + +define void @fun_4_2i(i64* %Dst) { +; CHECK-LABEL: fun_4_2i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepih %v0, 1 +; CHECK-NEXT: vsteg %v0, 0(%r2), 0 +; CHECK-NEXT: br %r14 + store i64 281479271743489, i64* %Dst + ret void +} + +define void @fun_2_4i(i64* %Dst) { +; CHECK-LABEL: fun_2_4i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepif %v0, 1 +; CHECK-NEXT: vsteg %v0, 0(%r2), 0 +; CHECK-NEXT: br %r14 + store i64 4294967297, i64* %Dst + ret void +} + +; Store replicated immediate twice using the same vector. +define void @fun_4_1i(i32* %Dst, i32* %Dst2) { +; CHECK-LABEL: fun_4_1i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 3 +; CHECK-NEXT: vstef %v0, 0(%r2), 0 +; CHECK-NEXT: vstef %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + store i32 50529027, i32* %Dst + store i32 50529027, i32* %Dst2 + ret void +} + +define void @fun_8_1i(i64* %Dst, i64* %Dst2) { +; CHECK-LABEL: fun_8_1i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 1 +; CHECK-NEXT: vsteg %v0, 0(%r2), 0 +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + store i64 72340172838076673, i64* %Dst + store i64 72340172838076673, i64* %Dst2 + ret void +} + +; Similar, but with vectors. +define void @fun_4_4_1i_2_4_1i(<4 x i32>* %Dst, <2 x i32>* %Dst2) { +; CHECK-LABEL: fun_4_4_1i_2_4_1i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %tmp = insertelement <4 x i32> undef, i32 50529027, i32 0 + %Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer + store <4 x i32> %Val, <4 x i32>* %Dst + %tmp2 = insertelement <2 x i32> undef, i32 50529027, i32 0 + %Val2 = shufflevector <2 x i32> %tmp2, <2 x i32> undef, <2 x i32> zeroinitializer + store <2 x i32> %Val2, <2 x i32>* %Dst2 + ret void +} + +; Same, but 64-bit store is scalar. 
+define void @fun_4_4_1i_2_4_1i_scalar(<4 x i32>* %Dst, i64* %Dst2) { +; CHECK-LABEL: fun_4_4_1i_2_4_1i_scalar: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %tmp = insertelement <4 x i32> undef, i32 50529027, i32 0 + %Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer + store <4 x i32> %Val, <4 x i32>* %Dst + store i64 217020518514230019, i64* %Dst2 + ret void +} + +define void @fun_3_2_4i(<3 x i64>* %Dst) { +; CHECK-LABEL: fun_3_2_4i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepif %v0, 1 +; CHECK-NEXT: vsteg %v0, 16(%r2), 0 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + %tmp = insertelement <3 x i64> undef, i64 4294967297, i32 0 + %Val = shufflevector <3 x i64> %tmp, <3 x i64> undef, <3 x i32> zeroinitializer + store <3 x i64> %Val, <3 x i64>* %Dst + ret void +} + +; i128 replicated '1'. +define void @fun_128i(i128* %Dst) { +; CHECK-LABEL: fun_128i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 1 +; CHECK-NEXT: vsteg %v0, 8(%r2), 0 +; CHECK-NEXT: vsteg %v0, 0(%r2), 0 +; CHECK-NEXT: br %r14 + store i128 1334440654591915542993625911497130241, i128* %Dst + ret void +}