Index: llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -304,7 +304,7 @@ uint64_t UpperVal, uint64_t LowerVal); void loadVectorConstant(const SystemZVectorConstantInfo &VCI, - SDNode *Node); + SDNode *Node, EVT VT); // Try to use gather instruction Opcode to implement vector insertion N. bool tryGather(SDNode *N, unsigned Opcode); @@ -1147,13 +1147,12 @@ } void SystemZDAGToDAGISel::loadVectorConstant( - const SystemZVectorConstantInfo &VCI, SDNode *Node) { + const SystemZVectorConstantInfo &VCI, SDNode *Node, EVT VT) { assert((VCI.Opcode == SystemZISD::BYTE_MASK || VCI.Opcode == SystemZISD::REPLICATE || VCI.Opcode == SystemZISD::ROTATE_MASK) && "Bad opcode!"); assert(VCI.VecVT.getSizeInBits() == 128 && "Expected a vector type"); - EVT VT = Node->getValueType(0); SDLoc DL(Node); SmallVector<SDValue, 2> Ops; for (unsigned OpVal : VCI.OpVals) @@ -1166,11 +1165,20 @@ SDValue BitCast = CurDAG->getNode(ISD::BITCAST, DL, VT, Op); ReplaceNode(Node, BitCast.getNode()); SelectCode(BitCast.getNode()); - } else { // float or double + } else if (VT.isFloatingPoint()) { unsigned SubRegIdx = (VT.getSizeInBits() == 32 ? 
SystemZ::subreg_h32 : SystemZ::subreg_h64); ReplaceNode( Node, CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, Op).getNode()); + } else { + unsigned NumBytes = VT.getStoreSize(); + assert((NumBytes == 4 || NumBytes == 8) && "Unexpected vector element size"); + EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), VT, + SystemZ::VectorBytes / NumBytes); + SDValue BitCast = CurDAG->getBitcast(VecVT, Op); + SDValue ValueToUse = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + BitCast, CurDAG->getVectorIdxConstant(0, DL)); + ReplaceNode(Node, ValueToUse.getNode()); } SelectCode(Op.getNode()); } @@ -1503,6 +1511,9 @@ return true; } +// EXPERIMENTAL +static cl::opt<bool> REPLICATE_ONLY("replicate-only", cl::init(false), cl::Hidden); + void SystemZDAGToDAGISel::Select(SDNode *Node) { // If we have a custom node, we already have selected! if (Node->isMachineOpcode()) { @@ -1634,7 +1645,7 @@ auto *BVN = cast<BuildVectorSDNode>(Node); SystemZVectorConstantInfo VCI(BVN); if (VCI.isVectorConstantLegal(*Subtarget)) { - loadVectorConstant(VCI, Node); + loadVectorConstant(VCI, Node, Node->getValueType(0)); return; } break; @@ -1647,7 +1658,7 @@ SystemZVectorConstantInfo VCI(Imm); bool Success = VCI.isVectorConstantLegal(*Subtarget); (void)Success; assert(Success && "Expected legal FP immediate"); - loadVectorConstant(VCI, Node); + loadVectorConstant(VCI, Node, Node->getValueType(0)); return; } @@ -1655,6 +1666,7 @@ if (tryFoldLoadStoreIntoMemOperand(Node)) return; auto *Store = cast<StoreSDNode>(Node); + auto &Op1 = Node->getOperand(1); unsigned ElemBitSize = Store->getValue().getValueSizeInBits(); if (ElemBitSize == 32) { if (tryScatter(Store, SystemZ::VSCEF)) @@ -1663,6 +1675,31 @@ if (tryScatter(Store, SystemZ::VSCEG)) return; } + if (auto *C = dyn_cast<ConstantSDNode>(Op1)) { + EVT MemVT = Store->getMemoryVT(); + unsigned NumMemBytes = MemVT.getStoreSize(); + if (C->getAPIntValue().getBitWidth() <= 64 && + !isInt<16>(C->getSExtValue()) && !C->isAllOnes() && NumMemBytes > 2) { + SmallVector<StoreSDNode *> Stores; + for (auto *U : C->uses()) 
if (StoreSDNode *ST = dyn_cast<StoreSDNode>(U)) + Stores.push_back(ST); + if (Stores.size() == C->use_size()) { + SystemZVectorConstantInfo VCI(C->getZExtValue(), NumMemBytes * 8); + if (VCI.isVectorConstantLegal(*Subtarget) && + (VCI.Opcode == SystemZISD::REPLICATE || !REPLICATE_ONLY)) { + loadVectorConstant(VCI, Op1.getNode(), MemVT); + // Need to select all stores into VSTE before the bitcast is removed. + for (auto *STNode : Stores) + SelectCode(STNode); + auto &StoredVal = Node->getOperand(0); + if (StoredVal->getOpcode() == ISD::BITCAST) + SelectCode(StoredVal.getNode()); + return; + } + } + } + } break; } } Index: llvm/lib/Target/SystemZ/SystemZISelLowering.h =================================================================== --- llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -753,12 +753,13 @@ APInt SplatUndef; // Bits correspoding to undef operands of the BVN. unsigned SplatBitSize = 0; bool isFP128 = false; - + void findSplat(); public: unsigned Opcode = 0; SmallVector<unsigned, 2> OpVals; MVT VecVT; SystemZVectorConstantInfo(APFloat FPImm); + SystemZVectorConstantInfo(uint64_t Imm, unsigned WordBits); SystemZVectorConstantInfo(BuildVectorSDNode *BVN); bool isVectorConstantLegal(const SystemZSubtarget &Subtarget); }; Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp =================================================================== --- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -790,14 +790,9 @@ return tryValue(SplatBitsZ | Middle); } -SystemZVectorConstantInfo::SystemZVectorConstantInfo(APFloat FPImm) { - IntBits = FPImm.bitcastToAPInt().zextOrSelf(128); - isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad()); - SplatBits = FPImm.bitcastToAPInt(); - unsigned Width = SplatBits.getBitWidth(); - IntBits <<= (SystemZ::VectorBits - Width); - +void SystemZVectorConstantInfo::findSplat() { // Find the smallest splat. 
+ unsigned Width = SplatBits.getBitWidth(); while (Width > 8) { unsigned HalfSize = Width / 2; APInt HighValue = SplatBits.lshr(HalfSize).trunc(HalfSize); @@ -814,6 +809,22 @@ SplatBitSize = Width; } +SystemZVectorConstantInfo::SystemZVectorConstantInfo(APFloat FPImm) { + IntBits = FPImm.bitcastToAPInt().zextOrSelf(128); + isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad()); + SplatBits = FPImm.bitcastToAPInt(); + IntBits <<= (SystemZ::VectorBits - SplatBits.getBitWidth()); + findSplat(); +} + +SystemZVectorConstantInfo::SystemZVectorConstantInfo(uint64_t Imm, + unsigned WordBits) { + IntBits = APInt(128, Imm); + IntBits <<= (SystemZ::VectorBits - WordBits); + SplatBits = APInt(WordBits, Imm); + findSplat(); +} + SystemZVectorConstantInfo::SystemZVectorConstantInfo(BuildVectorSDNode *BVN) { assert(BVN->isConstant() && "Expected a constant BUILD_VECTOR"); bool HasAnyUndefs; @@ -6336,6 +6347,83 @@ } } + bool OnlyUsedByStores = true; + for (auto *U : Op1->uses()) { + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(U)) { + EVT CurrMemVT = ST->getMemoryVT().getScalarType(); + if (CurrMemVT.isRound() && CurrMemVT.getStoreSize() <= 16) + continue; + } + OnlyUsedByStores = false; + break; + } + + // Replicate a reg or immediate with VREP instead of scalar mul / immediate + // load. It seems best to do this during the first DAGCombine as it is + // straight-forward to handle the zero-extend node in the initial DAG, and + // also not worry about the keeping the new MemVT legal (e.g. extracting an + // i16 element from a v16i8 vector). + if (Subtarget.hasVector() && OnlyUsedByStores && + DCI.Level == BeforeLegalizeTypes) { + SDValue Word = SDValue(); + EVT WordVT; + + // Return a replicated word produced by MulOp. If found, return the value + // in Word and its type in WordVT. 
+ auto FindReplicatedReg = [&](SDValue MulOp) { + EVT MulVT = MulOp.getValueType(); + if (MulOp->getOpcode() == ISD::MUL && + (MulVT == MVT::i16 || MulVT == MVT::i32 || MulVT == MVT::i64)) { + SDValue LHS = MulOp->getOperand(0); + if (LHS->getOpcode() == ISD::ZERO_EXTEND) + WordVT = LHS->getOperand(0).getValueType(); + else if (LHS->getOpcode() == ISD::AssertZext) + WordVT = cast<VTSDNode>(LHS->getOperand(1))->getVT(); + else + return; + if (auto *C = dyn_cast<ConstantSDNode>(MulOp->getOperand(1))) { + SystemZVectorConstantInfo VCI(C->getZExtValue(), + MulVT.getSizeInBits()); + if (VCI.isVectorConstantLegal(Subtarget) && + VCI.Opcode == SystemZISD::REPLICATE && + VCI.OpVals[0] == 1 && WordVT == VCI.VecVT.getScalarType()) + Word = DAG.getZExtOrTrunc(LHS->getOperand(0), SDLoc(SN), WordVT); + } + } + }; + + if (isa<BuildVectorSDNode>(Op1) && + DAG.isSplatValue(Op1, true/*AllowUndefs*/)) { + if (auto *C = dyn_cast<ConstantSDNode>(Op1->getOperand(0))) { + if (C->getAPIntValue().getBitWidth() <= 64 && + !isInt<16>(C->getSExtValue()) && !C->isAllOnes() && + MemVT.getStoreSize() > 2) { + SystemZVectorConstantInfo VCI(C->getZExtValue(), + C->getValueType(0).getSizeInBits()); + if (VCI.isVectorConstantLegal(Subtarget) && + VCI.Opcode == SystemZISD::REPLICATE) { + Word = DAG.getConstant(VCI.OpVals[0], SDLoc(SN), MVT::i32); + WordVT = VCI.VecVT.getScalarType(); + } + } + } + else + FindReplicatedReg(Op1->getOperand(0)); + } + else + FindReplicatedReg(Op1); + + if (Word != SDValue()) { + assert(MemVT.getSizeInBits() % WordVT.getSizeInBits() == 0 && + "Bad type handling"); + unsigned NumElts = MemVT.getSizeInBits() / WordVT.getSizeInBits(); + EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), WordVT, NumElts); + SDValue SplatVal = DAG.getSplatVector(SplatVT, SDLoc(SN), Word); + return DAG.getStore(SN->getChain(), SDLoc(SN), SplatVal, + SN->getBasePtr(), SN->getMemOperand()); + } + } + return SDValue(); } Index: llvm/test/CodeGen/SystemZ/store-replicated-vals.ll =================================================================== --- 
/dev/null +++ llvm/test/CodeGen/SystemZ/store-replicated-vals.ll @@ -0,0 +1,373 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s + +define void @fun_2x1b(i8* %Src, i16* %Dst) { +; CHECK-LABEL: fun_2x1b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r2) +; CHECK-NEXT: vsteh %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %i = load i8, i8* %Src + %ZE = zext i8 %i to i16 + %Val = mul i16 %ZE, 257 + store i16 %Val, i16* %Dst + ret void +} + +; multiple stores of same value +define void @fun_4x1b(i8* %Src, i32* %Dst, i32* %Dst2) { +; CHECK-LABEL: fun_4x1b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r2) +; CHECK-NEXT: vstef %v0, 0(%r3), 0 +; CHECK-NEXT: vstef %v0, 0(%r4), 0 +; CHECK-NEXT: br %r14 + %i = load i8, i8* %Src + %ZE = zext i8 %i to i32 + %Val = mul i32 %ZE, 16843009 + store i32 %Val, i32* %Dst + store i32 %Val, i32* %Dst2 + ret void +} + +define void @fun_8x1b(i8* %Src, i64* %Dst) { +; CHECK-LABEL: fun_8x1b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %i = load i8, i8* %Src + %ZE = zext i8 %i to i64 + %Val = mul i64 %ZE, 72340172838076673 + store i64 %Val, i64* %Dst + ret void +} + +; A second truncated store of same value. 
+define void @fun_8x1b_4x1b(i8* %Src, i64* %Dst, i32* %Dst2) { +; CHECK-LABEL: fun_8x1b_4x1b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: vstef %v0, 0(%r4), 0 +; CHECK-NEXT: br %r14 + %i = load i8, i8* %Src + %ZE = zext i8 %i to i64 + %Val = mul i64 %ZE, 72340172838076673 + store i64 %Val, i64* %Dst + %TrVal = trunc i64 %Val to i32 + store i32 %TrVal, i32* %Dst2 + ret void +} + +define void @fun_2x2b(i16* %Src, i32* %Dst) { +; CHECK-LABEL: fun_2x2b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlreph %v0, 0(%r2) +; CHECK-NEXT: vstef %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %i = load i16, i16* %Src + %ZE = zext i16 %i to i32 + %Val = mul i32 %ZE, 65537 + store i32 %Val, i32* %Dst + ret void +} + +define void @fun_4x2b(i16* %Src, i64* %Dst) { +; CHECK-LABEL: fun_4x2b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlreph %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %i = load i16, i16* %Src + %ZE = zext i16 %i to i64 + %Val = mul i64 %ZE, 281479271743489 + store i64 %Val, i64* %Dst + ret void +} + +define void @fun_2x4b(i32* %Src, i64* %Dst) { +; CHECK-LABEL: fun_2x4b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepf %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %i = load i32, i32* %Src + %ZE = zext i32 %i to i64 + %Val = mul i64 %ZE, 4294967297 + store i64 %Val, i64* %Dst + ret void +} + +; Multiple stores of a replicated byte +define void @fun_2x8x1b(i8* %Src, <2 x i64>* %Dst, <2 x i64>* %Dst2) { +; CHECK-LABEL: fun_2x8x1b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r2) +; CHECK-NEXT: vst %v0, 0(%r3), 3 +; CHECK-NEXT: vst %v0, 0(%r4), 3 +; CHECK-NEXT: br %r14 + %i = load i8, i8* %Src + %ZE = zext i8 %i to i64 + %Mul = mul i64 %ZE, 72340172838076673 + %tmp = insertelement <2 x i64> undef, i64 %Mul, i32 0 + %Val = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer + store <2 x i64> %Val, <2 x i64>* %Dst + store <2 x i64> %Val, <2 x i64>* %Dst2 + ret void +} + 
+define void @fun_4x2x2b(i16* %Src, <4 x i32>* %Dst) { +; CHECK-LABEL: fun_4x2x2b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlreph %v0, 0(%r2) +; CHECK-NEXT: vst %v0, 0(%r3), 3 +; CHECK-NEXT: br %r14 + %i = load i16, i16* %Src + %ZE = zext i16 %i to i32 + %Mul = mul i32 %ZE, 65537 + %tmp = insertelement <4 x i32> undef, i32 %Mul, i32 0 + %Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer + store <4 x i32> %Val, <4 x i32>* %Dst + ret void +} + +define void @fun_6x2x2b(i16* %Src, <6 x i32>* %Dst) { +; CHECK-LABEL: fun_6x2x2b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlreph %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 16(%r3), 0 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + %i = load i16, i16* %Src + %ZE = zext i16 %i to i32 + %Mul = mul i32 %ZE, 65537 + %tmp = insertelement <6 x i32> undef, i32 %Mul, i32 0 + %Val = shufflevector <6 x i32> %tmp, <6 x i32> undef, <6 x i32> zeroinitializer + store <6 x i32> %Val, <6 x i32>* %Dst + ret void +} + +define void @fun_2x2x4b(i32* %Src, <2 x i64>* %Dst) { +; CHECK-LABEL: fun_2x2x4b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepf %v0, 0(%r2) +; CHECK-NEXT: vst %v0, 0(%r3), 3 +; CHECK-NEXT: br %r14 + %i = load i32, i32* %Src + %ZE = zext i32 %i to i64 + %Mul = mul i64 %ZE, 4294967297 + %tmp = insertelement <2 x i64> undef, i64 %Mul, i32 0 + %Val = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer + store <2 x i64> %Val, <2 x i64>* %Dst + ret void +} + +define void @fun_5x2x4b(i32* %Src, <5 x i64>* %Dst) { +; CHECK-LABEL: fun_5x2x4b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepf %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 32(%r3), 0 +; CHECK-NEXT: vst %v0, 16(%r3), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + %i = load i32, i32* %Src + %ZE = zext i32 %i to i64 + %Mul = mul i64 %ZE, 4294967297 + %tmp = insertelement <5 x i64> undef, i64 %Mul, i32 0 + %Val = shufflevector <5 x i64> %tmp, <5 x i64> undef, <5 x i32> zeroinitializer + store <5 x i64> %Val, <5 x i64>* %Dst + ret void +} + 
+define void @fun_8x2b_arg(i8 %Arg, i64* %Dst) { +; CHECK-LABEL: fun_8x2b_arg: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r2, %r2 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %ZE = zext i8 %Arg to i64 + %Val = mul i64 %ZE, 72340172838076673 + store i64 %Val, i64* %Dst + ret void +} + +; A replication of a non-local value (ISD::AssertZext case). +define void @fun_nonlocalval() { +; CHECK-LABEL: fun_nonlocalval: +; CHECK: # %bb.0: +; CHECK-NEXT: lhi %r0, 0 +; CHECK-NEXT: ciblh %r0, 0, 0(%r14) +; CHECK-NEXT: .LBB13_1: # %bb2 +; CHECK-NEXT: llgf %r0, 0(%r1) +; CHECK-NEXT: vlvgp %v0, %r0, %r0 +; CHECK-NEXT: vrepf %v0, %v0, 1 +; CHECK-NEXT: vst %v0, 0(%r1), 3 +; CHECK-NEXT: br %r14 + %i = load i32, i32* undef, align 4 + br i1 undef, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %i3 = zext i32 %i to i64 + %i4 = mul nuw i64 %i3, 4294967297 + %i5 = insertelement <2 x i64> poison, i64 %i4, i64 0 + %i6 = shufflevector <2 x i64> %i5, <2 x i64> poison, <2 x i32> zeroinitializer + store <2 x i64> %i6, <2 x i64>* undef, align 8 + ret void + +bb7: + ret void +} + +;; Replicated immediates + +; Some cases where scalar instruction is better +define void @fun_8_1i0(i64* %Dst) { +; CHECK-LABEL: fun_8_1i0: +; CHECK: # %bb.0: +; CHECK-NEXT: mvghi 0(%r2), 0 +; CHECK-NEXT: br %r14 + store i64 0, i64* %Dst + ret void +} + +define void @fun_4_1iM1(i32* %Dst) { +; CHECK-LABEL: fun_4_1iM1: +; CHECK: # %bb.0: +; CHECK-NEXT: mvhi 0(%r2), -1 +; CHECK-NEXT: br %r14 + store i32 -1, i32* %Dst + ret void +} + +define void @fun_4_1iAllOnes(i32* %Dst) { +; CHECK-LABEL: fun_4_1iAllOnes: +; CHECK: # %bb.0: +; CHECK-NEXT: mvhi 0(%r2), -1 +; CHECK-NEXT: br %r14 + store i32 4294967295, i32* %Dst + ret void +} + +define void @fun_2i(i16* %Dst) { +; CHECK-LABEL: fun_2i: +; CHECK: # %bb.0: +; CHECK-NEXT: mvhhi 0(%r2), 1 +; CHECK-NEXT: br %r14 + store i16 1, i16* %Dst + ret void +} + +define void @fun_2_2i(i32* %Dst) { +; CHECK-LABEL: fun_2_2i: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vrepih %v0, 1 +; CHECK-NEXT: vstef %v0, 0(%r2), 0 +; CHECK-NEXT: br %r14 + store i32 65537, i32* %Dst + ret void +} + +define void @fun_4_2i(i64* %Dst) { +; CHECK-LABEL: fun_4_2i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepih %v0, 1 +; CHECK-NEXT: vsteg %v0, 0(%r2), 0 +; CHECK-NEXT: br %r14 + store i64 281479271743489, i64* %Dst + ret void +} + +define void @fun_2_4i(i64* %Dst) { +; CHECK-LABEL: fun_2_4i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepif %v0, 1 +; CHECK-NEXT: vsteg %v0, 0(%r2), 0 +; CHECK-NEXT: br %r14 + store i64 4294967297, i64* %Dst + ret void +} + +; Store replicated immediate twice using the same vector. +define void @fun_4_1i(i32* %Dst, i32* %Dst2) { +; CHECK-LABEL: fun_4_1i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 3 +; CHECK-NEXT: vstef %v0, 0(%r2), 0 +; CHECK-NEXT: vstef %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + store i32 50529027, i32* %Dst + store i32 50529027, i32* %Dst2 + ret void +} + +define void @fun_8_1i(i64* %Dst, i64* %Dst2) { +; CHECK-LABEL: fun_8_1i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 1 +; CHECK-NEXT: vsteg %v0, 0(%r2), 0 +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + store i64 72340172838076673, i64* %Dst + store i64 72340172838076673, i64* %Dst2 + ret void +} + +; Similar, but with vectors. +define void @fun_4_4_1i_2_4_1i(<4 x i32>* %Dst, <2 x i32>* %Dst2) { +; CHECK-LABEL: fun_4_4_1i_2_4_1i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %tmp = insertelement <4 x i32> undef, i32 50529027, i32 0 + %Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer + store <4 x i32> %Val, <4 x i32>* %Dst + %tmp2 = insertelement <2 x i32> undef, i32 50529027, i32 0 + %Val2 = shufflevector <2 x i32> %tmp2, <2 x i32> undef, <2 x i32> zeroinitializer + store <2 x i32> %Val2, <2 x i32>* %Dst2 + ret void +} + +; Same, but 64-bit store is scalar. 
+define void @fun_4_4_1i_2_4_1i_scalar(<4 x i32>* %Dst, i64* %Dst2) { +; CHECK-LABEL: fun_4_4_1i_2_4_1i_scalar: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %tmp = insertelement <4 x i32> undef, i32 50529027, i32 0 + %Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer + store <4 x i32> %Val, <4 x i32>* %Dst + store i64 217020518514230019, i64* %Dst2 + ret void +} + +define void @fun_3_2_4i(<3 x i64>* %Dst) { +; CHECK-LABEL: fun_3_2_4i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepif %v0, 1 +; CHECK-NEXT: vsteg %v0, 16(%r2), 0 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + %tmp = insertelement <3 x i64> undef, i64 4294967297, i32 0 + %Val = shufflevector <3 x i64> %tmp, <3 x i64> undef, <3 x i32> zeroinitializer + store <3 x i64> %Val, <3 x i64>* %Dst + ret void +} + +; i128 replicated '1'. +define void @fun_128i(i128* %Dst) { +; CHECK-LABEL: fun_128i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 1 +; CHECK-NEXT: vsteg %v0, 8(%r2), 0 +; CHECK-NEXT: vsteg %v0, 0(%r2), 0 +; CHECK-NEXT: br %r14 + store i128 1334440654591915542993625911497130241, i128* %Dst + ret void +}