Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -6308,6 +6308,113 @@
     }
   }
 
+  // Only replicate if every user of the stored value is a store whose scalar
+  // memory type is round and at most 16 bytes in size.
+  bool OnlyUsedByStores = true;
+  for (auto *U : Op1->uses()) {
+    if (StoreSDNode *ST = dyn_cast<StoreSDNode>(U)) {
+      EVT CurrMemVT = ST->getMemoryVT().getScalarType();
+      if (CurrMemVT.isRound() && CurrMemVT.getStoreSize() <= 16)
+        continue;
+    }
+    OnlyUsedByStores = false;
+    break;
+  }
+
+  // Replicate a reg or immediate with VREP instead of scalar mul / immediate
+  // load. It seems best to do this during the first DAGCombine as it is
+  // straight-forward to handle the zero-extend node in the initial DAG, and
+  // also not worry about keeping the new MemVT legal (e.g. extracting an
+  // i16 element from a v16i8 vector).
+  if (OnlyUsedByStores && DCI.Level == BeforeLegalizeTypes) {
+    unsigned NumMemBytes = MemVT.getStoreSizeInBits() / 8;
+
+    // Return true if CVal holds a replicated immediate of size WordBits
+    // where the immediates together take up at least TotBytes. The value is
+    // returned in SplatVal. TODO: Use SystemZVectorConstantInfo instead?
+    uint64_t SplatVal;
+    auto IsReplicatedImm = [&SplatVal](uint64_t CVal, unsigned WordBits,
+                                       unsigned TotBytes) {
+      assert((TotBytes % (WordBits / 8) == 0) && "Unhandled case.");
+      // Build the mask in 64 bits: shifting a plain 'long' overflows for
+      // WordBits == 32 on hosts where long is only 32 bits wide.
+      uint64_t Mask = maskTrailingOnes<uint64_t>(WordBits);
+      SplatVal = CVal & Mask;
+      for (unsigned W = 1; W * WordBits / 8 < TotBytes; W++)
+        if (((CVal >> (W * WordBits)) & Mask) != SplatVal)
+          return false;
+      return true;
+    };
+
+    SDValue Word = SDValue();
+    EVT WordVT;
+
+    // Return a replicated immediate of C spanning TotBytes. If found, return
+    // the value in Word and the type in WordVT.
+    auto FindReplicatedImm = [&](ConstantSDNode *C, unsigned TotBytes) {
+      // Skip constants wider than 64 bits, 16-bit signed immediates, all-ones
+      // and stores of at most two bytes.
+      if (C->getAPIntValue().getBitWidth() > 64 ||
+          isInt<16>(C->getSExtValue()) || C->isAllOnes() || NumMemBytes <= 2)
+        return;
+      for (unsigned WordBits = 8; WordBits <= 32; WordBits *= 2)
+        if (IsReplicatedImm(C->getZExtValue(), WordBits, TotBytes) &&
+            isInt<16>(SplatVal)) {
+          Word = DAG.getConstant(SplatVal, SDLoc(SN), MVT::i32);
+          WordVT = EVT::getIntegerVT(*DAG.getContext(), WordBits);
+          break;
+        }
+    };
+
+    // Return a replicated word produced by MulOp. If found, return the value
+    // in Word and its type in WordVT.
+    auto FindReplicatedReg = [&](SDValue MulOp) {
+      EVT MulVT = MulOp.getValueType();
+      if (MulOp->getOpcode() == ISD::MUL &&
+          (MulVT == MVT::i16 || MulVT == MVT::i32 || MulVT == MVT::i64)) {
+        SDValue LHS = MulOp->getOperand(0);
+        if (LHS->getOpcode() == ISD::ZERO_EXTEND)
+          WordVT = LHS->getOperand(0).getValueType();
+        else if (LHS->getOpcode() == ISD::AssertZext)
+          WordVT = cast<VTSDNode>(LHS->getOperand(1))->getVT();
+        else
+          return;
+        // A multiplier made up of replicated ones (e.g. 0x0101) copies the
+        // zero-extended word into every word of the product.
+        if (auto *C = dyn_cast<ConstantSDNode>(MulOp->getOperand(1)))
+          if (WordVT.isRound() &&
+              IsReplicatedImm(C->getZExtValue(), WordVT.getSizeInBits(),
+                              MulVT.getStoreSize()) && SplatVal == 1)
+            Word = DAG.getZExtOrTrunc(LHS->getOperand(0), SDLoc(SN), WordVT);
+      }
+    };
+
+    if (isa<BuildVectorSDNode>(Op1) &&
+        DAG.isSplatValue(Op1, true/*AllowUndefs*/)) {
+      if (auto *C = dyn_cast<ConstantSDNode>(Op1->getOperand(0)))
+        FindReplicatedImm(C, Op1->getOperand(0).getValueType()
+                                 .getStoreSize());
+      else
+        FindReplicatedReg(Op1->getOperand(0));
+    }
+    else {
+      if (auto *C = dyn_cast<ConstantSDNode>(Op1))
+        FindReplicatedImm(C, NumMemBytes);
+      else
+        FindReplicatedReg(Op1);
+    }
+
+    if (Word != SDValue()) {
+      assert(MemVT.getSizeInBits() % WordVT.getSizeInBits() == 0 &&
+             "Bad type handling");
+      unsigned NumElts = MemVT.getSizeInBits() / WordVT.getSizeInBits();
+      EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), WordVT, NumElts);
+      SDValue Splat = DAG.getSplatVector(SplatVT, SDLoc(SN), Word);
+      return DAG.getStore(SN->getChain(), SDLoc(SN), Splat,
+                          SN->getBasePtr(), SN->getMemOperand());
+    }
+  }
+
   return SDValue();
 }
 
Index: llvm/test/CodeGen/SystemZ/store-replicated-vals.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/store-replicated-vals.ll
@@ -0,0 +1,375 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 \
+; RUN:   | FileCheck %s
+
+define void @fun_2x1b(i8* %Src, i16* %Dst) {
+; CHECK-LABEL: fun_2x1b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepb %v0, 0(%r2)
+; CHECK-NEXT:    vsteh %v0, 0(%r3), 0
+; CHECK-NEXT:    br %r14
+  %i = load i8, i8* %Src
+  %ZE = zext i8 %i to i16
+  %Val = mul i16 %ZE, 257 ; 0x0101
+  store i16 %Val, i16* %Dst
+  ret void
+}
+
+; Multiple stores of the same replicated value.
+define void @fun_4x1b(i8* %Src, i32* %Dst, i32* %Dst2) {
+; CHECK-LABEL: fun_4x1b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepb %v0, 0(%r2)
+; CHECK-NEXT:    vstef %v0, 0(%r3), 0
+; CHECK-NEXT:    vstef %v0, 0(%r4), 0
+; CHECK-NEXT:    br %r14
+  %i = load i8, i8* %Src
+  %ZE = zext i8 %i to i32
+  %Val = mul i32 %ZE, 16843009 ; 0x01010101
+  store i32 %Val, i32* %Dst
+  store i32 %Val, i32* %Dst2
+  ret void
+}
+
+define void @fun_8x1b(i8* %Src, i64* %Dst) {
+; CHECK-LABEL: fun_8x1b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepb %v0, 0(%r2)
+; CHECK-NEXT:    vsteg %v0, 0(%r3), 0
+; CHECK-NEXT:    br %r14
+  %i = load i8, i8* %Src
+  %ZE = zext i8 %i to i64
+  %Val = mul i64 %ZE, 72340172838076673 ; 0x0101010101010101
+  store i64 %Val, i64* %Dst
+  ret void
+}
+
+; A second truncated store of same value.
+define void @fun_8x1b_4x1b(i8* %Src, i64* %Dst, i32* %Dst2) {
+; CHECK-LABEL: fun_8x1b_4x1b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepb %v0, 0(%r2)
+; CHECK-NEXT:    vsteg %v0, 0(%r3), 0
+; CHECK-NEXT:    vstef %v0, 0(%r4), 0
+; CHECK-NEXT:    br %r14
+  %i = load i8, i8* %Src
+  %ZE = zext i8 %i to i64
+  %Val = mul i64 %ZE, 72340172838076673 ; 0x0101010101010101
+  store i64 %Val, i64* %Dst
+  %TrVal = trunc i64 %Val to i32 ; low 4 bytes of the replicated value
+  store i32 %TrVal, i32* %Dst2
+  ret void
+}
+
+define void @fun_2x2b(i16* %Src, i32* %Dst) {
+; CHECK-LABEL: fun_2x2b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlreph %v0, 0(%r2)
+; CHECK-NEXT:    vstef %v0, 0(%r3), 0
+; CHECK-NEXT:    br %r14
+  %i = load i16, i16* %Src
+  %ZE = zext i16 %i to i32
+  %Val = mul i32 %ZE, 65537 ; 0x00010001
+  store i32 %Val, i32* %Dst
+  ret void
+}
+
+define void @fun_4x2b(i16* %Src, i64* %Dst) {
+; CHECK-LABEL: fun_4x2b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlreph %v0, 0(%r2)
+; CHECK-NEXT:    vsteg %v0, 0(%r3), 0
+; CHECK-NEXT:    br %r14
+  %i = load i16, i16* %Src
+  %ZE = zext i16 %i to i64
+  %Val = mul i64 %ZE, 281479271743489 ; 0x0001000100010001
+  store i64 %Val, i64* %Dst
+  ret void
+}
+
+define void @fun_2x4b(i32* %Src, i64* %Dst) {
+; CHECK-LABEL: fun_2x4b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepf %v0, 0(%r2)
+; CHECK-NEXT:    vsteg %v0, 0(%r3), 0
+; CHECK-NEXT:    br %r14
+  %i = load i32, i32* %Src
+  %ZE = zext i32 %i to i64
+  %Val = mul i64 %ZE, 4294967297 ; 0x0000000100000001
+  store i64 %Val, i64* %Dst
+  ret void
+}
+
+; Multiple vector stores of a replicated byte (splat of a replicated i64).
+define void @fun_2x8x1b(i8* %Src, <2 x i64>* %Dst, <2 x i64>* %Dst2) {
+; CHECK-LABEL: fun_2x8x1b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepb %v0, 0(%r2)
+; CHECK-NEXT:    vst %v0, 0(%r3), 3
+; CHECK-NEXT:    vst %v0, 0(%r4), 3
+; CHECK-NEXT:    br %r14
+  %i = load i8, i8* %Src
+  %ZE = zext i8 %i to i64
+  %Mul = mul i64 %ZE, 72340172838076673 ; 0x0101010101010101
+  %tmp = insertelement <2 x i64> undef, i64 %Mul, i32 0
+  %Val = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer
+  store <2 x i64> %Val, <2 x i64>* %Dst
+  store <2 x i64> %Val, <2 x i64>* %Dst2
+  ret void
+}
+
+define void @fun_4x2x2b(i16* %Src, <4 x i32>* %Dst) {
+; CHECK-LABEL: fun_4x2x2b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlreph %v0, 0(%r2)
+; CHECK-NEXT:    vst %v0, 0(%r3), 3
+; CHECK-NEXT:    br %r14
+  %i = load i16, i16* %Src
+  %ZE = zext i16 %i to i32
+  %Mul = mul i32 %ZE, 65537 ; 0x00010001
+  %tmp = insertelement <4 x i32> undef, i32 %Mul, i32 0
+  %Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer
+  store <4 x i32> %Val, <4 x i32>* %Dst
+  ret void
+}
+
+define void @fun_6x2x2b(i16* %Src, <6 x i32>* %Dst) {
+; CHECK-LABEL: fun_6x2x2b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlreph %v0, 0(%r2)
+; CHECK-NEXT:    vsteg %v0, 16(%r3), 0
+; CHECK-NEXT:    vst %v0, 0(%r3), 4
+; CHECK-NEXT:    br %r14
+  %i = load i16, i16* %Src
+  %ZE = zext i16 %i to i32
+  %Mul = mul i32 %ZE, 65537 ; 0x00010001
+  %tmp = insertelement <6 x i32> undef, i32 %Mul, i32 0
+  %Val = shufflevector <6 x i32> %tmp, <6 x i32> undef, <6 x i32> zeroinitializer
+  store <6 x i32> %Val, <6 x i32>* %Dst
+  ret void
+}
+
+define void @fun_2x2x4b(i32* %Src, <2 x i64>* %Dst) {
+; CHECK-LABEL: fun_2x2x4b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepf %v0, 0(%r2)
+; CHECK-NEXT:    vst %v0, 0(%r3), 3
+; CHECK-NEXT:    br %r14
+  %i = load i32, i32* %Src
+  %ZE = zext i32 %i to i64
+  %Mul = mul i64 %ZE, 4294967297 ; 0x0000000100000001
+  %tmp = insertelement <2 x i64> undef, i64 %Mul, i32 0
+  %Val = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer
+  store <2 x i64> %Val, <2 x i64>* %Dst
+  ret void
+}
+
+define void @fun_5x2x4b(i32* %Src, <5 x i64>* %Dst) {
+; CHECK-LABEL: fun_5x2x4b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepf %v0, 0(%r2)
+; CHECK-NEXT:    vsteg %v0, 32(%r3), 0
+; CHECK-NEXT:    vst %v0, 16(%r3), 4
+; CHECK-NEXT:    vst %v0, 0(%r3), 4
+; CHECK-NEXT:    br %r14
+  %i = load i32, i32* %Src
+  %ZE = zext i32 %i to i64
+  %Mul = mul i64 %ZE, 4294967297 ; 0x0000000100000001
+  %tmp = insertelement <5 x i64> undef, i64 %Mul, i32 0
+  %Val = shufflevector <5 x i64> %tmp, <5 x i64> undef, <5 x i32> zeroinitializer
+  store <5 x i64> %Val, <5 x i64>* %Dst
+  ret void
+}
+
+define void @fun_8x2b_arg(i8 %Arg, i64* %Dst) {
+; CHECK-LABEL: fun_8x2b_arg:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlvgp %v0, %r2, %r2
+; CHECK-NEXT:    vrepb %v0, %v0, 7
+; CHECK-NEXT:    vsteg %v0, 0(%r3), 0
+; CHECK-NEXT:    br %r14
+  %ZE = zext i8 %Arg to i64
+  %Val = mul i64 %ZE, 72340172838076673 ; 0x0101010101010101
+  store i64 %Val, i64* %Dst
+  ret void
+}
+
+; A replication of a non-local value (ISD::AssertZext case).
+define void @fun_nonlocalval() {
+; CHECK-LABEL: fun_nonlocalval:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lhi %r0, 0
+; CHECK-NEXT:    ciblh %r0, 0, 0(%r14)
+; CHECK-NEXT:  .LBB13_1: # %bb2
+; CHECK-NEXT:    llgf %r0, 0(%r1)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r0
+; CHECK-NEXT:    vrepf %v0, %v0, 1
+; CHECK-NEXT:    vst %v0, 0(%r1), 3
+; CHECK-NEXT:    br %r14
+  %i = load i32, i32* undef, align 4
+  br i1 undef, label %bb2, label %bb7
+
+bb2: ; preds = %0
+  %i3 = zext i32 %i to i64
+  %i4 = mul nuw i64 %i3, 4294967297 ; 0x0000000100000001
+  %i5 = insertelement <2 x i64> poison, i64 %i4, i64 0
+  %i6 = shufflevector <2 x i64> %i5, <2 x i64> poison, <2 x i32> zeroinitializer
+  store <2 x i64> %i6, <2 x i64>* undef, align 8
+  ret void
+
+bb7:
+  ret void
+}
+
+;; Replicated immediates
+
+; Some cases where a scalar instruction is better.
+define void @fun_8_1i0(i64* %Dst) {
+; CHECK-LABEL: fun_8_1i0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mvghi 0(%r2), 0
+; CHECK-NEXT:    br %r14
+  store i64 0, i64* %Dst
+  ret void
+}
+
+define void @fun_4_1iM1(i32* %Dst) {
+; CHECK-LABEL: fun_4_1iM1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mvhi 0(%r2), -1
+; CHECK-NEXT:    br %r14
+  store i32 -1, i32* %Dst
+  ret void
+}
+
+define void @fun_4_1iAllOnes(i32* %Dst) {
+; CHECK-LABEL: fun_4_1iAllOnes:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mvhi 0(%r2), -1
+; CHECK-NEXT:    br %r14
+  store i32 4294967295, i32* %Dst ; 0xFFFFFFFF
+  ret void
+}
+
+define void @fun_2i(i16* %Dst) {
+; CHECK-LABEL: fun_2i:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mvhhi 0(%r2), 1
+; CHECK-NEXT:    br %r14
+  store i16 1, i16* %Dst
+  ret void
+}
+
+define void @fun_2_2i(i32* %Dst) {
+; CHECK-LABEL: fun_2_2i:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrepih %v0, 1
+; CHECK-NEXT:    vstef %v0, 0(%r2), 0
+; CHECK-NEXT:    br %r14
+  store i32 65537, i32* %Dst ; 0x00010001
+  ret void
+}
+
+define void @fun_4_2i(i64* %Dst) {
+; CHECK-LABEL: fun_4_2i:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrepih %v0, 1
+; CHECK-NEXT:    vsteg %v0, 0(%r2), 0
+; CHECK-NEXT:    br %r14
+  store i64 281479271743489, i64* %Dst ; 0x0001000100010001
+  ret void
+}
+
+define void @fun_2_4i(i64* %Dst) {
+; CHECK-LABEL: fun_2_4i:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrepif %v0, 1
+; CHECK-NEXT:    vsteg %v0, 0(%r2), 0
+; CHECK-NEXT:    br %r14
+  store i64 4294967297, i64* %Dst ; 0x0000000100000001
+  ret void
+}
+
+; Store replicated immediate twice using the same vector.
+define void @fun_4_1i(i32* %Dst, i32* %Dst2) {
+; CHECK-LABEL: fun_4_1i:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrepib %v0, 3
+; CHECK-NEXT:    vstef %v0, 0(%r2), 0
+; CHECK-NEXT:    vstef %v0, 0(%r3), 0
+; CHECK-NEXT:    br %r14
+  store i32 50529027, i32* %Dst ; 0x03030303
+  store i32 50529027, i32* %Dst2 ; 0x03030303
+  ret void
+}
+
+define void @fun_8_1i(i64* %Dst, i64* %Dst2) {
+; CHECK-LABEL: fun_8_1i:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrepib %v0, 1
+; CHECK-NEXT:    vsteg %v0, 0(%r2), 0
+; CHECK-NEXT:    vsteg %v0, 0(%r3), 0
+; CHECK-NEXT:    br %r14
+  store i64 72340172838076673, i64* %Dst ; 0x0101010101010101
+  store i64 72340172838076673, i64* %Dst2 ; 0x0101010101010101
+  ret void
+}
+
+; Similar, but with vectors.
+define void @fun_4_4_1i_2_4_1i(<4 x i32>* %Dst, <2 x i32>* %Dst2) {
+; CHECK-LABEL: fun_4_4_1i_2_4_1i:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrepib %v0, 3
+; CHECK-NEXT:    vst %v0, 0(%r2), 3
+; CHECK-NEXT:    vsteg %v0, 0(%r3), 0
+; CHECK-NEXT:    br %r14
+  %tmp = insertelement <4 x i32> undef, i32 50529027, i32 0 ; 0x03030303
+  %Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer
+  store <4 x i32> %Val, <4 x i32>* %Dst
+  %tmp2 = insertelement <2 x i32> undef, i32 50529027, i32 0 ; 0x03030303
+  %Val2 = shufflevector <2 x i32> %tmp2, <2 x i32> undef, <2 x i32> zeroinitializer
+  store <2 x i32> %Val2, <2 x i32>* %Dst2
+  ret void
+}
+
+; Same, but 64-bit store is scalar.
+define void @fun_4_4_1i_2_4_1i_scalar(<4 x i32>* %Dst, i64* %Dst2) {
+; CHECK-LABEL: fun_4_4_1i_2_4_1i_scalar:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrepib %v0, 3
+; CHECK-NEXT:    vst %v0, 0(%r2), 3
+; CHECK-NEXT:    vsteg %v0, 0(%r3), 0
+; CHECK-NEXT:    br %r14
+  %tmp = insertelement <4 x i32> undef, i32 50529027, i32 0 ; 0x03030303
+  %Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer
+  store <4 x i32> %Val, <4 x i32>* %Dst
+  store i64 217020518514230019, i64* %Dst2 ; 0x0303030303030303
+  ret void
+}
+
+define void @fun_3_2_4i(<3 x i64>* %Dst) {
+; CHECK-LABEL: fun_3_2_4i:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrepif %v0, 1
+; CHECK-NEXT:    vsteg %v0, 16(%r2), 0
+; CHECK-NEXT:    vst %v0, 0(%r2), 4
+; CHECK-NEXT:    br %r14
+  %tmp = insertelement <3 x i64> undef, i64 4294967297, i32 0 ; 0x0000000100000001
+  %Val = shufflevector <3 x i64> %tmp, <3 x i64> undef, <3 x i32> zeroinitializer
+  store <3 x i64> %Val, <3 x i64>* %Dst
+  ret void
+}
+
+; i128 replicated '1': not using vrepib, but should compile.
+define void @fun_128i(i128* %Dst) {
+; CHECK-LABEL: fun_128i:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llihf %r0, 16843009
+; CHECK-NEXT:    oilf %r0, 16843009
+; CHECK-NEXT:    stg %r0, 8(%r2)
+; CHECK-NEXT:    stg %r0, 0(%r2)
+; CHECK-NEXT:    br %r14
+  store i128 1334440654591915542993625911497130241, i128* %Dst ; 0x01 in every byte
+  ret void
+}