Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -4447,6 +4447,98 @@
   return Op;
 }
 
+// Return true if the given Byte of N is known to be zero.
+static bool isByteZero(SDValue N, int Byte) {
+  if (N->getOpcode() == ISD::SPLAT_VECTOR)
+    if (auto *Op = dyn_cast<ConstantSDNode>(N->getOperand(0)))
+      return Op->getZExtValue() == 0;
+  if (N->getOpcode() == ISD::BUILD_VECTOR) {
+    EVT VT = N.getValueType();
+    unsigned Idx = Byte / (VT.getScalarSizeInBits() / 8);
+    if (auto *Op = dyn_cast<ConstantSDNode>(N->getOperand(Idx)))
+      return Op->getZExtValue() == 0;
+  }
+  return false;
+}
+
+// Keeps track of the bytes that would result after applying one or several
+// unpacks.
+struct UnpackInfo {
+  enum { ZEXT_VAL = -2 };
+  SmallVector<int, SystemZ::VectorBytes> UnpackedBytes;
+  SmallVector<unsigned, 3> AppliedUnpacks;
+  SmallVector<int, SystemZ::VectorBytes> Mask;
+  bool NeedsPermute;
+  SDValue SourceOp;
+  UnpackInfo() : NeedsPermute(false) {
+    for (unsigned El = 0; El < SystemZ::VectorBytes; El++)
+      UnpackedBytes.push_back(El);
+  }
+
+  // Apply one unpack on UnpackedBytes and insert ZEXT_VAL on the bytes that
+  // become zero.
+  void applyUnpack(unsigned FromSize) {
+    SmallVector<int, SystemZ::VectorBytes> Tmp;
+    unsigned El = 0;
+    while (El < SystemZ::VectorBytes / 2) {
+      for (unsigned i = 0; i < FromSize; i++)
+        Tmp.push_back(ZEXT_VAL);
+      for (unsigned i = 0; i < FromSize; i++)
+        Tmp.push_back(UnpackedBytes[El++]);
+    }
+    UnpackedBytes = Tmp;
+    AppliedUnpacks.push_back(FromSize);
+  }
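+  // For example, a first applyUnpack(1) turns the initial state
+  // { 0, 1, ..., 15 } into { Z, 0, Z, 1, ..., Z, 7 }, i.e. each of the
+  // first eight source bytes zero-extended to a halfword. A following
+  // applyUnpack(2) then gives { Z, Z, Z, 0, Z, Z, Z, 1, Z, Z, Z, 2,
+  // Z, Z, Z, 3 }, which is bytes 0-3 zero-extended to words.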
+
+  // Return true if the applied unpacks can be used to produce the result
+  // required by Bytes for Ops[OpIdx]. If the used source operand needs
+  // rearrangement before unpacking, NeedsPermute is set and Mask holds the
+  // required order.
+  bool tryApply(const SmallVectorImpl<int> &Bytes, SDValue *Ops,
+                unsigned OpIdx) {
+    Mask.assign(SystemZ::VectorBytes, -1);
+    NeedsPermute = false;
+    for (unsigned i = 0; i < SystemZ::VectorBytes; ++i) {
+      if (Bytes[i] == -1)
+        continue;
+      unsigned OpNo = Bytes[i] / SystemZ::VectorBytes;
+      unsigned Byte = Bytes[i] % SystemZ::VectorBytes;
+      if (isByteZero(Ops[OpNo], Byte) && UnpackedBytes[i] == ZEXT_VAL)
+        continue;
+      if (OpNo != OpIdx || UnpackedBytes[i] == ZEXT_VAL)
+        return false;
+      Mask[UnpackedBytes[i]] = Byte;
+    }
+    for (unsigned i = 0; i < SystemZ::VectorBytes; ++i)
+      if (Mask[i] != -1 && Mask[i] != int(i)) {
+        NeedsPermute = true;
+        break;
+      }
+    SourceOp = Ops[OpIdx];
+    return true;
+  }
+
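+  // The cost is the number of unpacks, plus 4 if the source operand must
+  // first be rearranged (e.g. a single unpack of a permuted source costs
+  // 1 + 4 = 5). UINT32_MAX means that no unpack has been applied and no
+  // permute is needed, so there is nothing useful to do.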
+  uint32_t getCost() {
+    if (AppliedUnpacks.empty() && !NeedsPermute)
+      return UINT32_MAX;
+    return AppliedUnpacks.size() + (NeedsPermute ? 4 : 0);
+  }
+};
+
+#ifndef NDEBUG
+static void dumpBytes(const SmallVectorImpl<int> &Bytes,
+                      const std::string &Msg) {
+  dbgs() << Msg << " { ";
+  for (unsigned i = 0; i < Bytes.size(); i++) {
+    if (Bytes[i] == UnpackInfo::ZEXT_VAL)
+      dbgs() << "Z";
+    else
+      dbgs() << Bytes[i];
+    dbgs() << ", ";
+  }
+  dbgs() << "}\n";
+}
+#endif
+
 // Bytes is a VPERM-like permute vector, except that -1 is used for
 // undefined bytes. Implement it on operands Ops[0] and Ops[1] using
 // VSLDB or VPERM.
@@ -4474,6 +4566,102 @@
   return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1],
                      Op2);
 }
 
+// EXPERIMENTAL: This can be used to tune the threshold for when unpacking
+// should be used instead of a VPERM. It is compared to the total number of
+// instructions needed, which may also include shuffling of the source
+// operand before unpacking.
+static cl::opt<unsigned> MAXUNPACKOPS("max-unpackops", cl::init(4));
+
+// Detect cases where one of the source operands can be used while the other
+// bytes are zero or undefined.
+static SDValue tryShuffleWithZeroVec(SelectionDAG &DAG,
+                                     const SDLoc &DL, SDValue *Ops,
+                                     const SmallVectorImpl<int> &Bytes) {
+  // Don't use unpacks if a single VSLDB could be used instead.
+  unsigned StartIndex, OpNo0, OpNo1;
+  if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1))
+    return SDValue();
+
+  // Try unpack(s).
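+  // This tries all cumulative unpack sequences over the element sizes 1,
+  // 2 and 4 bytes against both operands: {1}, {1,2}, {1,2,4}, {2}, {2,4}
+  // and {4}, keeping the cheapest one that reproduces Bytes.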
+  UnpackInfo Best;
+  for (unsigned Start = 1; Start <= 4; Start *= 2) {
+    UnpackInfo UPI;
+    for (unsigned From = Start; From <= 4; From *= 2) {
+      UPI.applyUnpack(From);
+      if (UPI.tryApply(Bytes, Ops, 0) && Best.getCost() > UPI.getCost())
+        Best = UPI;
+      if (UPI.tryApply(Bytes, Ops, 1) && Best.getCost() > UPI.getCost())
+        Best = UPI;
+    }
+  }
+  if (Best.getCost() == UINT32_MAX ||
+      Best.AppliedUnpacks.size() > MAXUNPACKOPS)
+    return SDValue();
+
+  LLVM_DEBUG(dbgs() << "\nUnpacking:\n";
+             Ops[0].dump();
+             Ops[1].dump();
+             dumpBytes(Bytes, "Given this 'Bytes' mask:"));
+  LLVM_DEBUG(dbgs() << Best.AppliedUnpacks.size() << " unpack(s) needed of t"
+                    << Best.SourceOp.getNode()->PersistentId << " to produce ";
+             dumpBytes(Best.UnpackedBytes, ""));
+  if (Best.NeedsPermute) {
+    LLVM_DEBUG(dumpBytes(Best.Mask, "Shuffling before unpacking with mask:"));
+    unsigned OpNo0, OpNo1;
+    if (Best.SourceOp->use_empty() &&
+        Best.SourceOp->getOpcode() == SystemZISD::PERMUTE) {
+      // If the source is a PERMUTE, modify it instead of emitting another one.
+      SDValue OrigMask = Best.SourceOp->getOperand(2);
+      SDValue IndexNodes[SystemZ::VectorBytes];
+      for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+        int MaskEl = Best.Mask[I];
+        if (MaskEl < 0 || OrigMask.getOperand(MaskEl).isUndef())
+          IndexNodes[I] = DAG.getUNDEF(MVT::i32);
+        else {
+          auto *COp = cast<ConstantSDNode>(OrigMask.getOperand(MaskEl));
+          IndexNodes[I] = DAG.getConstant(COp->getZExtValue(), DL, MVT::i32);
+        }
+      }
+      SDValue Op2 = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
+      Best.SourceOp = DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8,
+                                  Best.SourceOp.getOperand(0),
+                                  Best.SourceOp.getOperand(1), Op2);
+    } else if (const Permute *P = matchPermute(Best.Mask, OpNo0, OpNo1)) {
+      if (Best.AppliedUnpacks.size() + 1 > MAXUNPACKOPS)
+        return SDValue();
+      Best.SourceOp = getPermuteNode(DAG, DL, *P, Best.SourceOp, Best.SourceOp);
+    } else {
+      if (isShlDoublePermute(Best.Mask, StartIndex, OpNo0, OpNo1)) {
+        if (Best.AppliedUnpacks.size() + 1 > MAXUNPACKOPS)
+          return SDValue();
+      } else {
+        // In order to unpack, a permute of the source is needed. If it's not
+        // a VSLDB, then only do this in the cases where a single unpack is
+        // needed after the VPERM.
+        if (Best.AppliedUnpacks.size() > 1)
+          return SDValue();
+      }
+      SDValue SubOps[] = { Best.SourceOp, Best.SourceOp };
+      Best.SourceOp = getGeneralPermuteNode(DAG, DL, SubOps, Best.Mask);
+    }
+  }
+
+  // Build the unpacks.
+  unsigned FromBits = Best.AppliedUnpacks.front() * 8;
+  EVT FromVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
+                                SystemZ::VectorBits / FromBits);
+  SDValue PackedOp = DAG.getNode(ISD::BITCAST, DL, FromVT, Best.SourceOp);
+  for (unsigned FromSize : Best.AppliedUnpacks) {
+    unsigned ToBits = FromSize * 8 * 2;
+    EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ToBits),
+                                 SystemZ::VectorBits / ToBits);
+    PackedOp = DAG.getNode(SystemZISD::UNPACKL_HIGH, DL, OutVT, PackedOp);
+  }
+  return PackedOp;
+}
+
 namespace {
 // Describes a general N-operand vector shuffle.
 struct GeneralShuffle {
@@ -4578,6 +4766,39 @@
   if (Ops.size() == 1)
     Ops.push_back(DAG.getUNDEF(MVT::v16i8));
 
+  // Put a zero vector last to attempt unpacking as the last operation if
+  // there are more than two operands.
+  uint32_t ZeroVecOpNo = UINT32_MAX;
+  if (Ops.size() > 2) {
+    // Find the zero vector.
+    for (unsigned I = 0; I < Ops.size(); I++)
+      if (ISD::isBuildVectorAllZeros(Ops[I].getNode()) ||
+          (Ops[I]->getOpcode() == ISD::SPLAT_VECTOR &&
+           isa<ConstantSDNode>(Ops[I]->getOperand(0)) &&
+           cast<ConstantSDNode>(Ops[I]->getOperand(0))->getZExtValue() == 0)) {
+        ZeroVecOpNo = I;
+        break;
+      }
+    // Move it to the last position without rearranging the others.
+    unsigned LastOpNo = Ops.size() - 1;
+    if (ZeroVecOpNo != UINT32_MAX && ZeroVecOpNo != LastOpNo) {
+      SDValue ZeroOp = Ops[ZeroVecOpNo];
+      for (unsigned I = ZeroVecOpNo; I < Ops.size() - 1; I++)
+        Ops[I] = Ops[I + 1];
+      Ops[LastOpNo] = ZeroOp;
+      for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+        if (Bytes[I] >= 0) {
+          unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
+          if (OpNo > ZeroVecOpNo)
+            Bytes[I] -= SystemZ::VectorBytes;
+          else if (OpNo == ZeroVecOpNo)
+            Bytes[I] += SystemZ::VectorBytes * (LastOpNo - ZeroVecOpNo);
+        }
+      }
+      ZeroVecOpNo = LastOpNo;
+    }
+  }
+
   // Create a tree of shuffles, deferring root node until after the loop.
   // Try to redistribute the undefined elements of non-root nodes so that
   // the non-root shuffles match something like a pack or merge, then adjust
@@ -4588,8 +4809,14 @@
   // In the best case this redistribution will lead to the whole tree
   // using packs and merges. It should rarely be a loss in other cases.
   unsigned Stride = 1;
-  for (; Stride * 2 < Ops.size(); Stride *= 2) {
+  // The zero vector needs to be handled last. Do this by skipping it below
+  // while doing one more iteration over the rest, leaving only one other
+  // result.
+  unsigned ItersFactor = (ZeroVecOpNo == UINT32_MAX ? 2 : 1);
+  for (; Stride * ItersFactor < Ops.size(); Stride *= 2) {
     for (unsigned I = 0; I < Ops.size() - Stride; I += Stride * 2) {
+      if (I + Stride == ZeroVecOpNo)
+        continue;
       SDValue SubOps[] = { Ops[I], Ops[I + Stride] };
 
       // Create a mask for just these two operands.
@@ -4628,21 +4855,25 @@
   }
 
   // Now we just have 2 inputs. Put the second operand in Ops[1].
-  if (Stride > 1) {
-    Ops[1] = Ops[Stride];
+  unsigned RHSOpNo = ZeroVecOpNo == UINT32_MAX ? Stride : ZeroVecOpNo;
+  if (RHSOpNo > 1) {
+    Ops[1] = Ops[RHSOpNo];
     for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
       if (Bytes[I] >= int(SystemZ::VectorBytes))
-        Bytes[I] -= (Stride - 1) * SystemZ::VectorBytes;
+        Bytes[I] -= (RHSOpNo - 1) * SystemZ::VectorBytes;
   }
-
   // Look for an instruction that can do the permute without resorting
   // to VPERM.
   unsigned OpNo0, OpNo1;
   SDValue Op;
   if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
     Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
-  else
-    Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
+  else {
+    Op = tryShuffleWithZeroVec(DAG, DL, &Ops[0], Bytes);
+    if (!Op.getNode())
+      Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
+  }
+
   return DAG.getNode(ISD::BITCAST, DL, VT, Op);
 }
@@ -5048,6 +5279,27 @@
   EVT InVT = PackedOp.getValueType();
   unsigned ToBits = OutVT.getScalarSizeInBits();
   unsigned FromBits = InVT.getScalarSizeInBits();
+
+  // Expand a ZERO_EXTEND_VECTOR_INREG to a vector shuffle with a zero
+  // vector.
+  if (UnpackHigh == SystemZISD::UNPACKL_HIGH) {
+    SDLoc DL(Op);
+    SDValue ZeroVec = DAG.getSplatVector(
+        InVT, DL, DAG.getConstant(0, DL, InVT.getScalarType()));
+    unsigned FromNumElts = InVT.getVectorNumElements();
+    unsigned ToNumElts = OutVT.getVectorNumElements();
+    unsigned NumNarr = ToBits / FromBits;
+    SmallVector<int, SystemZ::VectorBytes> Mask(FromNumElts);
+    for (unsigned i = 0; i < ToNumElts; i++) {
+      unsigned E = i * NumNarr;
+      unsigned Z = 0;
+      for (; Z < NumNarr - 1; Z++)
+        Mask[E + Z] = FromNumElts;
+      Mask[E + Z] = i;
+    }
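+    // For example, extending v8i16 to v4i32 (NumNarr = 2) produces the
+    // mask { 8, 0, 8, 1, 8, 2, 8, 3 }: each wide element takes a zero
+    // element followed by the source element, matching big-endian zero
+    // extension.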
+    SDValue Shuf = DAG.getVectorShuffle(InVT, DL, PackedOp, ZeroVec, Mask);
+    return DAG.getNode(ISD::BITCAST, DL, OutVT, Shuf);
+  }
+
   do {
     FromBits *= 2;
     EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
Index: llvm/test/CodeGen/SystemZ/vec-move-24.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/vec-move-24.ll
@@ -0,0 +1,128 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+;
+; Test that vperm is not used when it is not necessary in some cases.
+
+; Zero extension of a vector compare.
+define <4 x i32> @fun0(<4 x i8> %arg0, <4 x i8> %arg1) nounwind {
+; CHECK-LABEL: fun0:
+; CHECK-NOT: vperm
+  %tmp = icmp uge <4 x i8> %arg0, %arg1
+  %tmp1 = zext <4 x i1> %tmp to <4 x i32>
+  ret <4 x i32> %tmp1
+}
+
+; Don't do unpacks + vperm if vsldb can be used.
+define <4 x i16> @fun1() {
+; CHECK-LABEL: fun1:
+; CHECK: vlrepg
+; CHECK-NEXT: vsldb
+; CHECK-NEXT: veslh
+; CHECK-NEXT: br
+  %tmp = load <8 x i8>, <8 x i8>* undef
+  %tmp3 = shufflevector <8 x i8> %tmp, <8 x i8> undef, <4 x i32>
+  %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
+  %tmp5 = shl nuw <4 x i16> %tmp4,
+  ret <4 x i16> %tmp5
+}
+
+; Some cases of zero extensions by shuffling with a zero vector.
+define <16 x i8> @fun2(<16 x i8>* %Src) nounwind {
+; CHECK-LABEL: fun2:
+; CHECK-NOT: vperm
+  %tmp = load <16 x i8>, <16 x i8>* %Src
+  %tmp2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %tmp, <16 x i32>
+  ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @fun3(<8 x i16>* %Src) nounwind {
+; CHECK-LABEL: fun3:
+; CHECK-NOT: vperm
+  %tmp = load <8 x i16>, <8 x i16>* %Src
+  %tmp2 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp, <8 x i32>
+  ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @fun4(<4 x i32>* %Src) nounwind {
+; CHECK-LABEL: fun4:
+; CHECK-NOT: vperm
+  %tmp = load <4 x i32>, <4 x i32>* %Src
+  %tmp2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp, <4 x i32>
+  ret <4 x i32> %tmp2
+}
+
+; This does not require a vperm, but only a single unpack.
+define void @fun5(i8 %Src, <32 x i8>* %Dst) nounwind {
+; CHECK-LABEL: fun5:
+; CHECK-NOT: vperm
+  %I0 = insertelement <16 x i8> undef, i8 %Src, i32 0
+  %I1 = insertelement <16 x i8> %I0, i8 %Src, i32 1
+  %I2 = insertelement <16 x i8> %I1, i8 %Src, i32 2
+  %I3 = insertelement <16 x i8> %I2, i8 %Src, i32 3
+  %I4 = insertelement <16 x i8> %I3, i8 %Src, i32 4
+  %I5 = insertelement <16 x i8> %I4, i8 %Src, i32 5
+  %I6 = insertelement <16 x i8> %I5, i8 %Src, i32 6
+  %I7 = insertelement <16 x i8> %I6, i8 %Src, i32 7
+  %I8 = insertelement <16 x i8> %I7, i8 %Src, i32 8
+  %I9 = insertelement <16 x i8> %I8, i8 %Src, i32 9
+  %I10 = insertelement <16 x i8> %I9, i8 %Src, i32 10
+  %I11 = insertelement <16 x i8> %I10, i8 %Src, i32 11
+  %I12 = insertelement <16 x i8> %I11, i8 %Src, i32 12
+  %I13 = insertelement <16 x i8> %I12, i8 %Src, i32 13
+  %I14 = insertelement <16 x i8> %I13, i8 %Src, i32 14
+  %I15 = insertelement <16 x i8> %I14, i8 %Src, i32 15
+
+  %tmp = shufflevector <16 x i8> zeroinitializer,
+                       <16 x i8> %I15,
+                       <32 x i32>
+  %tmp9 = shufflevector <32 x i8> undef,
+                        <32 x i8> %tmp,
+                        <32 x i32>
+
+  store <32 x i8> %tmp9, <32 x i8>* %Dst
+  ret void
+}
+
+; Unpacking after a vsldb, and not a vperm.
+define void @fun6() {
+; CHECK-LABEL: fun6:
+; CHECK-NOT: vperm
+  %tmp = load <16 x i8>, <16 x i8>* undef, align 1
+  %tmp2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %tmp, <16 x i32>
+  store <16 x i8> %tmp2, <16 x i8>* undef
+  ret void
+}
+
+; This should only require one vperm.
+define void @fun7(<4 x i16>* %Src, <4 x i16>* %Src2, <4 x i32>* %Dst) {
+; CHECK-LABEL: fun7:
+; CHECK: vperm
+; CHECK-NOT: vperm
+  %l = load <4 x i16>, <4 x i16>* %Src
+  %l2 = load <4 x i16>, <4 x i16>* %Src2
+  %sh = shufflevector <4 x i16> %l, <4 x i16> %l2, <4 x i32>
+  %z = zext <4 x i16> %sh to <4 x i32>
+  store <4 x i32> %z, <4 x i32>* %Dst
+  ret void
+}
+
+; Should only require two vperms.
+define void @fun8() {
+; CHECK-LABEL: fun8:
+; CHECK: vperm
+; CHECK: vperm
+; CHECK-NOT: vperm
+  %tmp = load <48 x i8>, <48 x i8>* null, align 1
+  %tmp6 = shufflevector <48 x i8> %tmp, <48 x i8> undef, <8 x i32>
+  %tmp7 = zext <8 x i8> %tmp6 to <8 x i16>
+  %tmp8 = shufflevector <8 x i16> %tmp7, <8 x i16> undef, <16 x i32>
+  %tmp9 = shufflevector <16 x i16> %tmp8, <16 x i16> undef, <32 x i32>
+  %tmp10 = shufflevector <32 x i16> %tmp9, <32 x i16> undef, <40 x i32>
+  store <40 x i16> %tmp10, <40 x i16>* undef, align 2
+  ret void
+}
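+
+; Note: one way to experiment with the unpacking threshold added above is a
+; run line such as the following (hypothetical value; the resulting codegen
+; is not checked by this file):
+;   llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 -max-unpackops=0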