diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -627,8 +627,8 @@
   SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
-                                 unsigned UnpackHigh) const;
+  SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;

   bool canTreatAsByteVector(EVT VT) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -4467,12 +4477,22 @@
 }

 static bool isZeroVector(SDValue N) {
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
   if (N->getOpcode() == ISD::SPLAT_VECTOR)
     if (auto *Op = dyn_cast<ConstantSDNode>(N->getOperand(0)))
       return Op->getZExtValue() == 0;
   return ISD::isBuildVectorAllZeros(N.getNode());
 }

+// Return the index of the zero/undef vector, or UINT32_MAX if not found.
+static uint32_t findZeroVectorIdx(SDValue *Ops, unsigned Num) {
+  for (unsigned I = 0; I < Num; I++)
+    if (isZeroVector(Ops[I]))
+      return I;
+  return UINT32_MAX;
+}
+
 // Bytes is a VPERM-like permute vector, except that -1 is used for
 // undefined bytes.  Implement it on operands Ops[0] and Ops[1] using
 // VSLDB or VPERM.
@@ -4491,9 +4501,8 @@

   // Fall back on VPERM.  Construct an SDNode for the permute vector.  Try to
   // eliminate a zero vector by reusing any zero index in the permute vector.
-  unsigned ZeroVecIdx =
-    isZeroVector(Ops[0]) ? 0 : (isZeroVector(Ops[1]) ? 1 : UINT_MAX);
-  if (ZeroVecIdx != UINT_MAX) {
+  unsigned ZeroVecIdx = findZeroVectorIdx(&Ops[0], 2);
+  if (ZeroVecIdx != UINT32_MAX) {
     bool MaskFirst = true;
     int ZeroIdx = -1;
     for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
@@ -4551,10 +4560,13 @@
 namespace {
 // Describes a general N-operand vector shuffle.
 struct GeneralShuffle {
-  GeneralShuffle(EVT vt) : VT(vt) {}
+  GeneralShuffle(EVT vt) : VT(vt), UnpackFromEltSize(UINT_MAX) {}
   void addUndef();
   bool add(SDValue, unsigned);
   SDValue getNode(SelectionDAG &, const SDLoc &);
+  void tryPrepareForUnpack();
+  bool unpackWasPrepared() { return UnpackFromEltSize <= 4; }
+  SDValue insertUnpackIfPrepared(SelectionDAG &DAG, const SDLoc &DL, SDValue Op);

   // The operands of the shuffle.
   SmallVector<SDValue, SystemZ::VectorBytes> Ops;
@@ -4566,6 +4578,9 @@

   // The type of the shuffle result.
   EVT VT;
+
+  // Holds a value of 1, 2 or 4 if a final unpack has been prepared for.
+  unsigned UnpackFromEltSize;
 };
 }

@@ -4648,6 +4663,9 @@
   if (Ops.size() == 0)
     return DAG.getUNDEF(VT);

+  // Use a single unpack if possible as the last operation.
+  tryPrepareForUnpack();
+
   // Make sure that there are at least two shuffle operands.
   if (Ops.size() == 1)
     Ops.push_back(DAG.getUNDEF(MVT::v16i8));
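For reference, the Bytes vector that getGeneralPermuteNode and the new tryPrepareForUnpack reason about encodes result byte I as OpNo * SystemZ::VectorBytes + byte-within-operand, with -1 for undefined bytes. The following standalone sketch (plain C++, not part of the patch; it assumes the 16-byte SystemZ vector width) prints the mask for a zero extension of <2 x i16> to <2 x i64> where Op#1 is a zero vector:

#include <cstdio>
#include <vector>

int main() {
  const unsigned VectorBytes = 16; // bytes per SystemZ vector register

  // Big-endian element layout: each i64 result element consists of six
  // zero bytes followed by the two bytes of one i16 source element.
  std::vector<int> Bytes;
  for (unsigned Elt = 0; Elt < 2; ++Elt) {
    for (unsigned B = 0; B < 6; ++B)
      Bytes.push_back(VectorBytes); // byte 0 of Op#1, the zero vector
    Bytes.push_back(Elt * 2);       // high byte of source element, Op#0
    Bytes.push_back(Elt * 2 + 1);   // low byte of source element, Op#0
  }

  for (int B : Bytes)
    printf("%d ", B); // 16 16 16 16 16 16 0 1 16 16 16 16 16 16 2 3
  printf("\n");
}

A mask in this form is what findZeroVectorIdx lets the shuffle code exploit: every byte either comes from the zero operand or from a real source operand.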
@@ -4713,13 +4731,117 @@
   // to VPERM.
   unsigned OpNo0, OpNo1;
   SDValue Op;
-  if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
+  if (unpackWasPrepared() && Ops[1].isUndef())
+    Op = Ops[0];
+  else if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
     Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
   else
     Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
+
+  Op = insertUnpackIfPrepared(DAG, DL, Op);
+
   return DAG.getNode(ISD::BITCAST, DL, VT, Op);
 }

+#ifndef NDEBUG
+static void dumpBytes(const SmallVectorImpl<int> &Bytes, std::string Msg) {
+  dbgs() << Msg.c_str() << " { ";
+  for (unsigned i = 0; i < Bytes.size(); i++)
+    dbgs() << Bytes[i] << " ";
+  dbgs() << "}\n";
+}
+#endif
+
+// If the Bytes vector matches an unpack operation, prepare to do the unpack
+// after all else by removing the zero vector and the effect of the unpack on
+// Bytes.
+void GeneralShuffle::tryPrepareForUnpack() {
+  uint32_t ZeroVecOpNo = findZeroVectorIdx(&Ops[0], Ops.size());
+  if (ZeroVecOpNo == UINT32_MAX || Ops.size() == 1)
+    return;
+
+  // Only do this if removing the zero vector reduces the depth, otherwise
+  // the critical path will increase with the final unpack.
+  if (Ops.size() > 2 &&
+      Log2_32_Ceil(Ops.size()) == Log2_32_Ceil(Ops.size() - 1))
+    return;
+
+  // Find an unpack that would allow removing the zero vector from Ops.
+  UnpackFromEltSize = 1;
+  for (; UnpackFromEltSize <= 4; UnpackFromEltSize *= 2) {
+    bool MatchUnpack = true;
+    SmallVector<int, SystemZ::VectorBytes> SrcBytes;
+    for (unsigned Elt = 0; Elt < SystemZ::VectorBytes; Elt++) {
+      unsigned ToEltSize = UnpackFromEltSize * 2;
+      bool IsZextByte = (Elt % ToEltSize) < UnpackFromEltSize;
+      if (!IsZextByte)
+        SrcBytes.push_back(Bytes[Elt]);
+      if (Bytes[Elt] != -1) {
+        unsigned OpNo = unsigned(Bytes[Elt]) / SystemZ::VectorBytes;
+        if (IsZextByte != (OpNo == ZeroVecOpNo)) {
+          MatchUnpack = false;
+          break;
+        }
+      }
+    }
+    if (MatchUnpack) {
+      if (Ops.size() == 2) {
+        // Don't use unpack if a single source operand needs rearrangement.
+        for (unsigned i = 0; i < SystemZ::VectorBytes / 2; i++)
+          if (SrcBytes[i] != -1 && SrcBytes[i] % 16 != int(i)) {
+            UnpackFromEltSize = UINT_MAX;
+            return;
+          }
+      }
+      break;
+    }
+  }
+  if (UnpackFromEltSize > 4)
+    return;
+
+  LLVM_DEBUG(dbgs() << "Preparing for final unpack of element size "
+             << UnpackFromEltSize << ". Zero vector is Op#" << ZeroVecOpNo
+             << ".\n";
+             dumpBytes(Bytes, "Original Bytes vector:"););
+
+  // Apply the unpack in reverse to the Bytes array.
+  unsigned B = 0;
+  for (unsigned Elt = 0; Elt < SystemZ::VectorBytes;) {
+    Elt += UnpackFromEltSize;
+    for (unsigned i = 0; i < UnpackFromEltSize; i++, Elt++, B++)
+      Bytes[B] = Bytes[Elt];
+  }
+  while (B < SystemZ::VectorBytes)
+    Bytes[B++] = -1;
+
+  // Remove the zero vector from Ops
+  Ops.erase(&Ops[ZeroVecOpNo]);
+  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
+    if (Bytes[I] >= 0) {
+      unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
+      if (OpNo > ZeroVecOpNo)
+        Bytes[I] -= SystemZ::VectorBytes;
+    }

+  LLVM_DEBUG(dumpBytes(Bytes, "Resulting Bytes vector, zero vector removed:");
+             dbgs() << "\n";);
+}
+
+SDValue GeneralShuffle::insertUnpackIfPrepared(SelectionDAG &DAG,
+                                               const SDLoc &DL,
+                                               SDValue Op) {
+  if (!unpackWasPrepared())
+    return Op;
+  unsigned InBits = UnpackFromEltSize * 8;
+  EVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBits),
+                              SystemZ::VectorBits / InBits);
+  SDValue PackedOp = DAG.getNode(ISD::BITCAST, DL, InVT, Op);
+  unsigned OutBits = InBits * 2;
+  EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(OutBits),
+                               SystemZ::VectorBits / OutBits);
+  return DAG.getNode(SystemZISD::UNPACKL_HIGH, DL, OutVT, PackedOp);
+}
+
 // Return true if the given BUILD_VECTOR is a scalar-to-vector conversion.
 static bool isScalarToVector(SDValue Op) {
   for (unsigned I = 1, E = Op.getNumOperands(); I != E; ++I)
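To see what "applying the unpack in reverse" does, consider a shuffle whose result is { 0, w0, 0, w1 } at i32 granularity, with Op#0 the source and Op#1 the zero vector. A standalone sketch (plain C++, not part of the patch) of the compaction loop from tryPrepareForUnpack:

#include <cstdio>

int main() {
  const unsigned VectorBytes = 16;
  const unsigned UnpackFromEltSize = 4; // an i32 -> i64 unpack

  // Result { 0, w0, 0, w1 }: bytes 0-3 and 8-11 come from the zero vector
  // (Op#1, indices 16-31), the rest are source bytes 0-7 in order.
  int Bytes[VectorBytes] = {16, 17, 18, 19, 0, 1, 2, 3,
                            20, 21, 22, 23, 4, 5, 6, 7};

  // Apply the unpack in reverse: drop the zero-extension bytes and compact
  // the source bytes to the left; the tail becomes undefined (-1).
  unsigned B = 0;
  for (unsigned Elt = 0; Elt < VectorBytes;) {
    Elt += UnpackFromEltSize;
    for (unsigned i = 0; i < UnpackFromEltSize; i++, Elt++, B++)
      Bytes[B] = Bytes[Elt];
  }
  while (B < VectorBytes)
    Bytes[B++] = -1;

  for (int Byte : Bytes)
    printf("%d ", Byte); // 0 1 2 3 4 5 6 7 -1 -1 -1 -1 -1 -1 -1 -1
  printf("\n");
}

After the zero vector is erased from Ops, the remaining mask is an identity prefix, so getNode can take the unpackWasPrepared() fast path and insertUnpackIfPrepared emits the single UNPACKL_HIGH.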
@@ -5114,9 +5236,8 @@
   return DAG.getNode(ISD::BITCAST, DL, VT, Res);
 }

-SDValue
-SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
-                                              unsigned UnpackHigh) const {
+SDValue SystemZTargetLowering::
+lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
   SDValue PackedOp = Op.getOperand(0);
   EVT OutVT = Op.getValueType();
   EVT InVT = PackedOp.getValueType();
@@ -5126,11 +5247,39 @@
     FromBits *= 2;
     EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
                                  SystemZ::VectorBits / FromBits);
-    PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp);
+    PackedOp =
+      DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(PackedOp), OutVT, PackedOp);
   } while (FromBits != ToBits);
   return PackedOp;
 }

+// Lower a ZERO_EXTEND_VECTOR_INREG to a vector shuffle with a zero vector.
+SDValue SystemZTargetLowering::
+lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
+  SDValue PackedOp = Op.getOperand(0);
+  SDLoc DL(Op);
+  EVT OutVT = Op.getValueType();
+  EVT InVT = PackedOp.getValueType();
+  unsigned InNumElts = InVT.getVectorNumElements();
+  unsigned OutNumElts = OutVT.getVectorNumElements();
+  unsigned NumInPerOut = InNumElts / OutNumElts;
+
+  SDValue ZeroVec =
+    DAG.getSplatVector(InVT, DL, DAG.getConstant(0, DL, InVT.getScalarType()));
+
+  SmallVector<int, 16> Mask(InNumElts);
+  unsigned ZeroVecElt = InNumElts;
+  for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) {
+    unsigned MaskElt = PackedElt * NumInPerOut;
+    unsigned End = MaskElt + NumInPerOut - 1;
+    for (; MaskElt < End; MaskElt++)
+      Mask[MaskElt] = ZeroVecElt++;
+    Mask[MaskElt] = PackedElt;
+  }
+  SDValue Shuf = DAG.getVectorShuffle(InVT, DL, PackedOp, ZeroVec, Mask);
+  return DAG.getNode(ISD::BITCAST, DL, OutVT, Shuf);
+}
+
 SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
                                           unsigned ByScalar) const {
   // Look for cases where a vector shift can use the *_BY_SCALAR form.
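The Mask built in lowerZERO_EXTEND_VECTOR_INREG interleaves elements of the zero splat (indices >= InNumElts, i.e. the second shuffle operand) with the packed source elements, keeping big-endian order: zeros first, the data element last within each group. A standalone sketch (plain C++, not part of the patch) for InVT = v16i8 and OutVT = v4i32:

#include <cstdio>

int main() {
  const unsigned InNumElts = 16, OutNumElts = 4;
  const unsigned NumInPerOut = InNumElts / OutNumElts;

  // Mirrors the loop above: each output element is NumInPerOut - 1 zero
  // elements followed by one source element.
  int Mask[InNumElts];
  unsigned ZeroVecElt = InNumElts;
  for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) {
    unsigned MaskElt = PackedElt * NumInPerOut;
    unsigned End = MaskElt + NumInPerOut - 1;
    for (; MaskElt < End; MaskElt++)
      Mask[MaskElt] = ZeroVecElt++;
    Mask[MaskElt] = PackedElt;
  }

  for (int M : Mask)
    printf("%d ", M); // 16 17 18 0 19 20 21 1 22 23 24 2 25 26 27 3
  printf("\n");
}

The resulting VECTOR_SHUFFLE then goes through the normal shuffle lowering, where tryPrepareForUnpack can turn it into a single unpack, or getGeneralPermuteNode into a VPERM, as the updated tests below show.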
@@ -5296,9 +5445,9 @@
   case ISD::EXTRACT_VECTOR_ELT:
     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::SIGN_EXTEND_VECTOR_INREG:
-    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
+    return lowerSIGN_EXTEND_VECTOR_INREG(Op, DAG);
   case ISD::ZERO_EXTEND_VECTOR_INREG:
-    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
+    return lowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
   case ISD::SHL:
     return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
   case ISD::SRL:
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-16.ll b/llvm/test/CodeGen/SystemZ/vec-move-16.ll
--- a/llvm/test/CodeGen/SystemZ/vec-move-16.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-16.ll
@@ -40,9 +40,10 @@
 ; Test a v4i8->v4i32 extension.
 define <4 x i32> @f5(<4 x i8> *%ptr) {
 ; CHECK-LABEL: f5:
+; CHECK: larl %r1, .LCPI4_0
 ; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
-; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
-; CHECK: vuplhh %v24, [[REG2]]
+; CHECK: vl %v1, 0(%r1), 3
+; CHECK: vperm %v24, %v1, [[REG1]], %v1
 ; CHECK: br %r14
   %val = load <4 x i8>, <4 x i8> *%ptr
   %ret = zext <4 x i8> %val to <4 x i32>
@@ -71,10 +72,10 @@
 ; Test a v2i8->v2i64 extension.
 define <2 x i64> @f8(<2 x i8> *%ptr) {
 ; CHECK-LABEL: f8:
-; CHECK: vlreph [[REG1:%v[0-9]+]], 0(%r2)
-; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
-; CHECK: vuplhh [[REG3:%v[0-9]+]], [[REG2]]
-; CHECK: vuplhf %v24, [[REG3]]
+; CHECK: larl %r1, .LCPI7_0
+; CHECK: vlreph [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vl %v1, 0(%r1), 3
+; CHECK: vperm %v24, %v1, [[REG1]], %v1
 ; CHECK: br %r14
   %val = load <2 x i8>, <2 x i8> *%ptr
   %ret = zext <2 x i8> %val to <2 x i64>
@@ -84,9 +85,10 @@
 ; Test a v2i16->v2i64 extension.
 define <2 x i64> @f9(<2 x i16> *%ptr) {
 ; CHECK-LABEL: f9:
-; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
-; CHECK: vuplhh [[REG2:%v[0-9]+]], [[REG1]]
-; CHECK: vuplhf %v24, [[REG2]]
+; CHECK: larl %r1, .LCPI8_0
+; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vl %v1, 0(%r1), 3
+; CHECK: vperm %v24, %v1, [[REG1]], %v1
 ; CHECK: br %r14
   %val = load <2 x i16>, <2 x i16> *%ptr
   %ret = zext <2 x i16> %val to <2 x i64>
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-23.ll b/llvm/test/CodeGen/SystemZ/vec-move-23.ll
--- a/llvm/test/CodeGen/SystemZ/vec-move-23.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-23.ll
@@ -68,9 +68,9 @@

 define void @fun4(<2 x i8> %Src, <2 x double>* %Dst) {
 ; CHECK-LABEL: fun4:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v0, %v0
-; CHECK-NEXT: vuplhf %v0, %v0
+; CHECK: larl %r1, .LCPI4_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v0, %v0, %v24, %v0
 ; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
 ; CHECK-NEXT: vst %v0, 0(%r2), 3
 ; CHECK-NEXT: br %r14
@@ -81,8 +81,9 @@

 define void @fun5(<2 x i16> %Src, <2 x double>* %Dst) {
 ; CHECK-LABEL: fun5:
-; CHECK: vuplhh %v0, %v24
-; CHECK-NEXT: vuplhf %v0, %v0
+; CHECK: larl %r1, .LCPI5_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v0, %v0, %v24, %v0
 ; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
 ; CHECK-NEXT: vst %v0, 0(%r2), 3
 ; CHECK-NEXT: br %r14
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-24.ll b/llvm/test/CodeGen/SystemZ/vec-move-24.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-move-24.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+;
+; Test that vperm is not used if a single unpack is enough.
+
+define <4 x i32> @fun0(<4 x i32>* %Src) nounwind {
+; CHECK-LABEL: fun0:
+; CHECK-NOT: vperm
+  %tmp = load <4 x i32>, <4 x i32>* %Src
+  %tmp2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x i32> %tmp2
+}
+
+define void @fun1(i8 %Src, <32 x i8>* %Dst) nounwind {
+; CHECK-LABEL: fun1:
+; CHECK-NOT: vperm
+  %I0 = insertelement <16 x i8> undef, i8 %Src, i32 0
+  %I1 = insertelement <16 x i8> %I0, i8 %Src, i32 1
+  %I2 = insertelement <16 x i8> %I1, i8 %Src, i32 2
+  %I3 = insertelement <16 x i8> %I2, i8 %Src, i32 3
+  %I4 = insertelement <16 x i8> %I3, i8 %Src, i32 4
+  %I5 = insertelement <16 x i8> %I4, i8 %Src, i32 5
+  %I6 = insertelement <16 x i8> %I5, i8 %Src, i32 6
+  %I7 = insertelement <16 x i8> %I6, i8 %Src, i32 7
+  %I8 = insertelement <16 x i8> %I7, i8 %Src, i32 8
+  %I9 = insertelement <16 x i8> %I8, i8 %Src, i32 9
+  %I10 = insertelement <16 x i8> %I9, i8 %Src, i32 10
+  %I11 = insertelement <16 x i8> %I10, i8 %Src, i32 11
+  %I12 = insertelement <16 x i8> %I11, i8 %Src, i32 12
+  %I13 = insertelement <16 x i8> %I12, i8 %Src, i32 13
+  %I14 = insertelement <16 x i8> %I13, i8 %Src, i32 14
+  %I15 = insertelement <16 x i8> %I14, i8 %Src, i32 15
+
+  %tmp = shufflevector <16 x i8> zeroinitializer,
+                       <16 x i8> %I15,
+                       <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19,
+                                   i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23,
+                                   i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27,
+                                   i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  %tmp9 = shufflevector <32 x i8> undef,
+                        <32 x i8> %tmp,
+                        <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39,
+                                    i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47,
+                                    i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39,
+                                    i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+
+  store <32 x i8> %tmp9, <32 x i8>* %Dst
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/SystemZ/vec-zext.ll b/llvm/test/CodeGen/SystemZ/vec-zext.ll
--- a/llvm/test/CodeGen/SystemZ/vec-zext.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-zext.ll
@@ -1,5 +1,5 @@
-; Test that vector zexts are done efficently with unpack instructions also in
-; case of fewer elements than allowed, e.g. <2 x i32>.
+; Test that vector zexts are done efficiently also in case of fewer elements
+; than allowed, e.g. <2 x i32>.
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s

@@ -14,8 +14,9 @@

 define <2 x i32> @fun2(<2 x i8> %val1) {
 ; CHECK-LABEL: fun2:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v24, %v0
+; CHECK: larl %r1, .LCPI1_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
 ; CHECK-NEXT: br %r14
   %z = zext <2 x i8> %val1 to <2 x i32>
   ret <2 x i32> %z
@@ -23,9 +24,9 @@

 define <2 x i64> @fun3(<2 x i8> %val1) {
 ; CHECK-LABEL: fun3:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v0, %v0
-; CHECK-NEXT: vuplhf %v24, %v0
+; CHECK: larl %r1, .LCPI2_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
 ; CHECK-NEXT: br %r14
   %z = zext <2 x i8> %val1 to <2 x i64>
   ret <2 x i64> %z
@@ -41,8 +42,9 @@

 define <2 x i64> @fun5(<2 x i16> %val1) {
 ; CHECK-LABEL: fun5:
-; CHECK: vuplhh %v0, %v24
-; CHECK-NEXT: vuplhf %v24, %v0
+; CHECK: larl %r1, .LCPI4_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
 ; CHECK-NEXT: br %r14
   %z = zext <2 x i16> %val1 to <2 x i64>
   ret <2 x i64> %z
@@ -66,8 +68,9 @@

 define <4 x i32> @fun8(<4 x i8> %val1) {
 ; CHECK-LABEL: fun8:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v24, %v0
+; CHECK: larl %r1, .LCPI7_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
 ; CHECK-NEXT: br %r14
   %z = zext <4 x i8> %val1 to <4 x i32>
   ret <4 x i32> %z
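The .LCPI*_0 constants loaded with larl/vl in the updated checks are the VPERM masks produced by getGeneralPermuteNode. Because the mask vector itself replaces the eliminated zero vector as one of the vperm operands, a mask byte of value 0 selects byte 0 of the mask, which is itself 0, so no separate zero register is needed. A standalone sketch (plain C++, not part of the patch; the exact constant bytes the backend emits are an assumption here) for a zext of <2 x i16> to <2 x i64>:

#include <cstdio>

int main() {
  // VPERM: result byte I = concat(V1, V2)[Mask[I]], where indices 0-15
  // select from V1 and 16-31 from V2.  V1 is the mask constant itself; the
  // source halfwords are assumed to sit in the first four bytes of V2.
  unsigned char Mask[16] = {0, 0, 0, 0, 0, 0, 16, 17,
                            0, 0, 0, 0, 0, 0, 18, 19};
  unsigned char V2[16] = {0xAB, 0xCD, 0x12, 0x34}; // two i16 source elements

  unsigned char Result[16];
  for (unsigned I = 0; I < 16; I++)
    Result[I] = Mask[I] < 16 ? Mask[Mask[I]] : V2[Mask[I] - 16];

  for (unsigned I = 0; I < 16; I++)
    printf("%02x ", Result[I]); // 00 00 00 00 00 00 ab cd 00 00 00 00 00 00 12 34
  printf("\n");
}

One constant-pool load plus one vperm replaces a chain of two or three unpacks, which is why the multi-step vuplh* sequences above disappear from the checks.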