Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -4548,6 +4548,7 @@
   void addUndef();
   bool add(SDValue, unsigned);
   SDValue getNode(SelectionDAG &, const SDLoc &);
+  SDValue tryVLLEZ(SelectionDAG &DAG);
   void tryPrepareForUnpack();
   bool unpackWasPrepared() { return UnpackFromEltSize <= 4; }
   SDValue insertUnpackIfPrepared(SelectionDAG &DAG, const SDLoc &DL, SDValue Op);
@@ -4647,6 +4648,12 @@
   if (Ops.size() == 0)
     return DAG.getUNDEF(VT);
 
+  if (Ops.size() == 2) {
+    SDValue VLLEZOp = tryVLLEZ(DAG);
+    if (VLLEZOp.getNode())
+      return VLLEZOp;
+  }
+
   // Use a single unpack if possible as the last operation.
   tryPrepareForUnpack();
 
@@ -4728,14 +4735,145 @@
 }
 
 #ifndef NDEBUG
-static void dumpBytes(const SmallVectorImpl<int> &Bytes, std::string Msg) {
-  dbgs() << Msg.c_str() << " { ";
+// For use within gdb.
+static void dumpBytesVec(const SmallVectorImpl<int> &Bytes) {
+  dbgs() << "{ ";
   for (unsigned i = 0; i < Bytes.size(); i++)
     dbgs() << Bytes[i] << " ";
   dbgs() << "}\n";
 }
+
+static void dumpBytes(const SmallVectorImpl<int> &Bytes, std::string Msg) {
+  dbgs() << Msg.c_str() << " ";
+  dumpBytesVec(Bytes);
+}
 #endif
 
+static LoadSDNode *getSingleUseByShuffleVectorLoad(SDValue N) {
+  if (!N.hasOneUse())
+    return nullptr;
+
+  // Check that N is only used by one vector shuffle.
+  SDValue UsedVal = N;
+  if (N->use_begin()->getOpcode() == ISD::BITCAST)
+    UsedVal = SDValue(*N->use_begin(), 0);
+  if (!UsedVal.hasOneUse() ||
+      UsedVal->use_begin()->getOpcode() != ISD::VECTOR_SHUFFLE)
+    return nullptr;
+
+  // Find the load by looking through any type conversions.
+  bool Change = true;
+  while (Change) {
+    Change = false;
+    switch (N->getOpcode()) {
+    case ISD::BITCAST: // XXX Needed?
+      if (!N->getOperand(0).getValueType().isVector())
+        return nullptr;
+      LLVM_FALLTHROUGH;
+    case ISD::SCALAR_TO_VECTOR:
+      if (!N.hasOneUse())
+        return nullptr;
+      N = N->getOperand(0);
+      Change = true;
+      break;
+    default: break;
+    }
+  }
+  return dyn_cast<LoadSDNode>(N);
+}
+
+SDValue GeneralShuffle::tryVLLEZ(SelectionDAG &DAG) {
+  // Try VLLEZ (+ VLE).
+  uint32_t ZeroVecOpNo = findZeroVectorIdx(&Ops[0], 2);
+  if (ZeroVecOpNo == UINT32_MAX)
+    return SDValue();
+
+  unsigned LoadOpNo = ZeroVecOpNo == 0 ? 1 : 0;
+  LoadSDNode *Load = getSingleUseByShuffleVectorLoad(Ops[LoadOpNo]);
+  if (Load == nullptr || Load->isVolatile())
+    return SDValue();
+
+  struct ElementLoad {
+    unsigned StartPos;
+    unsigned StartByte;
+    unsigned Count;
+    ElementLoad() : StartPos(0), StartByte(0), Count(0) {}
+  } Loads[2];
+  unsigned CurrLd = 0;
+  for (unsigned i = 0; i < SystemZ::VectorBytes; i++) {
+    if (Bytes[i] == -1)
+      continue;
+    unsigned OpNo = Bytes[i] / SystemZ::VectorBytes;
+    unsigned Byte = Bytes[i] % SystemZ::VectorBytes;
+    if (OpNo == ZeroVecOpNo)
+      continue;
+    if (CurrLd == 2) // Bail out on a third loaded element.
+      return SDValue();
+    ElementLoad &L = Loads[CurrLd++];
+    L.StartPos = i++;
+    L.StartByte = Byte;
+    while (i < SystemZ::VectorBytes && Bytes[i - 1] + 1 == Bytes[i] &&
+           Bytes[i] / SystemZ::VectorBytes == OpNo)
+      i++;
+    L.Count = i - L.StartPos;
+  }
+  if (Loads[0].StartPos + Loads[0].Count != SystemZ::VectorBytes / 2)
+    return SDValue();
+  unsigned VLLEZLoad = 0;
+  unsigned VLELoad = UINT32_MAX;
+  if (CurrLd == 2) {
+    if (Loads[1].StartPos + Loads[1].Count != SystemZ::VectorBytes)
+      return SDValue();
+    VLELoad = 1;
+  }
+
+  if (CurrLd != 2 ||                           // Handle exactly two loaded elements
+      Loads[0].Count != Loads[1].Count ||      // of the same size
+      Loads[0].Count > 2 ||                    // of i8 or i16
+      Loads[0].StartByte < Loads[1].StartByte) // with shuffled elements.
+    return SDValue();
+
+  LLVM_DEBUG(dbgs() << "VLLEZ match!\n");
+
+  // Load the first element and insert it into a zero vector (VLLEZ).
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Load);
+  MVT ScalarVT = MVT::getIntegerVT(Loads[VLLEZLoad].Count * 8);
+  MVT VecVT = MVT::getVectorVT(ScalarVT,
+                               SystemZ::VectorBytes / Loads[VLLEZLoad].Count);
+  const SDValue &BaseAddr = Load->getBasePtr();
+  MachineMemOperand *MMO = Load->getMemOperand();
+  SDValue Zeroes = DAG.getSplatBuildVector(VecVT, DL,
+                                           DAG.getConstant(0, DL, MVT::i32));
+  SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr,
+                     DAG.getIntPtrConstant(Loads[VLLEZLoad].StartByte, DL));
+  SDValue Ld0 = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Load->getChain(),
+                               Address, ScalarVT, MMO);
+  SDValue Ins0 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, Zeroes, Ld0,
+      DAG.getVectorIdxConstant(VecVT.getVectorNumElements() / 2 - 1, DL));
+
+  // Load the other element (VLE).
+  Address = DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr,
+                        DAG.getIntPtrConstant(Loads[VLELoad].StartByte, DL));
+  SDValue Ld1 = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Ld0.getValue(1),
+                               Address, ScalarVT, MMO);
+  SDValue Ins1 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, Ins0, Ld1,
+      DAG.getVectorIdxConstant(VecVT.getVectorNumElements() - 1, DL));
+
+  // Update chains.
+  SDValue LoadCh = SDValue(Load, 1);
+  if (!LoadCh.use_empty()) {
+    SDValue TF = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                             Ld0.getValue(1), Ld1.getValue(1));
+    DAG.ReplaceAllUsesOfValueWith(LoadCh, TF);
+    SmallVector<SDValue, 2> Ops;
+    Ops.push_back(LoadCh);
+    Ops.push_back(Ld1.getValue(1));
+    DAG.UpdateNodeOperands(TF.getNode(), Ops);
+  }
+  return Ins1;
+}
+
 // If the Bytes vector matches an unpack operation, prepare to do the unpack
 // after all else by removing the zero vector and the effect of the unpack on
 // Bytes.
Index: llvm/test/CodeGen/SystemZ/vec-move-25.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/vec-move-25.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+;
+; Test that shuffled and zero-extended vector loads get implemented
+; with vllez + vle in the case of a <2 x i64> result.
+
+define void @fun0(<2 x i8>* %Src, <2 x i64>* %Dst) {
+; CHECK-LABEL: fun0:
+; CHECK: vllezb %v0, 1(%r2)
+; CHECK-NEXT: vleb %v0, 0(%r2), 15
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <2 x i8>, <2 x i8>* %Src
+  %sh = shufflevector <2 x i8> %l, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+  %z = zext <2 x i8> %sh to <2 x i64>
+  store <2 x i64> %z, <2 x i64>* %Dst
+  ret void
+}
+
+define void @fun1(<2 x i16>* %Src, <2 x i64>* %Dst) {
+; CHECK-LABEL: fun1:
+; CHECK: vllezh %v0, 2(%r2)
+; CHECK-NEXT: vleh %v0, 0(%r2), 7
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <2 x i16>, <2 x i16>* %Src
+  %sh = shufflevector <2 x i16> %l, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+  %z = zext <2 x i16> %sh to <2 x i64>
+  store <2 x i64> %z, <2 x i64>* %Dst
+  ret void
+}
+
+; Don't use vllez and multiple vles.
+define void @fun2(<4 x i16>* %Src, <4 x i32>* %Dst) {
+; CHECK-LABEL: fun2:
+; CHECK: larl %r1, .LCPI2_0
+; CHECK-NEXT: vlrepg %v0, 0(%r2)
+; CHECK-NEXT: vl %v1, 0(%r1), 3
+; CHECK-NEXT: vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT: vuplhh %v0, %v0
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <4 x i16>, <4 x i16>* %Src
+  %sh = shufflevector <4 x i16> %l, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %z = zext <4 x i16> %sh to <4 x i32>
+  store <4 x i32> %z, <4 x i32>* %Dst
+  ret void
+}