Index: llvm/lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -620,6 +620,7 @@
   SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue tryVLLEZ(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
                                  unsigned UnpackHigh) const;
   SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -5041,8 +5041,90 @@
 }
 
 SDValue
+SystemZTargetLowering::tryVLLEZ(SDValue Op, SelectionDAG &DAG) const {
+  // Replace ZERO_EXTEND_VECTOR_INREG -> VECTOR_SHUFFLE -> LOAD
+  // with
+  // VLE -> VLLEZ
+
+  auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0));
+  if (!SVN || !SVN->getOperand(1).isUndef())
+    return SDValue();
+
+  // Only generate one additional load (VLE).
+  EVT OutVT = Op.getValueType();
+  EVT InVT = SVN->getValueType(0);
+  if (OutVT != MVT::v2i64 || (InVT != MVT::v8i16 && InVT != MVT::v16i8))
+    return SDValue();
+
+  // Find the load by looking through any type conversions.
+  SDValue Src = SVN->getOperand(0);
+  bool Change = true;
+  while (Change) {
+    Change = false;
+    switch (Src->getOpcode()) {
+    case ISD::BITCAST:
+      if (!Src->getOperand(0).getValueType().isVector())
+        break;
+      LLVM_FALLTHROUGH;
+    case ISD::SCALAR_TO_VECTOR:
+      Src = Src->getOperand(0);
+      Change = true;
+      break;
+    default: break;
+    }
+  }
+  auto *Load = dyn_cast<LoadSDNode>(Src);
+  if (!Load || Load->isVolatile())
+    return SDValue();
+
+  // Load the first element and insert it into a zero vector (-> VLLEZ).
+  EVT NarrowEltVT = InVT.getScalarType();
+  unsigned NarrowEltBytes = NarrowEltVT.getSizeInBits() / 8;
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Load);
+  const SDValue &BaseAddr = Load->getBasePtr();
+  unsigned ByteOffset = SVN->getMaskElt(0) * NarrowEltBytes;
+  SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr,
+                                DAG.getIntPtrConstant(ByteOffset, DL));
+  MachineMemOperand *MMO = Load->getMemOperand();
+  SDValue Ld0 = DAG.getLoad(NarrowEltVT, DL, Load->getChain(), Address, MMO);
+  SDValue Zeroes = DAG.getSplatBuildVector(InVT, DL,
+                                           DAG.getConstant(0, DL, MVT::i32));
+  SDValue Ins0 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, InVT, Zeroes, Ld0,
+      DAG.getVectorIdxConstant(InVT.getVectorNumElements() / 2 - 1, DL));
+
+  // Load the other element (-> VLE).
+  ByteOffset = SVN->getMaskElt(1) * NarrowEltBytes;
+  Address = DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr,
+                        DAG.getIntPtrConstant(ByteOffset, DL));
+  SDValue Ld1 = DAG.getLoad(NarrowEltVT, DL, Ld0.getValue(1), Address, MMO);
+  SDValue Ins1 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, InVT, Ins0, Ld1,
+      DAG.getVectorIdxConstant(InVT.getVectorNumElements() - 1, DL));
+
+  // Update chains.
+  SDValue LoadCh = SDValue(Load, 1);
+  if (!LoadCh.use_empty()) {
+    SDValue TF = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                             Ld0.getValue(1), Ld1.getValue(1));
+    DAG.ReplaceAllUsesOfValueWith(LoadCh, TF);
+    SmallVector<SDValue, 2> Ops;
+    Ops.push_back(LoadCh);
+    Ops.push_back(Ld1.getValue(1));
+    DAG.UpdateNodeOperands(TF.getNode(), Ops);
+  }
+
+  return DAG.getNode(ISD::BITCAST, DL, OutVT, Ins1);
+}
+
+SDValue
 SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
                                               unsigned UnpackHigh) const {
+  if (UnpackHigh == SystemZISD::UNPACKL_HIGH) {
+    SDValue Res = tryVLLEZ(Op, DAG);
+    if (Res.getNode())
+      return Res;
+  }
+
   SDValue PackedOp = Op.getOperand(0);
   EVT OutVT = Op.getValueType();
   EVT InVT = PackedOp.getValueType();
Index: llvm/test/CodeGen/SystemZ/vec-move-24.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/vec-move-24.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+;
+; Test that shuffled and zero extended vector loads gets implemented
+; with vllez + vle in the case of a <2 x i64> result.
+
+define void @fun0(<2 x i8>* %Src, <2 x i64>* %Dst) {
+; CHECK-LABEL: fun0:
+; CHECK: vllezb %v0, 1(%r2)
+; CHECK-NEXT: vleb %v0, 0(%r2), 15
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <2 x i8>, <2 x i8>* %Src
+  %sh = shufflevector <2 x i8> %l, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+  %z = zext <2 x i8> %sh to <2 x i64>
+  store <2 x i64> %z, <2 x i64>* %Dst
+  ret void
+}
+
+define void @fun1(<2 x i16>* %Src, <2 x i64>* %Dst) {
+; CHECK-LABEL: fun1:
+; CHECK: vllezh %v0, 2(%r2)
+; CHECK-NEXT: vleh %v0, 0(%r2), 7
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <2 x i16>, <2 x i16>* %Src
+  %sh = shufflevector <2 x i16> %l, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+  %z = zext <2 x i16> %sh to <2 x i64>
+  store <2 x i64> %z, <2 x i64>* %Dst
+  ret void
+}
+
+; Don't use vllez and multiple vle:s.
+define void @fun2(<4 x i16>* %Src, <4 x i32>* %Dst) {
+; CHECK-LABEL: fun2:
+; CHECK: larl %r1, .LCPI2_0
+; CHECK-NEXT: vlrepg %v0, 0(%r2)
+; CHECK-NEXT: vl %v1, 0(%r1), 3
+; CHECK-NEXT: vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT: vuplhh %v0, %v0
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <4 x i16>, <4 x i16>* %Src
+  %sh = shufflevector <4 x i16> %l, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %z = zext <4 x i16> %sh to <4 x i32>
+  store <4 x i32> %z, <4 x i32>* %Dst
+  ret void
+}