Index: llvm/lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -363,6 +363,9 @@
   // Element swapping load/store.  Same operands as regular load/store.
   VLER, VSTER,
 
+  // Zero all bits of vector and load logical element.
+  VLLEZ,
+
   // Prefetch from the second operand using the 4-bit control code in
   // the first operand.  The code is 1 for a load prefetch and 2 for
   // a store prefetch.
@@ -620,6 +623,7 @@
   SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue tryVLLEZ(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
                                  unsigned UnpackHigh) const;
   SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
@@ -642,6 +646,7 @@
   SDValue combineJOIN_DWORDS(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineFP_ROUND(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineFP_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineUINT_TO_FP(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineBSWAP(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineBR_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineSELECT_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -641,6 +641,7 @@
   setTargetDAGCombine(ISD::FP_ROUND);
   setTargetDAGCombine(ISD::STRICT_FP_ROUND);
   setTargetDAGCombine(ISD::FP_EXTEND);
+  setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
   setTargetDAGCombine(ISD::BSWAP);
   setTargetDAGCombine(ISD::SDIV);
@@ -5039,8 +5040,89 @@
 }
 
 SDValue
+SystemZTargetLowering::tryVLLEZ(SDValue Op, SelectionDAG &DAG) const {
+  // Replace ZERO_EXTEND_VECTOR_INREG -> VECTOR_SHUFFLE -> LOAD
+  // with
+  // VLE -> VLLEZ
+
+  auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0));
+  if (!SVN || !SVN->getOperand(1).isUndef())
+    return SDValue();
+
+  // Only generate one additional load (VLE).
+  EVT OutVT = Op.getValueType();
+  EVT InVT = SVN->getValueType(0);
+  if (OutVT != MVT::v2i64 || (InVT != MVT::v8i16 && InVT != MVT::v16i8))
+    return SDValue();
+
+  // Find the load by looking through any type conversions.
+  SDValue Src = SVN->getOperand(0);
+  bool Change = true;
+  while (Change) {
+    Change = false;
+    switch (Src->getOpcode()) {
+    case ISD::BITCAST:
+      if (!Src->getOperand(0).getValueType().isVector())
+        break;
+      LLVM_FALLTHROUGH;
+    case ISD::SCALAR_TO_VECTOR:
+      Src = Src->getOperand(0);
+      Change = true;
+      break;
+    default: break;
+    }
+  }
+  auto *Load = dyn_cast<LoadSDNode>(Src);
+  if (!Load || Load->isVolatile())
+    return SDValue();
+
+  // First do the VLLEZ, which will zero all other bits of the vector.
+  EVT NarrowEltVT = InVT.getScalarType();
+  unsigned NarrowEltBytes = NarrowEltVT.getSizeInBits() / 8;
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Load);
+  const SDValue &BaseAddr = Load->getBasePtr();
+  unsigned ByteOffset = SVN->getMaskElt(0) * NarrowEltBytes;
+  SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr,
+                                DAG.getIntPtrConstant(ByteOffset, DL));
+  SDVTList Tys = DAG.getVTList(InVT, MVT::Other);
+  SDValue Ops[] = { Load->getChain(), Address };
+  MachineMemOperand *MMO = Load->getMemOperand();
+  SDValue VLLEZ = DAG.getMemIntrinsicNode(SystemZISD::VLLEZ, DL, Tys, Ops,
+                                          NarrowEltVT, MMO);
+  // Load the other element.
+  ByteOffset = SVN->getMaskElt(1) * NarrowEltBytes;
+  Address = DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr,
+                        DAG.getIntPtrConstant(ByteOffset, DL));
+  SDValue EltLd = DAG.getLoad(NarrowEltVT, DL, VLLEZ.getValue(1), Address, MMO);
+  SDValue InsIdx = DAG.getVectorIdxConstant(InVT.getVectorNumElements() - 1, DL);
+  SDValue InsVec =
+      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, InVT, VLLEZ, EltLd, InsIdx);
+
+  // Update chains.
+  SDValue LoadCh = SDValue(Load, 1);
+  if (!LoadCh.use_empty()) {
+    SDValue TF = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                             VLLEZ.getValue(1), EltLd.getValue(1));
+    DAG.ReplaceAllUsesOfValueWith(LoadCh, TF);
+    SmallVector<SDValue, 2> Ops;
+    Ops.push_back(LoadCh);
+    Ops.push_back(EltLd.getValue(1));
+    DAG.UpdateNodeOperands(TF.getNode(), Ops);
+  }
+
+  return DAG.getNode(ISD::BITCAST, DL, OutVT, InsVec);
+}
+
+SDValue
 SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
                                               unsigned UnpackHigh) const {
+  if (UnpackHigh == SystemZISD::UNPACKL_HIGH) {
+    SDValue Res = tryVLLEZ(Op, DAG);
+    if (Res.getNode())
+      return Res;
+  }
+
   SDValue PackedOp = Op.getOperand(0);
   EVT OutVT = Op.getValueType();
   EVT InVT = PackedOp.getValueType();
@@ -5447,6 +5529,7 @@
     OPCODE(STRV);
     OPCODE(VLER);
     OPCODE(VSTER);
+    OPCODE(VLLEZ);
     OPCODE(PREFETCH);
   }
   return nullptr;
@@ -6081,6 +6164,29 @@
   return SDValue();
 }
 
+SDValue SystemZTargetLowering::combineUINT_TO_FP(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  if (DCI.Level != BeforeLegalizeTypes)
+    return SDValue();
+  EVT OutVT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Op = N->getOperand(0);
+  unsigned OutScalarBits = OutVT.getScalarSizeInBits();
+  unsigned InScalarBits = Op->getValueType(0).getScalarSizeInBits();
+
+  // Insert a zero_extend before type legalization to avoid scalarization, e.g.:
+  // v2f64 = uint_to_fp v2i16
+  // =>
+  // v2f64 = uint_to_fp (v2i64 zero_extend v2i16)
+  if (OutVT.isVector() && OutScalarBits > InScalarBits) {
+    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(OutVT.getScalarSizeInBits()),
+                                 OutVT.getVectorNumElements());
+    SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), ExtVT, Op);
+    return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), OutVT, ExtOp);
+  }
+  return SDValue();
+}
+
 SDValue SystemZTargetLowering::combineBSWAP(
     SDNode *N, DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -6408,6 +6514,7 @@
   case ISD::FP_ROUND:           return combineFP_ROUND(N, DCI);
   case ISD::STRICT_FP_EXTEND:
   case ISD::FP_EXTEND:          return combineFP_EXTEND(N, DCI);
+  case ISD::UINT_TO_FP:         return combineUINT_TO_FP(N, DCI);
   case ISD::BSWAP:              return combineBSWAP(N, DCI);
   case SystemZISD::BR_CCMASK:   return combineBR_CCMASK(N, DCI);
   case SystemZISD::SELECT_CCMASK: return combineSELECT_CCMASK(N, DCI);
Index: llvm/lib/Target/SystemZ/SystemZOperators.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZOperators.td
+++ llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -292,6 +292,8 @@
                                 [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def z_storeeswap       : SDNode<"SystemZISD::VSTER", SDTStore,
                                 [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def z_vllez            : SDNode<"SystemZISD::VLLEZ", SDTLoad,
+                                [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 
 def z_tdc              : SDNode<"SystemZISD::TDC", SDT_ZTest>;
 
@@ -811,17 +813,25 @@
 // Load a scalar and insert it into the low element of the high i64 of a
 // zeroed vector.
-class z_vllez<ValueType scalartype, SDPatternOperator load, int index>
+class vllez_insertpat<ValueType scalartype, SDPatternOperator load, int index>
   : PatFrag<(ops node:$addr),
             (z_vector_insert immAllZerosV,
                              (scalartype (load node:$addr)), (i32 index))>;
-def z_vllezi8  : z_vllez<i32, anyextloadi8, 7>;
-def z_vllezi16 : z_vllez<i32, anyextloadi16, 3>;
-def z_vllezi32 : z_vllez<i32, load, 1>;
+
+class vllez_patterns<ValueType scalartype, SDPatternOperator load, int index>
+  : PatFrags<(ops node:$addr),
+             [(z_vector_insert immAllZerosV,
+                               (scalartype (load node:$addr)), (i32 index)),
+              (z_vllez node:$addr)]>;
+
+def z_vllezi8  : vllez_patterns<i32, anyextloadi8, 7>;
+def z_vllezi16 : vllez_patterns<i32, anyextloadi16, 3>;
+def z_vllezi32 : vllez_patterns<i32, load, 1>;
 def z_vllezi64 : PatFrags<(ops node:$addr),
                           [(z_vector_insert immAllZerosV,
                                             (i64 (load node:$addr)), (i32 0)),
-                           (z_join_dwords (i64 (load node:$addr)), (i64 0))]>;
+                           (z_join_dwords (i64 (load node:$addr)), (i64 0)),
+                           (z_vllez node:$addr)]>;
 // We use high merges to form a v4f32 from four f32s.  Propagating zero
 // into all elements but index 1 gives this expression.
 def z_vllezf32 : PatFrag<(ops node:$addr),
@@ -840,7 +850,7 @@
                                       immAllZerosV)>;
 
 // Similarly for the high element of a zeroed vector.
-def z_vllezli32 : z_vllez<i32, load, 0>;
+def z_vllezli32 : vllez_insertpat<i32, load, 0>;
 def z_vllezlf32 : PatFrag<(ops node:$addr),
                           (z_merge_high
                            (v2i64
@@ -853,9 +863,9 @@
                             (bitconvert (v4f32 immAllZerosV))))>;
 
 // Byte-swapped variants.
-def z_vllebrzi16 : z_vllez<i32, z_loadbswap16, 3>;
-def z_vllebrzi32 : z_vllez<i32, z_loadbswap32, 1>;
-def z_vllebrzli32 : z_vllez<i32, z_loadbswap32, 0>;
+def z_vllebrzi16 : vllez_insertpat<i32, z_loadbswap16, 3>;
+def z_vllebrzi32 : vllez_insertpat<i32, z_loadbswap32, 1>;
+def z_vllebrzli32 : vllez_insertpat<i32, z_loadbswap32, 0>;
 def z_vllebrzi64 : PatFrags<(ops node:$addr),
                             [(z_vector_insert immAllZerosV,
                                               (i64 (z_loadbswap64 node:$addr)),
Index: llvm/test/CodeGen/SystemZ/vec-move-23.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/vec-move-23.ll
@@ -0,0 +1,140 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s -check-prefixes=CHECK,Z14
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s -check-prefixes=CHECK,Z15
+;
+; Check that uitofp conversions from a narrower type get a vector zero extend.
+;
+; Also test that shuffled and zero-extended vector loads get implemented
+; with vllez + vle in the case of a <2 x i64> result.
+
+define void @fun1(<2 x i16>* %Src, <2 x double>* %Dst) {
+; CHECK-LABEL: fun1:
+; CHECK: vlrepf %v0, 0(%r2)
+; CHECK-NEXT: vuplhh %v0, %v0
+; CHECK-NEXT: vuplhf %v0, %v0
+; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <2 x i16>, <2 x i16>* %Src
+  %c = uitofp <2 x i16> %l to <2 x double>
+  store <2 x double> %c, <2 x double>* %Dst
+  ret void
+}
+
+define void @fun2(<2 x i16>* %Src, <2 x double>* %Dst) {
+; CHECK-LABEL: fun2:
+; CHECK: vllezh %v0, 2(%r2)
+; CHECK-NEXT: vleh %v0, 0(%r2), 7
+; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <2 x i16>, <2 x i16>* %Src
+  %sh = shufflevector <2 x i16> %l, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+  %c = uitofp <2 x i16> %sh to <2 x double>
+  store <2 x double> %c, <2 x double>* %Dst
+  ret void
+}
+
+define void @fun3(<4 x i16>* %Src, <4 x double>* %Dst) {
+; CHECK-LABEL: fun3:
+; CHECK: vllezh %v0, 4(%r2)
+; CHECK-NEXT: vleh %v0, 2(%r2), 7
+; CHECK-NEXT: vllezh %v1, 6(%r2)
+; CHECK-NEXT: vleh %v1, 0(%r2), 7
+; CHECK-NEXT: vcdlgb %v1, %v1, 0, 0
+; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
+; CHECK-NEXT: vst %v0, 16(%r3), 4
+; CHECK-NEXT: vst %v1, 0(%r3), 4
+; CHECK-NEXT: br %r14
+  %l = load <4 x i16>, <4 x i16>* %Src
+  %sh = shufflevector <4 x i16> %l, <4 x i16> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+  %c = uitofp <4 x i16> %sh to <4 x double>
+  store <4 x double> %c, <4 x double>* %Dst
+  ret void
+}
+
+define void @fun4(<4 x i8>* %Src, <4 x double>* %Dst) {
+; CHECK-LABEL: fun4:
+; CHECK: vllezb %v0, 2(%r2)
+; CHECK-NEXT: vleb %v0, 1(%r2), 15
+; CHECK-NEXT: vllezb %v1, 3(%r2)
+; CHECK-NEXT: vleb %v1, 0(%r2), 15
+; CHECK-NEXT: vcdlgb %v1, %v1, 0, 0
+; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
+; CHECK-NEXT: vst %v0, 16(%r3), 4
+; CHECK-NEXT: vst %v1, 0(%r3), 4
+; CHECK-NEXT: br %r14
+  %l = load <4 x i8>, <4 x i8>* %Src
+  %sh = shufflevector <4 x i8> %l, <4 x i8> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+  %c = uitofp <4 x i8> %sh to <4 x double>
+  store <4 x double> %c, <4 x double>* %Dst
+  ret void
+}
+
+define void @fun5(<4 x i16>* %Src, <4 x float>* %Dst) {
+; CHECK-LABEL: fun5:
+; Z14: larl %r1, .LCPI4_0
+; Z14-NEXT: vlrepg %v0, 0(%r2)
+; Z14-NEXT: vl %v1, 0(%r1), 3
+; Z14-NEXT: vperm %v0, %v0, %v0, %v1
+; Z14-NEXT: vuplhh %v0, %v0
+; Z14-NEXT: vlgvf %r0, %v0, 3
+; Z14-NEXT: celfbr %f1, 0, %r0, 0
+; Z14-NEXT: vlgvf %r0, %v0, 2
+; Z14-NEXT: celfbr %f2, 0, %r0, 0
+; Z14-NEXT: vlgvf %r0, %v0, 1
+; Z14-NEXT: vmrhf %v1, %v2, %v1
+; Z14-NEXT: celfbr %f2, 0, %r0, 0
+; Z14-NEXT: vlgvf %r0, %v0, 0
+; Z14-NEXT: celfbr %f0, 0, %r0, 0
+; Z14-NEXT: vmrhf %v0, %v0, %v2
+; Z14-NEXT: vmrhg %v0, %v0, %v1
+; Z14-NEXT: vst %v0, 0(%r3), 3
+; Z14-NEXT: br %r14
+
+; Z15: larl %r1, .LCPI4_0
+; Z15-NEXT: vlrepg %v0, 0(%r2)
+; Z15-NEXT: vl %v1, 0(%r1), 3
+; Z15-NEXT: vperm %v0, %v0, %v0, %v1
+; Z15-NEXT: vuplhh %v0, %v0
+; Z15-NEXT: vcelfb %v0, %v0, 0, 0
+; Z15-NEXT: vst %v0, 0(%r3), 3
+; Z15-NEXT: br %r14
+  %l = load <4 x i16>, <4 x i16>* %Src
+  %sh = shufflevector <4 x i16> %l, <4 x i16> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+  %c = uitofp <4 x i16> %sh to <4 x float>
+  store <4 x float> %c, <4 x float>* %Dst
+  ret void
+}
+
+define void @fun6(<4 x i16>* %Src, <4 x i64>* %Dst) {
+; CHECK-LABEL: fun6:
+; CHECK: vllezh %v0, 6(%r2)
+; CHECK-NEXT: vleh %v0, 0(%r2), 7
+; CHECK-NEXT: vllezh %v1, 4(%r2)
+; CHECK-NEXT: vleh %v1, 2(%r2), 7
+; CHECK-NEXT: vst %v1, 16(%r3), 4
+; CHECK-NEXT: vst %v0, 0(%r3), 4
+; CHECK-NEXT: br %r14
+  %l = load <4 x i16>, <4 x i16>* %Src
+  %sh = shufflevector <4 x i16> %l, <4 x i16> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+  %z = zext <4 x i16> %sh to <4 x i64>
+  store <4 x i64> %z, <4 x i64>* %Dst
+  ret void
+}
+
+; Don't use vllez and multiple vle instructions here.
+define void @fun7(<4 x i16>* %Src, <4 x i32>* %Dst) {
+; CHECK-LABEL: fun7:
+; CHECK: larl %r1, .LCPI6_0
+; CHECK-NEXT: vlrepg %v0, 0(%r2)
+; CHECK-NEXT: vl %v1, 0(%r1), 3
+; CHECK-NEXT: vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT: vuplhh %v0, %v0
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <4 x i16>, <4 x i16>* %Src
+  %sh = shufflevector <4 x i16> %l, <4 x i16> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+  %z = zext <4 x i16> %sh to <4 x i32>
+  store <4 x i32> %z, <4 x i32>* %Dst
+  ret void
+}