Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -4461,6 +4461,26 @@
   return false;
 }
 
+// Find the defining load of N by looking through any type conversions.
+static LoadSDNode *getVectorLoad(SDValue N) {
+  bool Change = true;
+  while (Change) {
+    Change = false;
+    switch (N->getOpcode()) {
+    case ISD::BITCAST: // Only look through bitcasts from a vector type.
+      if (!N->getOperand(0).getValueType().isVector())
+        break;
+      LLVM_FALLTHROUGH;
+    case ISD::SCALAR_TO_VECTOR:
+      N = N->getOperand(0);
+      Change = true;
+      break;
+    default: break;
+    }
+  }
+  return dyn_cast<LoadSDNode>(N);
+}
+
 // Keeps track of the bytes that would result after applying one or several
 // unpacks.
 struct UnpackInfo {
@@ -4582,6 +4602,96 @@
   if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1))
     return SDValue();
 
+  // Try VLLEZ (+ VLE) (patch "in progress")
+  bool NeedsZero = false;
+  unsigned LoadOpNo = UINT32_MAX;
+  // One contiguous run of loaded source bytes in the shuffle result.
+  struct ElementLoad {
+    unsigned StartPos;
+    unsigned StartByte;
+    unsigned Count;
+    ElementLoad() : StartPos(0), StartByte(0), Count(0) {}
+  } Loads[2];
+  unsigned CurrLd = 0;
+  for (unsigned i = 0; i < SystemZ::VectorBytes; i++) {
+    if (Bytes[i] == -1)
+      continue;
+    unsigned OpNo = Bytes[i] / SystemZ::VectorBytes;
+    unsigned Byte = Bytes[i] % SystemZ::VectorBytes;
+    if (isByteZero(Ops[OpNo], Byte)) {
+      NeedsZero = true;
+      continue;
+    }
+    if (LoadOpNo == UINT32_MAX)
+      LoadOpNo = OpNo;
+    else if (OpNo != LoadOpNo || CurrLd == 2) {
+      // More than one source operand, or more than two element runs.
+      LoadOpNo = UINT32_MAX;
+      break;
+    }
+    ElementLoad &L = Loads[CurrLd++];
+    L.StartPos = i++;
+    L.StartByte = Byte;
+    while (i < SystemZ::VectorBytes && Bytes[i - 1] + 1 == Bytes[i] &&
+           Bytes[i] / SystemZ::VectorBytes == OpNo)
+      i++;
+    L.Count = i - L.StartPos;
+  }
+  LoadSDNode *Load = LoadOpNo != UINT32_MAX ?
+    getVectorLoad(Ops[LoadOpNo]) : nullptr;
+  unsigned VLLEZLoad = UINT32_MAX;
+  unsigned VLELoad = UINT32_MAX;
+  for (unsigned i = 0; i < CurrLd; i++)
+    if (Loads[i].StartPos + Loads[i].Count - 1 == 7) {
+      VLLEZLoad = i; // Run ends at byte 7: the position VLLEZ fills.
+      VLELoad = i == 0 ? 1 : 0;
+      break;
+    }
+  if (Load != nullptr && !Load->isVolatile() && NeedsZero &&
+      VLLEZLoad != UINT32_MAX &&
+      CurrLd == 2 &&
+      Loads[0].Count == Loads[1].Count &&
+      Loads[0].Count <= 2 &&
+      Loads[VLELoad].StartPos + Loads[VLELoad].Count - 1 == 15) {
+    LLVM_DEBUG(dbgs() << "Lowering shuffle with VLLEZ + VLE\n");
+
+    // Load the first element and insert it into a zero vector (-> VLLEZ).
+    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+    SDLoc DL(Load);
+    MVT ScalarVT = MVT::getIntegerVT(Loads[VLLEZLoad].Count * 8);
+    MVT VecVT = MVT::getVectorVT(ScalarVT,
+                                 SystemZ::VectorBytes / Loads[VLLEZLoad].Count);
+    const SDValue &BaseAddr = Load->getBasePtr();
+    MachineMemOperand *MMO = Load->getMemOperand(); // FIXME: narrow MMO size/offset?
+    SDValue Zeroes = DAG.getSplatBuildVector(VecVT, DL,
+                                             DAG.getConstant(0, DL, MVT::i32));
+    SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr,
+                  DAG.getIntPtrConstant(Loads[VLLEZLoad].StartByte, DL));
+    SDValue Ld0 = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Load->getChain(),
+                                 Address, ScalarVT, MMO);
+    SDValue Ins0 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, Zeroes, Ld0,
+           DAG.getVectorIdxConstant(VecVT.getVectorNumElements() / 2 - 1, DL));
+
+    // Load the other element (-> VLE).
+    Address = DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr,
+                  DAG.getIntPtrConstant(Loads[VLELoad].StartByte, DL));
+    SDValue Ld1 = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Ld0.getValue(1),
+                                 Address, ScalarVT, MMO);
+    SDValue Ins1 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, Ins0, Ld1,
+               DAG.getVectorIdxConstant(VecVT.getVectorNumElements() - 1, DL));
+
+    // Update chains.
+    SDValue LoadCh = SDValue(Load, 1);
+    if (!LoadCh.use_empty()) {
+      // Build the TokenFactor with placeholder operands first; wiring in
+      // LoadCh only after the RAUW avoids replacing it inside TF itself.
+      SDValue TF = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                               Ld0.getValue(1), Ld1.getValue(1));
+      DAG.ReplaceAllUsesOfValueWith(LoadCh, TF);
+      SmallVector<SDValue, 2> Ops;
+      Ops.push_back(LoadCh);
+      Ops.push_back(Ld1.getValue(1));
+      DAG.UpdateNodeOperands(TF.getNode(), Ops);
+    }
+    return Ins1;
+  }
+
   // Try unpack(s)
   UnpackInfo Best;
   for (unsigned Start = 1; Start <= 4; Start *= 2) {
Index: llvm/test/CodeGen/SystemZ/vec-move-25.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/vec-move-25.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+;
+; Test that shuffled and zero extended vector loads gets implemented
+; with vllez + vle in the case of a <2 x i64> result.
+
+define void @fun0(<2 x i8>* %Src, <2 x i64>* %Dst) {
+; CHECK-LABEL: fun0:
+; CHECK: vllezb %v0, 1(%r2)
+; CHECK-NEXT: vleb %v0, 0(%r2), 15
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <2 x i8>, <2 x i8>* %Src
+  %sh = shufflevector <2 x i8> %l, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+  %z = zext <2 x i8> %sh to <2 x i64>
+  store <2 x i64> %z, <2 x i64>* %Dst
+  ret void
+}
+
+define void @fun1(<2 x i16>* %Src, <2 x i64>* %Dst) {
+; CHECK-LABEL: fun1:
+; CHECK: vllezh %v0, 2(%r2)
+; CHECK-NEXT: vleh %v0, 0(%r2), 7
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <2 x i16>, <2 x i16>* %Src
+; %sh = shufflevector <2 x i16> %l, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+  %z = zext <2 x i16> %l to <2 x i64>
+  store <2 x i64> %z, <2 x i64>* %Dst
+  ret void
+}
+
+; Don't use vllez and multiple vle:s.
+define void @fun2(<4 x i16>* %Src, <4 x i32>* %Dst) {
+; CHECK-LABEL: fun2:
+; CHECK: larl %r1, .LCPI2_0
+; CHECK-NEXT: vlrepg %v0, 0(%r2)
+; CHECK-NEXT: vl %v1, 0(%r1), 3
+; CHECK-NEXT: vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT: vuplhh %v0, %v0
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+  %l = load <4 x i16>, <4 x i16>* %Src
+  %sh = shufflevector <4 x i16> %l, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %z = zext <4 x i16> %sh to <4 x i32>
+  store <4 x i32> %z, <4 x i32>* %Dst
+  ret void
+}