Index: llvm/lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -620,8 +620,8 @@
   SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
-                                 unsigned UnpackHigh) const;
+  SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
   bool canTreatAsByteVector(EVT VT) const;
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -4448,6 +4448,85 @@
   return Op;
 }
 
+static bool isZeroOrUndefVector(SDValue N) {
+  if (N->getOpcode() == ISD::SPLAT_VECTOR)
+    if (auto *Op = dyn_cast<ConstantSDNode>(N->getOperand(0)))
+      return Op->getZExtValue() == 0;
+  return N.isUndef() || ISD::isBuildVectorAllZeros(N.getNode());
+}
+
+// Return the index of the zero/undef vector, or UINT32_MAX if not found.
+static uint32_t findZeroOrUndefVectorIdx(SDValue *Ops, unsigned Num) {
+  for (unsigned I = 0; I < Num; I++)
+    if (isZeroOrUndefVector(Ops[I]))
+      return I;
+  return UINT32_MAX;
+}
+
+// Keeps track of the bytes that would result after applying one or several
+// unpacks.
+struct UnpackInfo {
+  enum { ZEXT_VAL = -2 };
+  bool NeedsPermute;
+  SmallVector<int, SystemZ::VectorBytes> Mask;
+  SDValue SourceOp;
+  UnpackInfo() : NeedsPermute(false) {}
+
+  // Return true if an unpack can be used to produce the result required by
+  // Bytes. If the used source operand needs rearrangement before unpacking,
+  // NeedsPermute is set and Mask holds the required order. ZeroOpIdx is the
+  // index of the known zero vector in Ops.
+  bool tryApply(unsigned FromEltSize, const SmallVectorImpl<int> &Bytes,
+                SDValue *Ops, unsigned ZeroOpIdx) {
+    // Model an unpack of FromEltSize in UnpackedBytes, marking the bytes
+    // that become zero with ZEXT_VAL.
+    SmallVector<int, SystemZ::VectorBytes> UnpackedBytes;
+    unsigned El = 0;
+    while (El < SystemZ::VectorBytes / 2) {
+      for (unsigned i = 0; i < FromEltSize; i++)
+        UnpackedBytes.push_back(ZEXT_VAL);
+      for (unsigned i = 0; i < FromEltSize; i++)
+        UnpackedBytes.push_back(El++);
+    }
+
+    unsigned OpIdx = ZeroOpIdx == 0 ? 1 : 0;
+    Mask.assign(SystemZ::VectorBytes, -1);
+    NeedsPermute = false;
+    for (unsigned i = 0; i < SystemZ::VectorBytes; ++i) {
+      if (Bytes[i] == -1)
+        continue;
+      unsigned OpNo = Bytes[i] / SystemZ::VectorBytes;
+      unsigned Byte = Bytes[i] % SystemZ::VectorBytes;
+      if (OpNo == ZeroOpIdx && UnpackedBytes[i] == ZEXT_VAL)
+        continue;
+      if (OpNo != OpIdx || UnpackedBytes[i] == ZEXT_VAL)
+        return false;
+      Mask[UnpackedBytes[i]] = Byte;
+    }
+    for (unsigned i = 0; i < SystemZ::VectorBytes; ++i)
+      if (Mask[i] != -1 && Mask[i] != int(i)) {
+        NeedsPermute = true;
+        break;
+      }
+    SourceOp = Ops[OpIdx];
+    return true;
+  }
+};
+
+#ifndef NDEBUG
+static void dumpBytes(const SmallVectorImpl<int> &Bytes, std::string Msg) {
+  dbgs() << Msg.c_str() << " { ";
+  for (unsigned i = 0; i < Bytes.size(); i++) {
+    if (Bytes[i] == UnpackInfo::ZEXT_VAL)
+      dbgs() << "Z";
+    else
+      dbgs() << Bytes[i];
+    dbgs() << ", ";
+  }
+  dbgs() << "}\n";
+}
+#endif
+
 // Bytes is a VPERM-like permute vector, except that -1 is used for
 // undefined bytes.  Implement it on operands Ops[0] and Ops[1] using
 // VSLDB or VPERM.
@@ -4475,6 +4554,60 @@
   return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1], Op2);
 }
 
+// Detect cases where one of the source operands is a zero or undef vector
+// and try to implement the shuffle with an unpack.
+static SDValue tryShuffleWithZeroVec(SelectionDAG &DAG,
+                                     const SDLoc &DL, SDValue *Ops,
+                                     const SmallVectorImpl<int> &Bytes) {
+  // Try a single unpack, but prefer VPERM over multiple unpacks.
+  unsigned ZeroOpIdx = findZeroOrUndefVectorIdx(Ops, 2);
+  if (ZeroOpIdx == UINT32_MAX)
+    return SDValue();
+  UnpackInfo UPI;
+  unsigned FromEltSize = 1;
+  for (; FromEltSize <= 4; FromEltSize *= 2)
+    if (UPI.tryApply(FromEltSize, Bytes, Ops, ZeroOpIdx))
+      break;
+  if (FromEltSize > 4)
+    return SDValue();
+
+  if (UPI.NeedsPermute) {
+    // If the source vector is a PERMUTE that is not yet used anywhere else,
+    // modify its mask as needed.
+    if (UPI.SourceOp->getOpcode() != SystemZISD::PERMUTE ||
+        !UPI.SourceOp->use_empty())
+      return SDValue();
+    LLVM_DEBUG(dumpBytes(UPI.Mask, "Shuffling with mask before unpacking:"));
+    SDValue OrigMask = UPI.SourceOp->getOperand(2);
+    SDValue IndexNodes[SystemZ::VectorBytes];
+    for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+      int MaskEl = UPI.Mask[I];
+      if (MaskEl < 0 || OrigMask.getOperand(MaskEl).isUndef())
+        IndexNodes[I] = DAG.getUNDEF(MVT::i32);
+      else {
+        auto *COp = cast<ConstantSDNode>(OrigMask.getOperand(MaskEl));
+        IndexNodes[I] = DAG.getConstant(COp->getZExtValue(), DL, MVT::i32);
+      }
+    }
+    SDValue Op2 = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
+    UPI.SourceOp = DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8,
+                               UPI.SourceOp.getOperand(0),
+                               UPI.SourceOp.getOperand(1), Op2);
+  }
+
+  // Build the unpack.
+  LLVM_DEBUG(dbgs() << "\nUnpacking with element size " << FromEltSize
+                    << ":\n";
+             Ops[0].dump();
+             Ops[1].dump();
+             dumpBytes(Bytes, "Given this 'Bytes' mask:"));
+  unsigned FromBits = FromEltSize * 8;
+  EVT FromVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
+                                SystemZ::VectorBits / FromBits);
+  SDValue PackedOp = DAG.getNode(ISD::BITCAST, DL, FromVT, UPI.SourceOp);
+  unsigned ToBits = FromBits * 2;
+  EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ToBits),
+                               SystemZ::VectorBits / ToBits);
+  return DAG.getNode(SystemZISD::UNPACKL_HIGH, DL, OutVT, PackedOp);
+}
+
 namespace {
 // Describes a general N-operand vector shuffle.
 struct GeneralShuffle {
@@ -4579,6 +4712,29 @@
   if (Ops.size() == 1)
     Ops.push_back(DAG.getUNDEF(MVT::v16i8));
 
+  // Put a zero vector last so that unpacking can be tried as the last
+  // operation if there are more than two operands.
+  uint32_t ZeroVecOpNo = UINT32_MAX;
+  if (Ops.size() > 2) {
+    ZeroVecOpNo = findZeroOrUndefVectorIdx(&Ops[0], Ops.size());
+    // Move the zero vector to the last position without rearranging the others.
+    unsigned LastOpNo = Ops.size() - 1;
+    if (ZeroVecOpNo != UINT32_MAX && ZeroVecOpNo != LastOpNo) {
+      SDValue ZeroOp = Ops[ZeroVecOpNo];
+      Ops.erase(&Ops[ZeroVecOpNo]);
+      Ops.push_back(ZeroOp);
+      for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
+        if (Bytes[I] >= 0) {
+          unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
+          if (OpNo > ZeroVecOpNo)
+            Bytes[I] -= SystemZ::VectorBytes;
+          else if (OpNo == ZeroVecOpNo)
+            Bytes[I] += SystemZ::VectorBytes * (LastOpNo - ZeroVecOpNo);
+        }
+      ZeroVecOpNo = LastOpNo;
+    }
+  }
+
   // Create a tree of shuffles, deferring root node until after the loop.
   // Try to redistribute the undefined elements of non-root nodes so that
   // the non-root shuffles match something like a pack or merge, then adjust
@@ -4589,8 +4745,14 @@
   // In the best case this redistribution will lead to the whole tree
   // using packs and merges.  It should rarely be a loss in other cases.
   unsigned Stride = 1;
-  for (; Stride * 2 < Ops.size(); Stride *= 2) {
+  // The zero vector must be handled last. Skip it in the loop below while
+  // doing one more iteration over the remaining operands, so that only one
+  // other result is left.
+  unsigned ItersFactor = (ZeroVecOpNo == UINT32_MAX ? 2 : 1);
+  for (; Stride * ItersFactor < Ops.size(); Stride *= 2) {
     for (unsigned I = 0; I < Ops.size() - Stride; I += Stride * 2) {
+      if (I + Stride == ZeroVecOpNo)
+        continue;
       SDValue SubOps[] = { Ops[I], Ops[I + Stride] };
 
       // Create a mask for just these two operands.
@@ -4629,21 +4791,25 @@
   }
 
   // Now we just have 2 inputs.  Put the second operand in Ops[1].
-  if (Stride > 1) {
-    Ops[1] = Ops[Stride];
+  unsigned RHSOpNo = ZeroVecOpNo == UINT32_MAX ? Stride : ZeroVecOpNo;
+  if (RHSOpNo > 1) {
+    Ops[1] = Ops[RHSOpNo];
     for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
       if (Bytes[I] >= int(SystemZ::VectorBytes))
-        Bytes[I] -= (Stride - 1) * SystemZ::VectorBytes;
+        Bytes[I] -= (RHSOpNo - 1) * SystemZ::VectorBytes;
   }
-
   // Look for an instruction that can do the permute without resorting
   // to VPERM.
   unsigned OpNo0, OpNo1;
   SDValue Op;
   if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
     Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
-  else
-    Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
+  else {
+    Op = tryShuffleWithZeroVec(DAG, DL, &Ops[0], Bytes);
+    if (!Op.getNode())
+      Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
+  }
+
   return DAG.getNode(ISD::BITCAST, DL, VT, Op);
 }
 
@@ -5041,9 +5207,8 @@
   return DAG.getNode(ISD::BITCAST, DL, VT, Res);
 }
 
-SDValue
-SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
-                                              unsigned UnpackHigh) const {
+SDValue SystemZTargetLowering::
+lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
   SDValue PackedOp = Op.getOperand(0);
   EVT OutVT = Op.getValueType();
   EVT InVT = PackedOp.getValueType();
@@ -5053,11 +5218,39 @@
     FromBits *= 2;
     EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
                                  SystemZ::VectorBits / FromBits);
-    PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp);
+    PackedOp =
+      DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(PackedOp), OutVT, PackedOp);
   } while (FromBits != ToBits);
   return PackedOp;
 }
 
+// Lower a ZERO_EXTEND_VECTOR_INREG to a vector shuffle with a zero vector.
+SDValue SystemZTargetLowering::
+lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
+  SDValue PackedOp = Op.getOperand(0);
+  SDLoc DL(Op);
+  EVT OutVT = Op.getValueType();
+  EVT InVT = PackedOp.getValueType();
+  unsigned InNumElts = InVT.getVectorNumElements();
+  unsigned OutNumElts = OutVT.getVectorNumElements();
+  unsigned NumInPerOut = InNumElts / OutNumElts;
+
+  SDValue ZeroVec =
+    DAG.getSplatVector(InVT, DL, DAG.getConstant(0, DL, InVT.getScalarType()));
+
+  SmallVector<int, 16> Mask(InNumElts);
+  unsigned ZeroVecElt = InNumElts;
+  for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) {
+    unsigned MaskElt = PackedElt * NumInPerOut;
+    unsigned End = MaskElt + NumInPerOut - 1;
+    for (; MaskElt < End; MaskElt++)
+      Mask[MaskElt] = ZeroVecElt++;
+    Mask[MaskElt] = PackedElt;
+  }
+  SDValue Shuf = DAG.getVectorShuffle(InVT, DL, PackedOp, ZeroVec, Mask);
+  return DAG.getNode(ISD::BITCAST, DL, OutVT, Shuf);
+}
+
 SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
                                           unsigned ByScalar) const {
   // Look for cases where a vector shift can use the *_BY_SCALAR form.
@@ -5223,9 +5416,9 @@
   case ISD::EXTRACT_VECTOR_ELT:
     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::SIGN_EXTEND_VECTOR_INREG:
-    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
+    return lowerSIGN_EXTEND_VECTOR_INREG(Op, DAG);
   case ISD::ZERO_EXTEND_VECTOR_INREG:
-    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
+    return lowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
   case ISD::SHL:
     return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
   case ISD::SRL:
Index: llvm/test/CodeGen/SystemZ/vec-move-16.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/vec-move-16.ll
+++ llvm/test/CodeGen/SystemZ/vec-move-16.ll
@@ -40,9 +40,11 @@
 ; Test a v4i8->v4i32 extension.
 define <4 x i32> @f5(<4 x i8> *%ptr) {
 ; CHECK-LABEL: f5:
+; CHECK: larl %r1, .LCPI4_0
 ; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
-; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
-; CHECK: vuplhh %v24, [[REG2]]
+; CHECK: vl %v1, 0(%r1), 3
+; CHECK: vgbm %v2, 0
+; CHECK: vperm %v24, %v2, [[REG1]], %v1
 ; CHECK: br %r14
   %val = load <4 x i8>, <4 x i8> *%ptr
   %ret = zext <4 x i8> %val to <4 x i32>
@@ -71,10 +73,11 @@
 ; Test a v2i8->v2i64 extension.
 define <2 x i64> @f8(<2 x i8> *%ptr) {
 ; CHECK-LABEL: f8:
-; CHECK: vlreph [[REG1:%v[0-9]+]], 0(%r2)
-; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
-; CHECK: vuplhh [[REG3:%v[0-9]+]], [[REG2]]
-; CHECK: vuplhf %v24, [[REG3]]
+; CHECK: larl %r1, .LCPI7_0
+; CHECK: vlreph [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vl %v1, 0(%r1), 3
+; CHECK: vgbm %v2, 0
+; CHECK: vperm %v24, %v2, [[REG1]], %v1
 ; CHECK: br %r14
   %val = load <2 x i8>, <2 x i8> *%ptr
   %ret = zext <2 x i8> %val to <2 x i64>
@@ -84,9 +87,11 @@
 ; Test a v2i16->v2i64 extension.
 define <2 x i64> @f9(<2 x i16> *%ptr) {
 ; CHECK-LABEL: f9:
-; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
-; CHECK: vuplhh [[REG2:%v[0-9]+]], [[REG1]]
-; CHECK: vuplhf %v24, [[REG2]]
+; CHECK: larl %r1, .LCPI8_0
+; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vl %v1, 0(%r1), 3
+; CHECK: vgbm %v2, 0
+; CHECK: vperm %v24, %v2, [[REG1]], %v1
 ; CHECK: br %r14
   %val = load <2 x i16>, <2 x i16> *%ptr
   %ret = zext <2 x i16> %val to <2 x i64>
Index: llvm/test/CodeGen/SystemZ/vec-move-23.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/vec-move-23.ll
+++ llvm/test/CodeGen/SystemZ/vec-move-23.ll
@@ -68,9 +68,10 @@
 
 define void @fun4(<2 x i8> %Src, <2 x double>* %Dst) {
 ; CHECK-LABEL: fun4:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v0, %v0
-; CHECK-NEXT: vuplhf %v0, %v0
+; CHECK: larl %r1, .LCPI4_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v0, %v1, %v24, %v0
 ; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
 ; CHECK-NEXT: vst %v0, 0(%r2), 3
 ; CHECK-NEXT: br %r14
@@ -81,8 +82,10 @@
 
 define void @fun5(<2 x i16> %Src, <2 x double>* %Dst) {
 ; CHECK-LABEL: fun5:
-; CHECK: vuplhh %v0, %v24
-; CHECK-NEXT: vuplhf %v0, %v0
+; CHECK: larl %r1, .LCPI5_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v0, %v1, %v24, %v0
 ; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
 ; CHECK-NEXT: vst %v0, 0(%r2), 3
 ; CHECK-NEXT: br %r14
Index: llvm/test/CodeGen/SystemZ/vec-move-24.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/vec-move-24.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+;
+; Test that vperm is not used if a single unpack is enough.
+
+define <4 x i32> @fun0(<4 x i32>* %Src) nounwind {
+; CHECK-LABEL: fun0:
+; CHECK-NOT: vperm
+  %tmp = load <4 x i32>, <4 x i32>* %Src
+  %tmp2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp,
+                        <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x i32> %tmp2
+}
+
+define void @fun1(i8 %Src, <32 x i8>* %Dst) nounwind {
+; CHECK-LABEL: fun1:
+; CHECK-NOT: vperm
+  %I0 = insertelement <16 x i8> undef, i8 %Src, i32 0
+  %I1 = insertelement <16 x i8> %I0, i8 %Src, i32 1
+  %I2 = insertelement <16 x i8> %I1, i8 %Src, i32 2
+  %I3 = insertelement <16 x i8> %I2, i8 %Src, i32 3
+  %I4 = insertelement <16 x i8> %I3, i8 %Src, i32 4
+  %I5 = insertelement <16 x i8> %I4, i8 %Src, i32 5
+  %I6 = insertelement <16 x i8> %I5, i8 %Src, i32 6
+  %I7 = insertelement <16 x i8> %I6, i8 %Src, i32 7
+  %I8 = insertelement <16 x i8> %I7, i8 %Src, i32 8
+  %I9 = insertelement <16 x i8> %I8, i8 %Src, i32 9
+  %I10 = insertelement <16 x i8> %I9, i8 %Src, i32 10
+  %I11 = insertelement <16 x i8> %I10, i8 %Src, i32 11
+  %I12 = insertelement <16 x i8> %I11, i8 %Src, i32 12
+  %I13 = insertelement <16 x i8> %I12, i8 %Src, i32 13
+  %I14 = insertelement <16 x i8> %I13, i8 %Src, i32 14
+  %I15 = insertelement <16 x i8> %I14, i8 %Src, i32 15
+
+  %tmp = shufflevector <16 x i8> zeroinitializer,
+                       <16 x i8> %I15,
+                       <32 x i32>
+  %tmp9 = shufflevector <32 x i8> undef,
+                        <32 x i8> %tmp,
+                        <32 x i32>
+
+  store <32 x i8> %tmp9, <32 x i8>* %Dst
+  ret void
+}
+
Index: llvm/test/CodeGen/SystemZ/vec-zext.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/vec-zext.ll
+++ llvm/test/CodeGen/SystemZ/vec-zext.ll
@@ -1,5 +1,5 @@
-; Test that vector zexts are done efficently with unpack instructions also in
-; case of fewer elements than allowed, e.g. <2 x i32>.
+; Test that vector zexts are done efficiently also in the case of fewer
+; elements than allowed, e.g. <2 x i32>.
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
 
@@ -14,8 +14,10 @@
 
 define <2 x i32> @fun2(<2 x i8> %val1) {
 ; CHECK-LABEL: fun2:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v24, %v0
+; CHECK: larl %r1, .LCPI1_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v24, %v1, %v24, %v0
 ; CHECK-NEXT: br %r14
   %z = zext <2 x i8> %val1 to <2 x i32>
   ret <2 x i32> %z
@@ -23,9 +25,10 @@
 
 define <2 x i64> @fun3(<2 x i8> %val1) {
 ; CHECK-LABEL: fun3:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v0, %v0
-; CHECK-NEXT: vuplhf %v24, %v0
+; CHECK: larl %r1, .LCPI2_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v24, %v1, %v24, %v0
 ; CHECK-NEXT: br %r14
   %z = zext <2 x i8> %val1 to <2 x i64>
   ret <2 x i64> %z
@@ -41,8 +44,10 @@
 
 define <2 x i64> @fun5(<2 x i16> %val1) {
 ; CHECK-LABEL: fun5:
-; CHECK: vuplhh %v0, %v24
-; CHECK-NEXT: vuplhf %v24, %v0
+; CHECK: larl %r1, .LCPI4_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v24, %v1, %v24, %v0
 ; CHECK-NEXT: br %r14
   %z = zext <2 x i16> %val1 to <2 x i64>
   ret <2 x i64> %z
@@ -66,8 +71,10 @@
 
 define <4 x i32> @fun8(<4 x i8> %val1) {
 ; CHECK-LABEL: fun8:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v24, %v0
+; CHECK: larl %r1, .LCPI7_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v24, %v1, %v24, %v0
 ; CHECK-NEXT: br %r14
   %z = zext <4 x i8> %val1 to <4 x i32>
   ret <4 x i32> %z
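A note for review, not part of the patch itself: the mask loop in lowerZERO_EXTEND_VECTOR_INREG is easy to sanity-check in isolation. The standalone C++ sketch below mirrors that loop for one concrete case (v16i8 shuffled into a v4i32 zero extension; the example shapes and the use of plain std::vector are choices made here for illustration, and it assumes SystemZ's big-endian lane order). Mask indices greater than or equal to InNumElts select elements of the zero vector:

#include <cstdio>
#include <vector>

int main() {
  // v16i8 input shuffled into a v4i32 zero extension of its first 4 bytes.
  unsigned InNumElts = 16, OutNumElts = 4;
  unsigned NumInPerOut = InNumElts / OutNumElts;

  std::vector<int> Mask(InNumElts);
  unsigned ZeroVecElt = InNumElts;   // First element of the zero vector.
  for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) {
    unsigned MaskElt = PackedElt * NumInPerOut;
    unsigned End = MaskElt + NumInPerOut - 1;
    for (; MaskElt < End; MaskElt++)
      Mask[MaskElt] = ZeroVecElt++;  // Leading zero bytes (big-endian).
    Mask[MaskElt] = PackedElt;       // The packed element goes last.
  }
  for (int M : Mask)
    printf("%d ", M);
  printf("\n"); // 16 17 18 0 19 20 21 1 22 23 24 2 25 26 27 3
  return 0;
}

Each i32 lane ends up as three bytes taken from the zero vector followed by one source byte, which is the layout that the vperm-with-vgbm sequences in the updated CHECK lines realize.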
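Likewise illustrative only: the UnpackedBytes pattern that UnpackInfo::tryApply matches against can be printed with the sketch below, which reproduces the while-loop from tryApply under the assumption of a 16-byte vector (the local VectorBytes constant stands in for SystemZ::VectorBytes, and ZEXT_VAL bytes print as 'Z'):

#include <cstdio>

static const unsigned VectorBytes = 16; // Mirrors SystemZ::VectorBytes.

// Print the byte layout produced by an unpack-logical-high of the given
// source element size: each element of the source's high half is preceded
// by FromEltSize zero bytes.
static void printUnpackedBytes(unsigned FromEltSize) {
  int UnpackedBytes[VectorBytes];
  unsigned Idx = 0, El = 0;
  while (El < VectorBytes / 2) {
    for (unsigned i = 0; i < FromEltSize; i++)
      UnpackedBytes[Idx++] = -2; // ZEXT_VAL: a byte known to be zero.
    for (unsigned i = 0; i < FromEltSize; i++)
      UnpackedBytes[Idx++] = El++; // A source byte.
  }
  printf("eltsize %u: { ", FromEltSize);
  for (unsigned i = 0; i < VectorBytes; i++) {
    if (UnpackedBytes[i] == -2)
      printf("Z ");
    else
      printf("%d ", UnpackedBytes[i]);
  }
  printf("}\n");
}

int main() {
  // The three element sizes tried by tryShuffleWithZeroVec, corresponding
  // to vuplhb, vuplhh and vuplhf.
  for (unsigned Size = 1; Size <= 4; Size *= 2)
    printUnpackedBytes(Size);
  return 0;
}

Running it shows why a Bytes mask that interleaves zero-vector bytes with consecutive source bytes can be matched by a single UNPACKL_HIGH instead of a vperm.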