Build the reconstructed BUILD_VECTOR shuffle in ShuffleVT, a vector type that
uses the smallest element type found among the result and the source vectors,
and bitcast the shuffle back to VT. Adds a zip1 regression test that returns
<4 x i8> from an <8 x i8> source.

NOTE(review): this text was recovered from a copy with collapsed line breaks
and missing angle-bracket tokens. Line breaks, indentation, the template
arguments (SmallVector<int, 8>, cast<ConstantSDNode>) and the shufflevector
mask were reconstructed; confirm against the original review before applying.

NOTE(review): the second-source mask entry still adds NumElts; that looks like
the lane count of VT rather than of ShuffleVT. Confirm it is right when
ResMultiplier > 1.

Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4180,9 +4180,23 @@
   if (SourceVecs.size() > 2)
     return SDValue();
 
-  SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
   int VEXTOffsets[2] = { 0, 0 };
   int OffsetMultipliers[2] = { 1, 1 };
+  int ResMultiplier = 1;
+  // Find out the smallest element size among result and two sources, and use
+  // it as element size to build the shuffle_vector.
+  EVT SmallestEltTy = VT.getVectorElementType();
+  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
+    EVT SrcEltTy = SourceVecs[i].getValueType().getVectorElementType();
+    if (SrcEltTy.bitsLT(SmallestEltTy)) {
+      ResMultiplier *= SmallestEltTy.getSizeInBits() / SrcEltTy.getSizeInBits();
+      SmallestEltTy = SrcEltTy;
+    }
+  }
+  EVT ShuffleVT =
+      EVT::getVectorVT(*DAG.getContext(), SmallestEltTy,
+                       VT.getSizeInBits() / SmallestEltTy.getSizeInBits());
+  SDValue ShuffleSrcs[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
 
   // This loop extracts the usage patterns of the source vectors
   // and prepares appropriate SDValues for a shuffle if possible.
@@ -4190,15 +4204,15 @@
     unsigned NumSrcElts = SourceVecs[i].getValueType().getVectorNumElements();
     SDValue CurSource = SourceVecs[i];
     if (SourceVecs[i].getValueType().getVectorElementType() !=
-        VT.getVectorElementType()) {
-      // It may hit this case if SourceVecs[i] is AssertSext/AssertZext.
-      // Then bitcast it to the vector which holds asserted element type,
-      // and record the multiplier of element width between SourceVecs and
-      // Build_vector which is needed to extract the correct lanes later.
-      EVT CastVT =
-          EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
-                           SourceVecs[i].getValueSizeInBits() /
-                               VT.getVectorElementType().getSizeInBits());
+        ShuffleVT.getVectorElementType()) {
+      // As ShuffleVT holds smallest element size, it may hit here only if
+      // the element type of SourceVecs is bigger than that of ShuffleVT.
+      // Adjust the element size of SourceVecs to match ShuffleVT, and record
+      // the multipliers.
+      EVT CastVT = EVT::getVectorVT(
+          *DAG.getContext(), ShuffleVT.getVectorElementType(),
+          SourceVecs[i].getValueSizeInBits() /
+              ShuffleVT.getVectorElementType().getSizeInBits());
       CurSource = DAG.getNode(ISD::BITCAST, dl, CastVT, SourceVecs[i]);
       OffsetMultipliers[i] = CastVT.getVectorNumElements() / NumSrcElts;
 
@@ -4207,7 +4221,7 @@
       MinElts[i] *= OffsetMultipliers[i];
     }
 
-    if (CurSource.getValueType() == VT) {
+    if (CurSource.getValueType() == ShuffleVT) {
       // No VEXT necessary
       ShuffleSrcs[i] = CurSource;
       VEXTOffsets[i] = 0;
@@ -4215,8 +4229,9 @@
     } else if (NumSrcElts < NumElts) {
       // We can pad out the smaller vector for free, so if it's part of a
       // shuffle...
-      ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, CurSource,
-                                   DAG.getUNDEF(CurSource.getValueType()));
+      ShuffleSrcs[i] =
+          DAG.getNode(ISD::CONCAT_VECTORS, dl, ShuffleVT, CurSource,
+                      DAG.getUNDEF(CurSource.getValueType()));
       continue;
     }
 
@@ -4233,50 +4248,56 @@
     if (MinElts[i] >= NumElts) {
       // The extraction can just take the second half
       VEXTOffsets[i] = NumElts;
-      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
-                                   DAG.getIntPtrConstant(NumElts));
+      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ShuffleVT,
+                                   CurSource, DAG.getIntPtrConstant(NumElts));
     } else if (MaxElts[i] < NumElts) {
       // The extraction can just take the first half
       VEXTOffsets[i] = 0;
-      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
-                                   DAG.getIntPtrConstant(0));
+      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ShuffleVT,
+                                   CurSource, DAG.getIntPtrConstant(0));
     } else {
       // An actual VEXT is needed
       VEXTOffsets[i] = MinElts[i];
-      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
-                                     DAG.getIntPtrConstant(0));
-      SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
-                                     DAG.getIntPtrConstant(NumElts));
+      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ShuffleVT,
+                                     CurSource, DAG.getIntPtrConstant(0));
+      SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ShuffleVT,
+                                     CurSource, DAG.getIntPtrConstant(NumElts));
       unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1);
-      ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2,
-                                   DAG.getConstant(Imm, MVT::i32));
+      ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, ShuffleVT, VEXTSrc1,
+                                   VEXTSrc2, DAG.getConstant(Imm, MVT::i32));
     }
   }
 
   SmallVector<int, 8> Mask;
 
-  for (unsigned i = 0; i < NumElts; ++i) {
-    SDValue Entry = Op.getOperand(i);
-    if (Entry.getOpcode() == ISD::UNDEF) {
+  for (unsigned i = 0; i < ShuffleVT.getVectorNumElements(); ++i) {
+    if (i % ResMultiplier) {
       Mask.push_back(-1);
       continue;
     }
-
-    SDValue ExtractVec = Entry.getOperand(0);
-    int ExtractElt =
-        cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
-    if (ExtractVec == SourceVecs[0]) {
-      Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]);
+    unsigned OpNum = i / ResMultiplier;
+    SDValue Entry = Op.getOperand(OpNum);
+    if (Entry.getOpcode() == ISD::UNDEF) {
+      Mask.push_back(-1);
     } else {
-      Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts -
-                     VEXTOffsets[1]);
+      SDValue ExtractVec = Entry.getOperand(0);
+      int ExtractElt = cast<ConstantSDNode>(Op.getOperand(OpNum).getOperand(1))
+                           ->getSExtValue();
+      if (ExtractVec == SourceVecs[0]) {
+        Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]);
+      } else {
+        Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts -
+                       VEXTOffsets[1]);
+      }
     }
   }
 
   // Final check before we try to produce nonsense...
-  if (isShuffleMaskLegal(Mask, VT))
-    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
-                                &Mask[0]);
+  if (isShuffleMaskLegal(Mask, ShuffleVT)) {
+    SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleSrcs[0],
+                                           ShuffleSrcs[1], &Mask[0]);
+    return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+  }
 
   return SDValue();
 }
Index: test/CodeGen/AArch64/neon-perm.ll
===================================================================
--- test/CodeGen/AArch64/neon-perm.ll
+++ test/CodeGen/AArch64/neon-perm.ll
@@ -1387,6 +1387,13 @@
   ret <8 x i16> %shuffle.i
 }
 
+define <4 x i8> @test_vzip1_v4i8(<8 x i8> %p) {
+; CHECK-LABEL: test_vzip1_v4i8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %lo = shufflevector <8 x i8> %p, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i8> %lo
+}
+
 define <8 x i8> @test_same_vzip2_s8(<8 x i8> %a) {
 ; CHECK-LABEL: test_same_vzip2_s8:
 ; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b