Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4155,9 +4155,29 @@ if (SourceVecs.size() > 2) return SDValue(); - SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; int VEXTOffsets[2] = { 0, 0 }; int OffsetMultipliers[2] = { 1, 1 }; + int ResMultiplier = 1; + // Find out the smallest element size among result and two sources, and use + // it as element size for the shuffle_vector to build. + EVT SmallestEltTy = VT.getVectorElementType(); + for (unsigned i = 0; i < SourceVecs.size(); ++i) { + EVT SrcEltTy = SourceVecs[i].getValueType().getVectorElementType(); + if (SrcEltTy.getSizeInBits() < SmallestEltTy.getSizeInBits()) { + // It may hit here if trying to build a small vector which is less + // than 64 bit. For example, extracting low part from v8i8 to build v4i8. + // Because v4i8 is illegal, it will be promoted to v4i16. + // For this example, we need to create a v8i8 shuffle_vector which only + // lane 0, 2, 4, 6 are valid. Also, it should be bitcasted back to + // original v4i8 at last. + ResMultiplier = SmallestEltTy.getSizeInBits() / SrcEltTy.getSizeInBits(); + SmallestEltTy = SrcEltTy; + } + } + EVT ShuffleVT = + EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, + VT.getSizeInBits() / SmallestEltTy.getSizeInBits()); + SDValue ShuffleSrcs[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)}; // This loop extracts the usage patterns of the source vectors // and prepares appropriate SDValues for a shuffle if possible. @@ -4165,15 +4185,15 @@ unsigned NumSrcElts = SourceVecs[i].getValueType().getVectorNumElements(); SDValue CurSource = SourceVecs[i]; if (SourceVecs[i].getValueType().getVectorElementType() != - VT.getVectorElementType()) { + ShuffleVT.getVectorElementType()) { // It may hit this case if SourceVecs[i] is AssertSext/AssertZext. 
// Then bitcast it to the vector which holds asserted element type, // and record the multiplier of element width between SourceVecs and // Build_vector which is needed to extract the correct lanes later. - EVT CastVT = - EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - SourceVecs[i].getValueSizeInBits() / - VT.getVectorElementType().getSizeInBits()); + EVT CastVT = EVT::getVectorVT( + *DAG.getContext(), ShuffleVT.getVectorElementType(), + SourceVecs[i].getValueSizeInBits() / + ShuffleVT.getVectorElementType().getSizeInBits()); CurSource = DAG.getNode(ISD::BITCAST, dl, CastVT, SourceVecs[i]); OffsetMultipliers[i] = CastVT.getVectorNumElements() / NumSrcElts; @@ -4182,7 +4202,7 @@ MinElts[i] *= OffsetMultipliers[i]; } - if (CurSource.getValueType() == VT) { + if (CurSource.getValueType() == ShuffleVT) { // No VEXT necessary ShuffleSrcs[i] = CurSource; VEXTOffsets[i] = 0; @@ -4190,8 +4210,9 @@ } else if (NumSrcElts < NumElts) { // We can pad out the smaller vector for free, so if it's part of a // shuffle... 
- ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, CurSource, - DAG.getUNDEF(CurSource.getValueType())); + ShuffleSrcs[i] = + DAG.getNode(ISD::CONCAT_VECTORS, dl, ShuffleVT, CurSource, + DAG.getUNDEF(CurSource.getValueType())); continue; } @@ -4208,23 +4229,23 @@ if (MinElts[i] >= NumElts) { // The extraction can just take the second half VEXTOffsets[i] = NumElts; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, - DAG.getIntPtrConstant(NumElts)); + ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ShuffleVT, + CurSource, DAG.getIntPtrConstant(NumElts)); } else if (MaxElts[i] < NumElts) { // The extraction can just take the first half VEXTOffsets[i] = 0; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, - DAG.getIntPtrConstant(0)); + ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ShuffleVT, + CurSource, DAG.getIntPtrConstant(0)); } else { // An actual VEXT is needed VEXTOffsets[i] = MinElts[i]; - SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, - DAG.getIntPtrConstant(0)); - SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, - DAG.getIntPtrConstant(NumElts)); + SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ShuffleVT, + CurSource, DAG.getIntPtrConstant(0)); + SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ShuffleVT, + CurSource, DAG.getIntPtrConstant(NumElts)); unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1); - ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2, - DAG.getConstant(Imm, MVT::i32)); + ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, ShuffleVT, VEXTSrc1, + VEXTSrc2, DAG.getConstant(Imm, MVT::i32)); } } @@ -4234,24 +4255,29 @@ SDValue Entry = Op.getOperand(i); if (Entry.getOpcode() == ISD::UNDEF) { Mask.push_back(-1); - continue; - } - - SDValue ExtractVec = Entry.getOperand(0); - int ExtractElt = - cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue(); - if (ExtractVec == 
SourceVecs[0]) { - Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]); } else { - Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts - - VEXTOffsets[1]); + SDValue ExtractVec = Entry.getOperand(0); + int ExtractElt = + cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue(); + if (ExtractVec == SourceVecs[0]) { + Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]); + } else { + Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts - + VEXTOffsets[1]); + } } + for (int j = 0; j != ResMultiplier - 1; ++j) + Mask.push_back(-1); } // Final check before we try to produce nonsense... - if (isShuffleMaskLegal(Mask, VT)) - return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], - &Mask[0]); + if (isShuffleMaskLegal(Mask, ShuffleVT)) { + SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleSrcs[0], + ShuffleSrcs[1], &Mask[0]); + if (ShuffleVT != VT) + Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); + return Shuffle; + } return SDValue(); } Index: test/CodeGen/AArch64/neon-perm.ll =================================================================== --- test/CodeGen/AArch64/neon-perm.ll +++ test/CodeGen/AArch64/neon-perm.ll @@ -1387,6 +1387,13 @@ ret <8 x i16> %shuffle.i } +define <4 x i8> @test_vzip1_v4i8(<8 x i8> %p) { +; CHECK-LABEL: test_vzip1_v4i8: +; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %lo = shufflevector <8 x i8> %p, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i8> %lo +} + define <8 x i8> @test_same_vzip2_s8(<8 x i8> %a) { ; CHECK-LABEL: test_same_vzip2_s8: ; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b