diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10213,18 +10213,17 @@
   assert(!VT.isScalableVector() &&
          "Scalable vectors cannot be used with ISD::BUILD_VECTOR");

-  // Can only recreate a shuffle with 16x i8 elements, as this maps directly to
-  // TBL1.
-  if (VT.getSimpleVT() != MVT::v16i8)
+  // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
+  // directly to TBL1.
+  if (VT.getSimpleVT() != MVT::v16i8 && VT.getSimpleVT() != MVT::v8i8)
     return SDValue();

   unsigned NumElts = VT.getVectorNumElements();
-  assert(NumElts == 16 && "Need to have exactly 16 elements in vector.");
+  assert((NumElts == 8 || NumElts == 16) &&
+         "Need to have exactly 8 or 16 elements in vector.");

   SDValue SourceVec;
   SDValue MaskSourceVec;
-
-  uint64_t AndConstant = 0;
   SmallVector<SDValue, 16> AndMaskConstants;

   for (unsigned i = 0; i < NumElts; ++i) {
@@ -10241,15 +10240,11 @@
     ConstantSDNode *ConstantNode = cast<ConstantSDNode>(Operand.getOperand(1));
     AndMaskConstants.push_back(SDValue(ConstantNode, 0));
-    uint64_t ConstantVal = ConstantNode->getZExtValue();
-    if (ConstantVal >= NumElts)
-      return SDValue();

     SDValue OperandSourceVec = V.getOperand(0);
-    if (!SourceVec) {
+    if (!SourceVec)
       SourceVec = OperandSourceVec;
-      AndConstant = ConstantVal;
-    } else if (SourceVec != OperandSourceVec || ConstantVal != AndConstant)
+    else if (SourceVec != OperandSourceVec)
       return SDValue();

     // Find source vector of mask to use later in TBL.
@@ -10263,13 +10258,26 @@
     if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
       return SDValue();

-    // We only apply this if all elements come from the same vector.
-    if (!MaskSourceVec)
+    // We only apply this if all elements come from the same vector with the
+    // same vector type.
+    if (!MaskSourceVec) {
       MaskSourceVec = MaskSource->getOperand(0);
-    else if (MaskSourceVec != MaskSource->getOperand(0))
+      if (MaskSourceVec.getValueType() != VT)
+        return SDValue();
+    } else if (MaskSourceVec != MaskSource->getOperand(0)) {
       return SDValue();
+    }
   }

+  // TBL needs a v16i8 table, so for v8i8 we widen the source to v16i8 by
+  // concatenating it with itself. As the pattern we are replacing is extract +
+  // insert, every index in the mask must be smaller than the number of
+  // elements in the source, or we would have an out-of-bounds access, so
+  // duplicating the source vector is safe.
+  if (NumElts == 8)
+    SourceVec =
+        DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec, SourceVec);
+
   // Preconditions met, so we can use a vector AND + TBL to build this vector.
   SDValue AndMask = DAG.getBuildVector(VT, dl, AndMaskConstants);
   SDValue MaskedVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec, AndMask);
diff --git a/llvm/test/CodeGen/AArch64/neon-shuffle-vector-tbl.ll b/llvm/test/CodeGen/AArch64/neon-shuffle-vector-tbl.ll
--- a/llvm/test/CodeGen/AArch64/neon-shuffle-vector-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/neon-shuffle-vector-tbl.ll
@@ -1,8 +1,8 @@
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s

 ; This is the IR generated by Clang's __builtin_shufflevector for two 16x uint8_t vectors.
-define <16 x i8> @shuffle_with_and_mask(<16 x i8> %src, <16 x i8> %mask) {
-; CHECK-LABEL: shuffle_with_and_mask:
+define <16 x i8> @shuffle16_with_and_mask(<16 x i8> %src, <16 x i8> %mask) {
+; CHECK-LABEL: shuffle16_with_and_mask:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi.16b v2, #15
 ; CHECK-NEXT: and.16b v1, v1, v2
@@ -56,7 +56,182 @@
   %44 = extractelement <16 x i8> %src, i8 %43
   %45 = insertelement <16 x i8> %42, i8 %44, i64 14
   %46 = extractelement <16 x i8> %masked_mask, i64 15
+  ; Make sure that ANY_EXTEND is ignored
+  %47 = zext i8 %46 to i32
+  %48 = extractelement <16 x i8> %src, i32 %47
+  %49 = insertelement <16 x i8> %45, i8 %48, i64 15
+  ret <16 x i8> %49
+}
+
+define <8 x i8> @shuffle8_with_and_mask(<8 x i8> %src, <8 x i8> %mask) {
+; CHECK-LABEL: shuffle8_with_and_mask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: movi.8b v2, #7
+; CHECK-NEXT: mov.d v0[1], v0[0]
+; CHECK-NEXT: and.8b v1, v1, v2
+; CHECK-NEXT: tbl.8b v0, { v0 }, v1
+; CHECK-NEXT: ret
+
+  %masked_mask = and <8 x i8> %mask, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  %1 = extractelement <8 x i8> %masked_mask, i64 0
+  %2 = extractelement <8 x i8> %src, i8 %1
+  %3 = insertelement <8 x i8> undef, i8 %2, i64 0
+  %4 = extractelement <8 x i8> %masked_mask, i64 1
+  %5 = extractelement <8 x i8> %src, i8 %4
+  %6 = insertelement <8 x i8> %3, i8 %5, i64 1
+  %7 = extractelement <8 x i8> %masked_mask, i64 2
+  %8 = extractelement <8 x i8> %src, i8 %7
+  %9 = insertelement <8 x i8> %6, i8 %8, i64 2
+  %10 = extractelement <8 x i8> %masked_mask, i64 3
+  %11 = extractelement <8 x i8> %src, i8 %10
+  %12 = insertelement <8 x i8> %9, i8 %11, i64 3
+  %13 = extractelement <8 x i8> %masked_mask, i64 4
+  %14 = extractelement <8 x i8> %src, i8 %13
+  %15 = insertelement <8 x i8> %12, i8 %14, i64 4
+  %16 = extractelement <8 x i8> %masked_mask, i64 5
+  %17 = extractelement <8 x i8> %src, i8 %16
+  %18 = insertelement <8 x i8> %15, i8 %17, i64 5
+  %19 = extractelement <8 x i8> %masked_mask, i64 6
+  %20 = extractelement <8 x i8> %src, i8 %19
+  %21 = insertelement <8 x i8> %18, i8 %20, i64 6
+  %22 = extractelement <8 x i8> %masked_mask, i64 7
+  %23 = extractelement <8 x i8> %src, i8 %22
+  %24 = insertelement <8 x i8> %21, i8 %23, i64 7
+  ret <8 x i8> %24
+}
+
+; Takes alternating entries from two mask source vectors. Currently not supported.
+define <16 x i8> @no_shuffle_with_two_mask_sources(<16 x i8> %src, <16 x i8> %mask1, <16 x i8> %mask2) {
+; CHECK-LABEL: no_shuffle_with_two_mask_sources:
+; CHECK: // %bb.0:
+; CHECK-NOT: tbl.16b
+
+  %masked_mask1 = and <16 x i8> %mask1, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
+  %masked_mask2 = and <16 x i8> %mask2, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
+  %1 = extractelement <16 x i8> %masked_mask1, i64 0
+  %2 = extractelement <16 x i8> %src, i8 %1
+  %3 = insertelement <16 x i8> undef, i8 %2, i64 0
+  %4 = extractelement <16 x i8> %masked_mask2, i64 1
+  %5 = extractelement <16 x i8> %src, i8 %4
+  %6 = insertelement <16 x i8> %3, i8 %5, i64 1
+  %7 = extractelement <16 x i8> %masked_mask1, i64 2
+  %8 = extractelement <16 x i8> %src, i8 %7
+  %9 = insertelement <16 x i8> %6, i8 %8, i64 2
+  %10 = extractelement <16 x i8> %masked_mask2, i64 3
+  %11 = extractelement <16 x i8> %src, i8 %10
+  %12 = insertelement <16 x i8> %9, i8 %11, i64 3
+  %13 = extractelement <16 x i8> %masked_mask1, i64 4
+  %14 = extractelement <16 x i8> %src, i8 %13
+  %15 = insertelement <16 x i8> %12, i8 %14, i64 4
+  %16 = extractelement <16 x i8> %masked_mask2, i64 5
+  %17 = extractelement <16 x i8> %src, i8 %16
+  %18 = insertelement <16 x i8> %15, i8 %17, i64 5
+  %19 = extractelement <16 x i8> %masked_mask1, i64 6
+  %20 = extractelement <16 x i8> %src, i8 %19
+  %21 = insertelement <16 x i8> %18, i8 %20, i64 6
+  %22 = extractelement <16 x i8> %masked_mask2, i64 7
+  %23 = extractelement <16 x i8> %src, i8 %22
+  %24 = insertelement <16 x i8> %21, i8 %23, i64 7
+  %25 = extractelement <16 x i8> %masked_mask1, i64 8
+  %26 = extractelement <16 x i8> %src, i8 %25
+  %27 = insertelement <16 x i8> %24, i8 %26, i64 8
+  %28 = extractelement <16 x i8> %masked_mask2, i64 9
+  %29 = extractelement <16 x i8> %src, i8 %28
+  %30 = insertelement <16 x i8> %27, i8 %29, i64 9
+  %31 = extractelement <16 x i8> %masked_mask1, i64 10
+  %32 = extractelement <16 x i8> %src, i8 %31
+  %33 = insertelement <16 x i8> %30, i8 %32, i64 10
+  %34 = extractelement <16 x i8> %masked_mask2, i64 11
+  %35 = extractelement <16 x i8> %src, i8 %34
+  %36 = insertelement <16 x i8> %33, i8 %35, i64 11
+  %37 = extractelement <16 x i8> %masked_mask1, i64 12
+  %38 = extractelement <16 x i8> %src, i8 %37
+  %39 = insertelement <16 x i8> %36, i8 %38, i64 12
+  %40 = extractelement <16 x i8> %masked_mask2, i64 13
+  %41 = extractelement <16 x i8> %src, i8 %40
+  %42 = insertelement <16 x i8> %39, i8 %41, i64 13
+  %43 = extractelement <16 x i8> %masked_mask1, i64 14
+  %44 = extractelement <16 x i8> %src, i8 %43
+  %45 = insertelement <16 x i8> %42, i8 %44, i64 14
+  %46 = extractelement <16 x i8> %masked_mask2, i64 15
   %47 = extractelement <16 x i8> %src, i8 %46
   %48 = insertelement <16 x i8> %45, i8 %47, i64 15
   ret <16 x i8> %48
 }
+
+; Unsupported vector type.
+define <4 x i32> @no_shuffle_for_int_vector(<4 x i32> %src, <4 x i32> %mask) {
+; CHECK-LABEL: no_shuffle_for_int_vector:
+; CHECK: // %bb.0:
+; CHECK-NOT: tbl.16b
+
+  %masked_mask = and <4 x i32> %mask, <i32 3, i32 3, i32 3, i32 3>
+  %1 = extractelement <4 x i32> %masked_mask, i64 0
+  %2 = extractelement <4 x i32> %src, i32 %1
+  %3 = insertelement <4 x i32> undef, i32 %2, i64 0
+  %4 = extractelement <4 x i32> %masked_mask, i64 1
+  %5 = extractelement <4 x i32> %src, i32 %4
+  %6 = insertelement <4 x i32> %3, i32 %5, i64 1
+  %7 = extractelement <4 x i32> %masked_mask, i64 2
+  %8 = extractelement <4 x i32> %src, i32 %7
+  %9 = insertelement <4 x i32> %6, i32 %8, i64 2
+  %10 = extractelement <4 x i32> %masked_mask, i64 3
+  %11 = extractelement <4 x i32> %src, i32 %10
+  %12 = insertelement <4 x i32> %9, i32 %11, i64 3
+  ret <4 x i32> %12
+}
+
+; Only half of the result lanes are filled, so this is not a full shuffle.
+define <8 x i8> @no_shuffle_not_enough_elements(<8 x i8> %src, <8 x i8> %mask) {
+; CHECK-LABEL: no_shuffle_not_enough_elements:
+; CHECK: // %bb.0:
+; CHECK-NOT: tbl.8b
+
+  %masked_mask = and <8 x i8> %mask, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  %1 = extractelement <8 x i8> %masked_mask, i64 0
+  %2 = extractelement <8 x i8> %src, i8 %1
+  %3 = insertelement <8 x i8> undef, i8 %2, i64 0
+  %4 = extractelement <8 x i8> %masked_mask, i64 1
+  %5 = extractelement <8 x i8> %src, i8 %4
+  %6 = insertelement <8 x i8> %3, i8 %5, i64 1
+  %7 = extractelement <8 x i8> %masked_mask, i64 2
+  %8 = extractelement <8 x i8> %src, i8 %7
+  %9 = insertelement <8 x i8> %6, i8 %8, i64 2
+  %10 = extractelement <8 x i8> %masked_mask, i64 3
+  %11 = extractelement <8 x i8> %src, i8 %10
+  %12 = insertelement <8 x i8> %9, i8 %11, i64 3
+  ret <8 x i8> %12
+}
+
+; The mask vector type must match the result type.
+define <8 x i8> @no_shuffle_different_vector_types(<8 x i8> %src, <16 x i8> %mask) {
+; CHECK-LABEL: no_shuffle_different_vector_types:
+; CHECK: // %bb.0:
+; CHECK-NOT: tbl.8b
+
+  %masked_mask = and <16 x i8> %mask, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  %1 = extractelement <16 x i8> %masked_mask, i64 0
+  %2 = extractelement <8 x i8> %src, i8 %1
+  %3 = insertelement <8 x i8> undef, i8 %2, i64 0
+  %4 = extractelement <16 x i8> %masked_mask, i64 1
+  %5 = extractelement <8 x i8> %src, i8 %4
+  %6 = insertelement <8 x i8> %3, i8 %5, i64 1
+  %7 = extractelement <16 x i8> %masked_mask, i64 2
+  %8 = extractelement <8 x i8> %src, i8 %7
+  %9 = insertelement <8 x i8> %6, i8 %8, i64 2
+  %10 = extractelement <16 x i8> %masked_mask, i64 3
+  %11 = extractelement <8 x i8> %src, i8 %10
+  %12 = insertelement <8 x i8> %9, i8 %11, i64 3
+  %13 = extractelement <16 x i8> %masked_mask, i64 4
+  %14 = extractelement <8 x i8> %src, i8 %13
+  %15 = insertelement <8 x i8> %12, i8 %14, i64 4
+  %16 = extractelement <16 x i8> %masked_mask, i64 5
+  %17 = extractelement <8 x i8> %src, i8 %16
+  %18 = insertelement <8 x i8> %15, i8 %17, i64 5
+  %19 = extractelement <16 x i8> %masked_mask, i64 6
+  %20 = extractelement <8 x i8> %src, i8 %19
+  %21 = insertelement <8 x i8> %18, i8 %20, i64 6
+  %22 = extractelement <16 x i8> %masked_mask, i64 7
+  %23 = extractelement <8 x i8> %src, i8 %22
+  %24 = insertelement <8 x i8> %21, i8 %23, i64 7
+  ret <8 x i8> %24
+}
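For reference, here is a minimal hand-written sketch, using ACLE NEON intrinsics, of the AND + TBL1 sequence the combine emits for the two supported types. It is not part of the patch; the function names are made up for illustration, and the comments map each intrinsic to the instructions the CHECK lines above expect.

```cpp
#include <arm_neon.h>

// v16i8 case: clamp every index into [0, 15], then one TBL1 lookup.
uint8x16_t shuffle16_with_and_mask_sketch(uint8x16_t src, uint8x16_t mask) {
  uint8x16_t masked = vandq_u8(mask, vdupq_n_u8(15)); // movi.16b + and.16b
  return vqtbl1q_u8(src, masked);                     // tbl.16b
}

// v8i8 case: TBL1 always reads a 16-byte table, so the 8-byte source is
// duplicated into both halves (mirroring the CONCAT_VECTORS node the combine
// inserts). The AND with 7 keeps every index inside the original 8 lanes.
uint8x8_t shuffle8_with_and_mask_sketch(uint8x8_t src, uint8x8_t mask) {
  uint8x8_t masked = vand_u8(mask, vdup_n_u8(7)); // movi.8b + and.8b
  uint8x16_t table = vcombine_u8(src, src);       // mov.d v0[1], v0[0]
  return vqtbl1_u8(table, masked);                // tbl.8b
}
```

Duplicating the source into both table halves is the same trick the combine uses: because the replaced extract + insert pattern only ever produces in-bounds indices, and the AND mask clamps them to the low 8 lanes, the second copy of the source is never selected, so any filler would do.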