diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10203,6 +10203,82 @@
   return EltType.getSizeInBits() / 8;
 }
 
+// Check if a vector is built from one vector via extracted elements of
+// another together with an AND mask, ensuring that all elements fit
+// within range. This can be reconstructed using AND and NEON's TBL1.
+SDValue ReconstructShuffleWithConstantAndMask(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  assert(!VT.isScalableVector() &&
+         "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
+
+  // Can only recreate a shuffle with 16x i8 elements, as this maps directly to
+  // TBL1.
+  if (VT.getSimpleVT() != MVT::v16i8)
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  assert(NumElts == 16 && "Need to have exactly 16 elements in vector.");
+
+  SDValue SourceVec;
+  SDValue MaskSourceVec;
+
+  uint64_t AndConstant = 0;
+  SmallVector<SDValue, 16> AndMaskConstants;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // This only looks at shuffles with elements that are truncated by a
+    // constant AND mask extracted from a mask vector.
+    SDValue Operand = V.getOperand(1);
+    if (Operand.getOpcode() != ISD::AND ||
+        !isa<ConstantSDNode>(Operand.getOperand(1)))
+      return SDValue();
+
+    ConstantSDNode *ConstantNode = cast<ConstantSDNode>(Operand.getOperand(1));
+    AndMaskConstants.push_back(SDValue(ConstantNode, 0));
+    uint64_t ConstantVal = ConstantNode->getZExtValue();
+    if (ConstantVal >= NumElts)
+      return SDValue();
+
+    SDValue OperandSourceVec = V.getOperand(0);
+    if (!SourceVec) {
+      SourceVec = OperandSourceVec;
+      AndConstant = ConstantVal;
+    } else if (SourceVec != OperandSourceVec || ConstantVal != AndConstant)
+      return SDValue();
+
+    // Find source vector of mask to use later in TBL.
+    SDValue MaskSource = Operand.getOperand(0);
+
+    // An ANY_EXTEND may be inserted between the AND and the source vector
+    // extraction. We don't care about that, so we can just skip it.
+    if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
+      MaskSource = MaskSource.getOperand(0);
+
+    if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // We only apply this if all elements come from the same vector.
+    if (!MaskSourceVec)
+      MaskSourceVec = MaskSource->getOperand(0);
+    else if (MaskSourceVec != MaskSource->getOperand(0))
+      return SDValue();
+  }
+
+  // Preconditions met, so we can use a vector AND + TBL to build this vector.
+  SDValue AndMask = DAG.getBuildVector(VT, dl, AndMaskConstants);
+  SDValue MaskedVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec, AndMask);
+  return DAG.getNode(
+      ISD::INTRINSIC_WO_CHAIN, dl, VT,
+      DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
+      MaskedVec);
+}
+
 // Gather data to see if the operation can be modelled as a
 // shuffle in combination with VEXTs.
 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
@@ -12344,8 +12420,11 @@
 
   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
   if (NumElts >= 4) {
-    if (SDValue shuffle = ReconstructShuffle(Op, DAG))
-      return shuffle;
+    if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
+      return Shuffle;
+
+    if (SDValue Shuffle = ReconstructShuffleWithConstantAndMask(Op, DAG))
+      return Shuffle;
   }
 
   if (PreferDUPAndInsert) {
diff --git a/llvm/test/CodeGen/AArch64/neon-shuffle-vector-tbl.ll b/llvm/test/CodeGen/AArch64/neon-shuffle-vector-tbl.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-shuffle-vector-tbl.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
+
+; This is the IR generated by Clang's __builtin_shufflevector for two 16x uint8_t vectors.
+define <16 x i8> @shuffle_with_and_mask(<16 x i8> %src, <16 x i8> %mask) {
+; CHECK-LABEL: shuffle_with_and_mask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi.16b v2, #15
+; CHECK-NEXT: and.16b v1, v1, v2
+; CHECK-NEXT: tbl.16b v0, { v0 }, v1
+; CHECK-NEXT: ret
+
+  %masked_mask = and <16 x i8> %mask, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
+  %1 = extractelement <16 x i8> %masked_mask, i64 0
+  %2 = extractelement <16 x i8> %src, i8 %1
+  %3 = insertelement <16 x i8> undef, i8 %2, i64 0
+  %4 = extractelement <16 x i8> %masked_mask, i64 1
+  %5 = extractelement <16 x i8> %src, i8 %4
+  %6 = insertelement <16 x i8> %3, i8 %5, i64 1
+  %7 = extractelement <16 x i8> %masked_mask, i64 2
+  %8 = extractelement <16 x i8> %src, i8 %7
+  %9 = insertelement <16 x i8> %6, i8 %8, i64 2
+  %10 = extractelement <16 x i8> %masked_mask, i64 3
+  %11 = extractelement <16 x i8> %src, i8 %10
+  %12 = insertelement <16 x i8> %9, i8 %11, i64 3
+  %13 = extractelement <16 x i8> %masked_mask, i64 4
+  %14 = extractelement <16 x i8> %src, i8 %13
+  %15 = insertelement <16 x i8> %12, i8 %14, i64 4
+  %16 = extractelement <16 x i8> %masked_mask, i64 5
+  %17 = extractelement <16 x i8> %src, i8 %16
+  %18 = insertelement <16 x i8> %15, i8 %17, i64 5
+  %19 = extractelement <16 x i8> %masked_mask, i64 6
+  %20 = extractelement <16 x i8> %src, i8 %19
+  %21 = insertelement <16 x i8> %18, i8 %20, i64 6
+  %22 = extractelement <16 x i8> %masked_mask, i64 7
+  %23 = extractelement <16 x i8> %src, i8 %22
+  %24 = insertelement <16 x i8> %21, i8 %23, i64 7
+  %25 = extractelement <16 x i8> %masked_mask, i64 8
+  %26 = extractelement <16 x i8> %src, i8 %25
+  %27 = insertelement <16 x i8> %24, i8 %26, i64 8
+  %28 = extractelement <16 x i8> %masked_mask, i64 9
+  %29 = extractelement <16 x i8> %src, i8 %28
+  %30 = insertelement <16 x i8> %27, i8 %29, i64 9
+  %31 = extractelement <16 x i8> %masked_mask, i64 10
+  %32 = extractelement <16 x i8> %src, i8 %31
+  %33 = insertelement <16 x i8> %30, i8 %32, i64 10
+  %34 = extractelement <16 x i8> %masked_mask, i64 11
+  %35 = extractelement <16 x i8> %src, i8 %34
+  %36 = insertelement <16 x i8> %33, i8 %35, i64 11
+  %37 = extractelement <16 x i8> %masked_mask, i64 12
+  %38 = extractelement <16 x i8> %src, i8 %37
+  %39 = insertelement <16 x i8> %36, i8 %38, i64 12
+  %40 = extractelement <16 x i8> %masked_mask, i64 13
+  %41 = extractelement <16 x i8> %src, i8 %40
+  %42 = insertelement <16 x i8> %39, i8 %41, i64 13
+  %43 = extractelement <16 x i8> %masked_mask, i64 14
+  %44 = extractelement <16 x i8> %src, i8 %43
+  %45 = insertelement <16 x i8> %42, i8 %44, i64 14
+  %46 = extractelement <16 x i8> %masked_mask, i64 15
+  %47 = extractelement <16 x i8> %src, i8 %46
+  %48 = insertelement <16 x i8> %45, i8 %47, i64 15
+  ret <16 x i8> %48
+}
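
For context, here is a minimal source-level reproducer for the new test; this is my sketch, not part of the patch. Clang's two-argument form of __builtin_shufflevector takes a runtime mask vector and emits, per lane, src[mask[i] & (NumElts - 1)], which is exactly the scalar-AND-plus-extractelement/insertelement chain the test checks; the new lowering rebuilds that as one vector AND followed by TBL1.

// Hypothetical reproducer (assumed, not from the patch); relies on Clang's
// two-argument, runtime-mask __builtin_shufflevector form. Building for
// AArch64 at -O1 or higher should produce the movi/and/tbl sequence above.
#include <arm_neon.h>

uint8x16_t shuffle_with_and_mask(uint8x16_t src, uint8x16_t mask) {
  // Each result byte is src[mask[i] & 15]; Clang inserts the AND with
  // NumElts - 1 itself to keep every index in range for the 16-byte table.
  return __builtin_shufflevector(src, mask);
}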