diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10663,6 +10663,72 @@
   return SDValue();
 }
 
+static bool isExtractLowerHalfMask(ArrayRef<int> Mask) {
+  if (Mask.size() != 16)
+    return false;
+
+  for (int I = 0; I < 8; ++I) {
+    if (Mask[I] != I)
+      return false;
+    if (Mask[I + 8] != 16 + I)
+      return false;
+  }
+  return true;
+}
+
+// Try to fold shuffle (tbl2, tbl2) into a single tbl4 if the shuffle selects
+// the first 8 elements of each tbl2 and the tbl2 masks can be merged.
+static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
+                                               ArrayRef<int> ShuffleMask,
+                                               SelectionDAG &DAG) {
+  if (!isExtractLowerHalfMask(ShuffleMask))
+    return SDValue();
+
+  SDValue Tbl1 = Op->getOperand(0);
+  SDValue Tbl2 = Op->getOperand(1);
+  SDLoc dl(Op);
+  SDValue Tbl2ID =
+      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
+
+  EVT VT = Op.getValueType();
+  if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
+      Tbl1->getOperand(0) != Tbl2ID ||
+      Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
+      Tbl2->getOperand(0) != Tbl2ID)
+    return SDValue();
+
+  SDValue Mask1 = Tbl1->getOperand(3);
+  SDValue Mask2 = Tbl2->getOperand(3);
+  // Make sure the tbl2 mask only selects values in the first 8 lanes (i.e. the
+  // last 8 lanes all have an index of -1).
+  auto IsLowerExtractMask = [](SDValue Mask) {
+    if (Mask->getOpcode() != ISD::BUILD_VECTOR)
+      return false;
+    for (unsigned I = 8; I < 16; I++) {
+      auto *C = dyn_cast<ConstantSDNode>(Mask->getOperand(I));
+      if (!C || C->getSExtValue() != -1)
+        return false;
+    }
+    return true;
+  };
+  if (!IsLowerExtractMask(Mask1) || !IsLowerExtractMask(Mask2))
+    return SDValue();
+
+  SmallVector<SDValue, 16> TBLMaskParts(16, Mask1->getOperand(0));
+  for (unsigned I = 0; I < 8; I++) {
+    TBLMaskParts[I] = Mask1->getOperand(I);
+    auto *C = cast<ConstantSDNode>(Mask2->getOperand(I));
+    TBLMaskParts[I + 8] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
+  }
+
+  SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
+  SDValue ID =
+      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
+
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
+                     {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
+                      Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
+}
+
 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                    SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -10686,6 +10752,9 @@
   assert(ShuffleMask.size() == VT.getVectorNumElements() &&
          "Unexpected VECTOR_SHUFFLE mask size!");
 
+  if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
+    return Res;
+
   if (SVN->isSplat()) {
     int Lane = SVN->getSplatIndex();
     // If this is undef splat, generate it via "just" vdup, if possible.
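For context, a minimal IR sketch of the pattern the new combine matches (illustrative only; the function name is made up, and the constants mirror the test below). Each tbl2 populates only its low 8 lanes, since mask lanes 8-15 are 255 (out of range for tbl2, i.e. -1 as a signed byte), and the shuffle then concatenates the two low halves. The whole sequence is therefore expressible as a single tbl4 whose upper mask half is the second tbl2 mask plus 32, because the second pair of table registers starts at byte 32 of the four-register tbl4 table:

declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)

define <16 x i8> @two_tbl2_to_one_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
  ; Each tbl2 mask selects every 4th byte of { %a, %b } / { %c, %d } into
  ; lanes 0-7 and leaves lanes 8-15 out of range (index 255).
  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  ; This is the isExtractLowerHalfMask shape: lanes 0-7 come from %t1's low
  ; half, lanes 8-15 from %t2's low half (shuffle indices 16-23).
  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <16 x i8> %s
}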
diff --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
--- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
@@ -100,27 +100,25 @@
 ; CHECK-NEXT:    .byte 20 // 0x14
 ; CHECK-NEXT:    .byte 24 // 0x18
 ; CHECK-NEXT:    .byte 28 // 0x1c
-; CHECK-NEXT:    .byte 255 // 0xff
-; CHECK-NEXT:    .byte 255 // 0xff
-; CHECK-NEXT:    .byte 255 // 0xff
-; CHECK-NEXT:    .byte 255 // 0xff
-; CHECK-NEXT:    .byte 255 // 0xff
-; CHECK-NEXT:    .byte 255 // 0xff
-; CHECK-NEXT:    .byte 255 // 0xff
-; CHECK-NEXT:    .byte 255 // 0xff
+; CHECK-NEXT:    .byte 32 // 0x20
+; CHECK-NEXT:    .byte 36 // 0x24
+; CHECK-NEXT:    .byte 40 // 0x28
+; CHECK-NEXT:    .byte 44 // 0x2c
+; CHECK-NEXT:    .byte 48 // 0x30
+; CHECK-NEXT:    .byte 52 // 0x34
+; CHECK-NEXT:    .byte 56 // 0x38
+; CHECK-NEXT:    .byte 60 // 0x3c
 define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
 ; CHECK-LABEL: shuffled_tbl2_to_tbl4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI8_0
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI8_0]
-; CHECK-NEXT:    tbl.16b v0, { v0, v1 }, v4
-; CHECK-NEXT:    tbl.16b v1, { v2, v3 }, v4
-; CHECK-NEXT:    mov.d v0[1], v1[0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
 ; CHECK-NEXT:    ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
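The single tbl4 emitted for this test should be equivalent to the following hand-written call (again only a sketch, not part of the patch; the merged mask bytes 0-28 and 32-60 match the .LCPI8_0 constant-pool checks above, the second half being the second tbl2 mask shifted up by 32):

declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)

  ; One lookup across the 64-byte table { %a, %b, %c, %d }: lanes 0-7 read
  ; bytes 0-28 (from %a/%b), lanes 8-15 read bytes 32-60 (from %c/%d).
  %r = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 32, i8 36, i8 40, i8 44, i8 48, i8 52, i8 56, i8 60>)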