diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10688,6 +10688,77 @@
   return SDValue();
 }
 
+static bool isExtractLowerHalfMask(ArrayRef<int> Mask) {
+  if (Mask.size() != 16)
+    return false;
+
+  for (int I = 0; I < 8; ++I) {
+    if (Mask[I] != I)
+      return false;
+    if (Mask[I + 8] != 16 + I)
+      return false;
+  }
+  return true;
+}
+
+// Try to fold shuffle (tbl2, tbl2) into a single tbl4 if the shuffle selects
+// the first 8 elements of each tbl2 and the tbl2 masks can be merged.
+static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
+                                               ArrayRef<int> ShuffleMask,
+                                               SelectionDAG &DAG) {
+  if (!isExtractLowerHalfMask(ShuffleMask))
+    return SDValue();
+
+  SDValue Tbl1 = Op->getOperand(0);
+  SDValue Tbl2 = Op->getOperand(1);
+  SDLoc dl(Op);
+  SDValue Tbl2ID =
+      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
+
+  EVT VT = Op.getValueType();
+  if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
+      Tbl1->getOperand(0) != Tbl2ID ||
+      Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
+      Tbl2->getOperand(0) != Tbl2ID)
+    return SDValue();
+
+  if (Tbl1->getValueType(0) != MVT::v16i8 ||
+      Tbl2->getValueType(0) != MVT::v16i8)
+    return SDValue();
+
+  // Make sure the masks are BUILD_VECTORS where the first 8 operands are
+  // constant.
+  auto IsBuildVectorWithConstantOps = [](SDValue Mask) {
+    if (Mask->getOpcode() != ISD::BUILD_VECTOR)
+      return false;
+    for (unsigned I = 0; I < 8; I++) {
+      if (!isa<ConstantSDNode>(Mask->getOperand(I)))
+        return false;
+    }
+    return true;
+  };
+  SDValue Mask1 = Tbl1->getOperand(3);
+  SDValue Mask2 = Tbl2->getOperand(3);
+  if (!IsBuildVectorWithConstantOps(Mask1) ||
+      !IsBuildVectorWithConstantOps(Mask2))
+    return SDValue();
+
+  SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
+  for (unsigned I = 0; I < 8; I++) {
+    TBLMaskParts[I] = Mask1->getOperand(I);
+    auto *C = cast<ConstantSDNode>(Mask2->getOperand(I));
+    TBLMaskParts[I + 8] =
+        DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
+  }
+
+  SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
+  SDValue ID =
+      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
+
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
+                     {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
+                      Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
+}
+
 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                    SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -10711,6 +10782,9 @@
   assert(ShuffleMask.size() == VT.getVectorNumElements() &&
          "Unexpected VECTOR_SHUFFLE mask size!");
 
+  if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
+    return Res;
+
   if (SVN->isSplat()) {
     int Lane = SVN->getSplatIndex();
     // If this is undef splat, generate it via "just" vdup, if possible.
diff --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
--- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
@@ -130,27 +130,25 @@
 ; CHECK-NEXT:    .byte 20 // 0x14
 ; CHECK-NEXT:    .byte 24 // 0x18
 ; CHECK-NEXT:    .byte 28 // 0x1c
-; CHECK-NEXT:    .byte 255 // 0xff
-; CHECK-NEXT:    .byte 255 // 0xff
-; CHECK-NEXT:    .byte 255 // 0xff
-; CHECK-NEXT:    .byte 255 // 0xff
-; CHECK-NEXT:    .byte 255 // 0xff
-; CHECK-NEXT:    .byte 255 // 0xff
-; CHECK-NEXT:    .byte 255 // 0xff
-; CHECK-NEXT:    .byte 255 // 0xff
+; CHECK-NEXT:    .byte 32 // 0x20
+; CHECK-NEXT:    .byte 36 // 0x24
+; CHECK-NEXT:    .byte 40 // 0x28
+; CHECK-NEXT:    .byte 44 // 0x2c
+; CHECK-NEXT:    .byte 48 // 0x30
+; CHECK-NEXT:    .byte 52 // 0x34
+; CHECK-NEXT:    .byte 56 // 0x38
+; CHECK-NEXT:    .byte 60 // 0x3c
 define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
 ; CHECK-LABEL: shuffled_tbl2_to_tbl4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI9_0
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI9_0]
-; CHECK-NEXT:    tbl.16b v0, { v0, v1 }, v4
-; CHECK-NEXT:    tbl.16b v1, { v2, v3 }, v4
-; CHECK-NEXT:    mov.d v0[1], v1[0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
 ; CHECK-NEXT:    ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> )
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> )
diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
--- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -94,14 +94,14 @@
 ; CHECK-NEXT:    .byte 20 ; 0x14
 ; CHECK-NEXT:    .byte 24 ; 0x18
 ; CHECK-NEXT:    .byte 28 ; 0x1c
-; CHECK-NEXT:    .byte 255 ; 0xff
-; CHECK-NEXT:    .byte 255 ; 0xff
-; CHECK-NEXT:    .byte 255 ; 0xff
-; CHECK-NEXT:    .byte 255 ; 0xff
-; CHECK-NEXT:    .byte 255 ; 0xff
-; CHECK-NEXT:    .byte 255 ; 0xff
-; CHECK-NEXT:    .byte 255 ; 0xff
-; CHECK-NEXT:    .byte 255 ; 0xff
+; CHECK-NEXT:    .byte 32 ; 0x20
+; CHECK-NEXT:    .byte 36 ; 0x24
+; CHECK-NEXT:    .byte 40 ; 0x28
+; CHECK-NEXT:    .byte 44 ; 0x2c
+; CHECK-NEXT:    .byte 48 ; 0x30
+; CHECK-NEXT:    .byte 52 ; 0x34
+; CHECK-NEXT:    .byte 56 ; 0x38
+; CHECK-NEXT:    .byte 60 ; 0x3c
 
 ; Tbl can also be used when combining multiple fptoui using a shuffle. The loop
 ; vectorizer may create such patterns.
@@ -118,16 +118,14 @@
 ; CHECK-NEXT:    lsl x9, x8, #5
 ; CHECK-NEXT:    add x10, x0, x9
 ; CHECK-NEXT:    add x9, x1, x9
-; CHECK-NEXT:    ldp q1, q2, [x10]
+; CHECK-NEXT:    ldp q2, q1, [x10]
 ; CHECK-NEXT:    ldp q4, q3, [x9]
-; CHECK-NEXT:    fcvtzu.4s v6, v2
-; CHECK-NEXT:    fcvtzu.4s v5, v1
-; CHECK-NEXT:    fcvtzu.4s v2, v3
-; CHECK-NEXT:    fcvtzu.4s v1, v4
-; CHECK-NEXT:    tbl.16b v3, { v5, v6 }, v0
-; CHECK-NEXT:    tbl.16b v1, { v1, v2 }, v0
-; CHECK-NEXT:    mov.d v3[1], v1[0]
-; CHECK-NEXT:    str q3, [x2, x8, lsl #4]
+; CHECK-NEXT:    fcvtzu.4s v17, v1
+; CHECK-NEXT:    fcvtzu.4s v16, v2
+; CHECK-NEXT:    fcvtzu.4s v19, v3
+; CHECK-NEXT:    fcvtzu.4s v18, v4
+; CHECK-NEXT:    tbl.16b v1, { v16, v17, v18, v19 }, v0
+; CHECK-NEXT:    str q1, [x2, x8, lsl #4]
 ; CHECK-NEXT:    add x8, x8, #1
 ; CHECK-NEXT:    cmp x8, #1000
 ; CHECK-NEXT:    b.eq LBB2_1
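For context, a minimal IR sketch of the kind of input the new combine targets. The function name, value names, and mask constants below are illustrative only (the patch's real tests are in arm64-tbl.ll above): two tbl2 calls whose index vectors are constant in their first eight lanes, followed by a shuffle that keeps the first eight lanes of each result, which is the shape tryToConvertShuffleOfTbl2ToTbl4 rewrites into a single aarch64.neon.tbl4 call.

declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)

define <16 x i8> @tbl2_pair_lower_halves(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
  ; Illustrative masks: the first 8 index bytes are constants, as required by
  ; IsBuildVectorWithConstantOps; the upper 8 lanes are dropped by the shuffle.
  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b,
          <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14,
                     i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d,
          <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14,
                     i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  ; Shuffle mask 0..7,16..23 keeps the lower half of each tbl2 result; this is
  ; exactly the mask shape isExtractLowerHalfMask accepts.
  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2,
          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                      i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <16 x i8> %s
}

With the patch applied, the merged index vector would be the first tbl2 mask's low half followed by the second mask's low half plus 32, so the function should lower to one tbl.16b over { v0, v1, v2, v3 } instead of two tbl.16b plus a mov.d, matching the updated CHECK lines above.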