diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10795,6 +10795,51 @@
   return SDValue();
 }
 
+// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
+static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
+                                               ArrayRef<int> ShuffleMask,
+                                               SelectionDAG &DAG) {
+  SDValue Tbl1 = Op->getOperand(0);
+  SDValue Tbl2 = Op->getOperand(1);
+  SDLoc dl(Op);
+  SDValue Tbl2ID =
+      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
+
+  EVT VT = Op.getValueType();
+  if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
+      Tbl1->getOperand(0) != Tbl2ID ||
+      Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
+      Tbl2->getOperand(0) != Tbl2ID)
+    return SDValue();
+
+  if (Tbl1->getValueType(0) != MVT::v16i8 ||
+      Tbl2->getValueType(0) != MVT::v16i8)
+    return SDValue();
+
+  SDValue Mask1 = Tbl1->getOperand(3);
+  SDValue Mask2 = Tbl2->getOperand(3);
+  SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
+  for (unsigned I = 0; I < 16; I++) {
+    if (ShuffleMask[I] < 16)
+      TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
+    else {
+      auto *C =
+          dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
+      if (!C)
+        return SDValue();
+      TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
+    }
+  }
+
+  SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
+  SDValue ID =
+      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
+
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
+                     {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
+                      Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
+}
+
 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                    SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -10818,6 +10863,9 @@
   assert(ShuffleMask.size() == VT.getVectorNumElements() &&
          "Unexpected VECTOR_SHUFFLE mask size!");
 
+  if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
+    return Res;
+
   if (SVN->isSplat()) {
     int Lane = SVN->getSplatIndex();
     // If this is undef splat, generate it via "just" vdup, if possible.
diff --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
--- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
@@ -130,27 +130,25 @@
 ; CHECK-NEXT: .byte 20 // 0x14
 ; CHECK-NEXT: .byte 24 // 0x18
 ; CHECK-NEXT: .byte 28 // 0x1c
-; CHECK-NEXT: .byte 255 // 0xff
-; CHECK-NEXT: .byte 255 // 0xff
-; CHECK-NEXT: .byte 255 // 0xff
-; CHECK-NEXT: .byte 255 // 0xff
-; CHECK-NEXT: .byte 255 // 0xff
-; CHECK-NEXT: .byte 255 // 0xff
-; CHECK-NEXT: .byte 255 // 0xff
-; CHECK-NEXT: .byte 255 // 0xff
+; CHECK-NEXT: .byte 32 // 0x20
+; CHECK-NEXT: .byte 36 // 0x24
+; CHECK-NEXT: .byte 40 // 0x28
+; CHECK-NEXT: .byte 44 // 0x2c
+; CHECK-NEXT: .byte 48 // 0x30
+; CHECK-NEXT: .byte 52 // 0x34
+; CHECK-NEXT: .byte 56 // 0x38
+; CHECK-NEXT: .byte 60 // 0x3c
 
 define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
 ; CHECK-LABEL: shuffled_tbl2_to_tbl4:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, .LCPI9_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI9_0]
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v4
-; CHECK-NEXT: tbl.16b v1, { v2, v3 }, v4
-; CHECK-NEXT: mov.d v0[1], v1[0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4
 ; CHECK-NEXT: ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> )
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> )
@@ -161,24 +159,35 @@
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
 ; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi.2d v4, #0xffffffffffffffff
-; CHECK-NEXT: adrp x8, .LCPI10_0
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI10_0]
-; CHECK-NEXT: mov.b v4[0], w0
-; CHECK-NEXT: tbl.16b v2, { v2, v3 }, v5
+; CHECK-NEXT: fmov s4, w0
+; CHECK-NEXT: mov w8, #32
+; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT: mov.b v4[1], w0
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT: mov.b v4[2], w0
 ; CHECK-NEXT: mov.b v4[3], w0
 ; CHECK-NEXT: mov.b v4[4], w0
 ; CHECK-NEXT: mov.b v4[5], w0
 ; CHECK-NEXT: mov.b v4[6], w0
 ; CHECK-NEXT: mov.b v4[7], w0
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v4
-; CHECK-NEXT: mov.d v0[1], v2[0]
+; CHECK-NEXT: mov.b v4[8], w8
+; CHECK-NEXT: mov w8, #36
+; CHECK-NEXT: mov.b v4[9], w8
+; CHECK-NEXT: mov w8, #40
+; CHECK-NEXT: mov.b v4[10], w8
+; CHECK-NEXT: mov w8, #44
+; CHECK-NEXT: mov.b v4[11], w8
+; CHECK-NEXT: mov w8, #48
+; CHECK-NEXT: mov.b v4[12], w8
+; CHECK-NEXT: mov w8, #52
+; CHECK-NEXT: mov.b v4[13], w8
+; CHECK-NEXT: mov w8, #56
+; CHECK-NEXT: mov.b v4[14], w8
+; CHECK-NEXT: mov w8, #60
+; CHECK-NEXT: mov.b v4[15], w8
+; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4
 ; CHECK-NEXT: ret
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
   %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
@@ -202,16 +211,72 @@
   ret <16 x i8> %s
 }
 
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: mov.b v4[1], w8
+; CHECK-NEXT: mov.b v4[2], w8
+; CHECK-NEXT: mov.b v4[3], w8
+; CHECK-NEXT: mov.b v4[4], w8
+; CHECK-NEXT: mov.b v4[5], w8
+; CHECK-NEXT: mov.b v4[6], w8
+; CHECK-NEXT: mov w8, #32
+; CHECK-NEXT: mov.b v4[7], w0
+; CHECK-NEXT: mov.b v4[8], w8
+; CHECK-NEXT: mov w8, #36
+; CHECK-NEXT: mov.b v4[9], w8
+; CHECK-NEXT: mov w8, #40
+; CHECK-NEXT: mov.b v4[10], w8
+; CHECK-NEXT: mov w8, #44
+; CHECK-NEXT: mov.b v4[11], w8
+; CHECK-NEXT: mov w8, #48
+; CHECK-NEXT: mov.b v4[12], w8
+; CHECK-NEXT: mov w8, #52
+; CHECK-NEXT: mov.b v4[13], w8
+; CHECK-NEXT: mov w8, #56
+; CHECK-NEXT: mov.b v4[14], w8
+; CHECK-NEXT: mov w8, #31
+; CHECK-NEXT: mov.b v4[15], w8
+; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-NEXT: ret
+  %ins.0 = insertelement <16 x i8> poison, i8 1, i32 0
+  %ins.1 = insertelement <16 x i8> %ins.0, i8 1, i32 1
+  %ins.2 = insertelement <16 x i8> %ins.1, i8 1, i32 2
+  %ins.3 = insertelement <16 x i8> %ins.2, i8 1, i32 3
+  %ins.4 = insertelement <16 x i8> %ins.3, i8 1, i32 4
+  %ins.5 = insertelement <16 x i8> %ins.4, i8 1, i32 5
+  %ins.6 = insertelement <16 x i8> %ins.5, i8 1, i32 6
+  %ins.7 = insertelement <16 x i8> %ins.6, i8 1, i32 7
+  %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8
+  %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9
+  %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10
+  %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11
+  %ins.12 = insertelement <16 x i8> %ins.11, i8 %v, i32 12
+  %ins.13 = insertelement <16 x i8> %ins.12, i8 %v, i32 13
+  %ins.14 = insertelement <16 x i8> %ins.13, i8 -1, i32 14
+  %ins.15 = insertelement <16 x i8> %ins.14, i8 %v, i32 15
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15)
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> )
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> 
+  ret <16 x i8> %s
+}
+
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
 ; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi.2d v4, #0xffffffffffffffff
-; CHECK-NEXT: adrp x8, .LCPI11_0
+; CHECK-NEXT: adrp x8, .LCPI12_0
 ; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI11_0]
+; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI12_0]
 ; CHECK-NEXT: mov.b v4[0], w0
 ; CHECK-NEXT: tbl.16b v2, { v2, v3 }, v5
 ; CHECK-NEXT: mov.b v4[1], w0
@@ -247,20 +312,80 @@
   ret <16 x i8> %s
 }
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_incompatible_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_incompatible_shuffle:
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI12_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: mov w8, #255
+; CHECK-NEXT: dup.16b v4, w0
+; CHECK-NEXT: adrp x9, .LCPI13_0
 ; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_0]
-; CHECK-NEXT: adrp x8, .LCPI12_1
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v4
-; CHECK-NEXT: tbl.16b v1, { v2, v3 }, v4
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1]
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v2
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: mov.b v4[8], w8
+; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI13_0]
+; CHECK-NEXT: mov.b v4[9], w8
+; CHECK-NEXT: tbl.16b v2, { v2, v3 }, v5
+; CHECK-NEXT: mov.b v4[10], w8
+; CHECK-NEXT: mov.b v4[11], w8
+; CHECK-NEXT: mov.b v4[12], w8
+; CHECK-NEXT: mov.b v4[13], w8
+; CHECK-NEXT: adrp x8, .LCPI13_1
+; CHECK-NEXT: tbl.16b v3, { v0, v1 }, v4
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI13_1]
+; CHECK-NEXT: tbl.16b v0, { v2, v3 }, v0
+; CHECK-NEXT: ret
+  %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
+  %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
+  %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
+  %ins.3 = insertelement <16 x i8> %ins.2, i8 %v, i32 3
+  %ins.4 = insertelement <16 x i8> %ins.3, i8 %v, i32 4
+  %ins.5 = insertelement <16 x i8> %ins.4, i8 %v, i32 5
+  %ins.6 = insertelement <16 x i8> %ins.5, i8 %v, i32 6
+  %ins.7 = insertelement <16 x i8> %ins.6, i8 %v, i32 7
+  %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8
+  %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9
+  %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10
+  %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11
+  %ins.12 = insertelement <16 x i8> %ins.11, i8 -1, i32 12
+  %ins.13 = insertelement <16 x i8> %ins.12, i8 -1, i32 13
+  %ins.14 = insertelement <16 x i8> %ins.13, i8 %v, i32 14
+  %ins.15 = insertelement <16 x i8> %ins.14, i8 %v, i32 15
+  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> )
+  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15)
+  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> 
+  ret <16 x i8> %s
+}
+
+
+; CHECK-LABEL: .LCPI14_0:
+; CHECK-NEXT: .byte 0 // 0x0
+; CHECK-NEXT: .byte 4 // 0x4
+; CHECK-NEXT: .byte 52 // 0x34
+; CHECK-NEXT: .byte 12 // 0xc
+; CHECK-NEXT: .byte 16 // 0x10
+; CHECK-NEXT: .byte 20 // 0x14
+; CHECK-NEXT: .byte 24 // 0x18
+; CHECK-NEXT: .byte 28 // 0x1c
+; CHECK-NEXT: .byte 32 // 0x20
+; CHECK-NEXT: .byte 36 // 0x24
+; CHECK-NEXT: .byte 40 // 0x28
+; CHECK-NEXT: .byte 44 // 0x2c
+; CHECK-NEXT: .byte 48 // 0x30
+; CHECK-NEXT: .byte 52 // 0x34
+; CHECK-NEXT: .byte 56 // 0x38
+; CHECK-NEXT: .byte 60 // 0x3c
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI14_0
+; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI14_0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4
 ; CHECK-NEXT: ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> )
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> )
@@ -268,22 +393,34 @@
   ret <16 x i8> %s
 }
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_incompatible_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_incompatible_tbl2_mask1:
+; CHECK-LABEL: .LCPI15_0:
+; CHECK-NEXT: .byte 0 // 0x0
+; CHECK-NEXT: .byte 4 // 0x4
+; CHECK-NEXT: .byte 52 // 0x34
+; CHECK-NEXT: .byte 12 // 0xc
+; CHECK-NEXT: .byte 16 // 0x10
+; CHECK-NEXT: .byte 20 // 0x14
+; CHECK-NEXT: .byte 24 // 0x18
+; CHECK-NEXT: .byte 28 // 0x1c
+; CHECK-NEXT: .byte 32 // 0x20
+; CHECK-NEXT: .byte 36 // 0x24
+; CHECK-NEXT: .byte 40 // 0x28
+; CHECK-NEXT: .byte 44 // 0x2c
+; CHECK-NEXT: .byte 48 // 0x30
+; CHECK-NEXT: .byte 52 // 0x34
+; CHECK-NEXT: .byte 56 // 0x38
+; CHECK-NEXT: .byte 60 // 0x3c
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI13_0
-; CHECK-NEXT: adrp x9, .LCPI13_1
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI13_0]
-; CHECK-NEXT: adrp x8, .LCPI13_2
-; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI13_1]
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v4
-; CHECK-NEXT: tbl.16b v1, { v2, v3 }, v5
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_2]
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v2
+; CHECK-NEXT: adrp x8, .LCPI15_0
+; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4
 ; CHECK-NEXT: ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> )
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> )
@@ -291,22 +428,34 @@
   ret <16 x i8> %s
 }
 
-define <16 x i8> @shuffled_tbl2_to_tbl4_incompatible_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_incompatible_tbl2_mask2:
+; CHECK-LABEL: .LCPI16_0:
+; CHECK-NEXT: .byte 0 // 0x0
+; CHECK-NEXT: .byte 4 // 0x4
+; CHECK-NEXT: .byte 52 // 0x34
+; CHECK-NEXT: .byte 12 // 0xc
+; CHECK-NEXT: .byte 16 // 0x10
+; CHECK-NEXT: .byte 20 // 0x14
+; CHECK-NEXT: .byte 24 // 0x18
+; CHECK-NEXT: .byte 28 // 0x1c
+; CHECK-NEXT: .byte 32 // 0x20
+; CHECK-NEXT: .byte 36 // 0x24
+; CHECK-NEXT: .byte 40 // 0x28
+; CHECK-NEXT: .byte 44 // 0x2c
+; CHECK-NEXT: .byte 48 // 0x30
+; CHECK-NEXT: .byte 52 // 0x34
+; CHECK-NEXT: .byte 56 // 0x38
+; CHECK-NEXT: .byte 60 // 0x3c
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI14_0
-; CHECK-NEXT: adrp x9, .LCPI14_1
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI14_0]
-; CHECK-NEXT: adrp x8, .LCPI14_2
-; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI14_1]
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v4
-; CHECK-NEXT: tbl.16b v1, { v2, v3 }, v5
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_2]
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v2
+; CHECK-NEXT: adrp x8, .LCPI16_0
+; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4
 ; CHECK-NEXT: ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> )
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> )
diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
--- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -94,14 +94,14 @@
 ; CHECK-NEXT: .byte 20 ; 0x14
 ; CHECK-NEXT: .byte 24 ; 0x18
 ; CHECK-NEXT: .byte 28 ; 0x1c
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 32 ; 0x20
+; CHECK-NEXT: .byte 36 ; 0x24
+; CHECK-NEXT: .byte 40 ; 0x28
+; CHECK-NEXT: .byte 44 ; 0x2c
+; CHECK-NEXT: .byte 48 ; 0x30
+; CHECK-NEXT: .byte 52 ; 0x34
+; CHECK-NEXT: .byte 56 ; 0x38
+; CHECK-NEXT: .byte 60 ; 0x3c
 
 ; Tbl can also be used when combining multiple fptoui using a shuffle. The loop
 ; vectorizer may create such patterns.
@@ -118,16 +118,14 @@
 ; CHECK-NEXT: lsl x9, x8, #5
 ; CHECK-NEXT: add x10, x0, x9
 ; CHECK-NEXT: add x9, x1, x9
-; CHECK-NEXT: ldp q1, q2, [x10]
+; CHECK-NEXT: ldp q2, q1, [x10]
 ; CHECK-NEXT: ldp q4, q3, [x9]
-; CHECK-NEXT: fcvtzu.4s v6, v2
-; CHECK-NEXT: fcvtzu.4s v5, v1
-; CHECK-NEXT: fcvtzu.4s v2, v3
-; CHECK-NEXT: fcvtzu.4s v1, v4
-; CHECK-NEXT: tbl.16b v3, { v5, v6 }, v0
-; CHECK-NEXT: tbl.16b v1, { v1, v2 }, v0
-; CHECK-NEXT: mov.d v3[1], v1[0]
-; CHECK-NEXT: str q3, [x2, x8, lsl #4]
+; CHECK-NEXT: fcvtzu.4s v17, v1
+; CHECK-NEXT: fcvtzu.4s v16, v2
+; CHECK-NEXT: fcvtzu.4s v19, v3
+; CHECK-NEXT: fcvtzu.4s v18, v4
+; CHECK-NEXT: tbl.16b v1, { v16, v17, v18, v19 }, v0
+; CHECK-NEXT: str q1, [x2, x8, lsl #4]
 ; CHECK-NEXT: add x8, x8, #1
 ; CHECK-NEXT: cmp x8, #1000
 ; CHECK-NEXT: b.eq LBB2_1
@@ -157,75 +155,50 @@
 }
 
 ; CHECK-LABEL: lCPI3_0:
-; CHECK-NEXT: .byte 0 ; 0x0
-; CHECK-NEXT: .byte 4 ; 0x4
-; CHECK-NEXT: .byte 8 ; 0x8
-; CHECK-NEXT: .byte 12 ; 0xc
-; CHECK-NEXT: .byte 16 ; 0x10
-; CHECK-NEXT: .byte 20 ; 0x14
-; CHECK-NEXT: .byte 24 ; 0x18
-; CHECK-NEXT: .byte 28 ; 0x1c
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: lCPI3_1:
-; CHECK-NEXT: .byte 0 ; 0x0
-; CHECK-NEXT: .byte 17 ; 0x11
-; CHECK-NEXT: .byte 2 ; 0x2
-; CHECK-NEXT: .byte 3 ; 0x3
-; CHECK-NEXT: .byte 4 ; 0x4
-; CHECK-NEXT: .byte 5 ; 0x5
-; CHECK-NEXT: .byte 6 ; 0x6
-; CHECK-NEXT: .byte 19 ; 0x13
-; CHECK-NEXT: .byte 16 ; 0x10
-; CHECK-NEXT: .byte 17 ; 0x11
-; CHECK-NEXT: .byte 18 ; 0x12
-; CHECK-NEXT: .byte 19 ; 0x13
-; CHECK-NEXT: .byte 20 ; 0x14
-; CHECK-NEXT: .byte 3 ; 0x3
-; CHECK-NEXT: .byte 22 ; 0x16
-; CHECK-NEXT: .byte 23 ; 0x17
+; CHECK-NEXT: .byte 0 ; 0x0
+; CHECK-NEXT: .byte 36 ; 0x24
+; CHECK-NEXT: .byte 8 ; 0x8
+; CHECK-NEXT: .byte 12 ; 0xc
+; CHECK-NEXT: .byte 16 ; 0x10
+; CHECK-NEXT: .byte 20 ; 0x14
+; CHECK-NEXT: .byte 24 ; 0x18
+; CHECK-NEXT: .byte 44 ; 0x2c
+; CHECK-NEXT: .byte 32 ; 0x20
+; CHECK-NEXT: .byte 36 ; 0x24
+; CHECK-NEXT: .byte 40 ; 0x28
+; CHECK-NEXT: .byte 44 ; 0x2c
+; CHECK-NEXT: .byte 48 ; 0x30
+; CHECK-NEXT: .byte 12 ; 0xc
+; CHECK-NEXT: .byte 56 ; 0x38
+; CHECK-NEXT: .byte 60 ; 0x3c
 
-; We need multiple tbl for the shuffle.
 define void @fptoui_2x_v8f32_to_v8i8_in_loop_no_concat_shuffle(ptr %A, ptr %B, ptr %dst) {
 ; CHECK-LABEL: fptoui_2x_v8f32_to_v8i8_in_loop_no_concat_shuffle:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: Lloh4:
 ; CHECK-NEXT: adrp x9, lCPI3_0@PAGE
-; CHECK-NEXT: Lloh5:
-; CHECK-NEXT: adrp x10, lCPI3_1@PAGE
 ; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh6:
+; CHECK-NEXT: Lloh5:
 ; CHECK-NEXT: ldr q0, [x9, lCPI3_0@PAGEOFF]
-; CHECK-NEXT: Lloh7:
-; CHECK-NEXT: ldr q1, [x10, lCPI3_1@PAGEOFF]
 ; CHECK-NEXT: LBB3_1: ; %loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: lsl x9, x8, #5
 ; CHECK-NEXT: add x10, x0, x9
 ; CHECK-NEXT: add x9, x1, x9
-; CHECK-NEXT: ldp q2, q3, [x10]
-; CHECK-NEXT: ldp q5, q4, [x9]
-; CHECK-NEXT: fcvtzu.4s v7, v3
-; CHECK-NEXT: fcvtzu.4s v6, v2
-; CHECK-NEXT: fcvtzu.4s v3, v4
-; CHECK-NEXT: fcvtzu.4s v2, v5
-; CHECK-NEXT: tbl.16b v4, { v6, v7 }, v0
-; CHECK-NEXT: tbl.16b v5, { v2, v3 }, v0
-; CHECK-NEXT: tbl.16b v2, { v4, v5 }, v1
-; CHECK-NEXT: str q2, [x2, x8, lsl #4]
+; CHECK-NEXT: ldp q2, q1, [x10]
+; CHECK-NEXT: ldp q4, q3, [x9]
+; CHECK-NEXT: fcvtzu.4s v17, v1
+; CHECK-NEXT: fcvtzu.4s v16, v2
+; CHECK-NEXT: fcvtzu.4s v19, v3
+; CHECK-NEXT: fcvtzu.4s v18, v4
+; CHECK-NEXT: tbl.16b v1, { v16, v17, v18, v19 }, v0
+; CHECK-NEXT: str q1, [x2, x8, lsl #4]
 ; CHECK-NEXT: add x8, x8, #1
 ; CHECK-NEXT: cmp x8, #1000
 ; CHECK-NEXT: b.eq LBB3_1
 ; CHECK-NEXT: ; %bb.2: ; %exit
 ; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh5, Lloh7
-; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh6
+; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5
 entry:
   br label %loop
 
@@ -269,10 +242,10 @@
 define void @fptoui_v16f32_to_v16i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-LABEL: fptoui_v16f32_to_v16i8_in_loop:
 ; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh8:
+; CHECK-NEXT: Lloh6:
 ; CHECK-NEXT: adrp x9, lCPI4_0@PAGE
 ; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh9:
+; CHECK-NEXT: Lloh7:
 ; CHECK-NEXT: ldr q0, [x9, lCPI4_0@PAGEOFF]
 ; CHECK-NEXT: LBB4_1: ; %loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -290,7 +263,7 @@
 ; CHECK-NEXT: b.eq LBB4_1
 ; CHECK-NEXT: ; %bb.2: ; %exit
 ; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh9
+; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7
 entry:
   br label %loop
 
@@ -330,10 +303,10 @@
 define void @fptoui_2x_v16f32_to_v16i8_in_loop(ptr %A, ptr %B, ptr %dst) {
 ; CHECK-LABEL: fptoui_2x_v16f32_to_v16i8_in_loop:
 ; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh10:
+; CHECK-NEXT: Lloh8:
 ; CHECK-NEXT: adrp x9, lCPI5_0@PAGE
 ; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh11:
+; CHECK-NEXT: Lloh9:
 ; CHECK-NEXT: ldr q0, [x9, lCPI5_0@PAGEOFF]
 ; CHECK-NEXT: LBB5_1: ; %loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -361,7 +334,7 @@
 ; CHECK-NEXT: b.eq LBB5_1
 ; CHECK-NEXT: ; %bb.2: ; %exit
 ; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh11
+; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh9
 entry:
   br label %loop
 
@@ -503,14 +476,14 @@
 define void @uitofp_v8i8_to_v8f32(ptr %src, ptr %dst) {
 ; CHECK-LABEL: uitofp_v8i8_to_v8f32:
 ; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh12:
+; CHECK-NEXT: Lloh10:
 ; CHECK-NEXT: adrp x9, lCPI8_0@PAGE
-; CHECK-NEXT: Lloh13:
+; CHECK-NEXT: Lloh11:
 ; CHECK-NEXT: adrp x10, lCPI8_1@PAGE
 ; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh14:
+; CHECK-NEXT: Lloh12:
 ; CHECK-NEXT: ldr q0, [x9, lCPI8_0@PAGEOFF]
-; CHECK-NEXT: Lloh15:
+; CHECK-NEXT: Lloh13:
 ; CHECK-NEXT: ldr q1, [x10, lCPI8_1@PAGEOFF]
 ; CHECK-NEXT: LBB8_1: ; %loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -526,8 +499,8 @@
 ; CHECK-NEXT: b.eq LBB8_1
 ; CHECK-NEXT: ; %bb.2: ; %exit
 ; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh15
-; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh14
+; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh13
+; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh12
 entry:
   br label %loop
 
@@ -618,22 +591,22 @@
 define void @uitofp_v16i8_to_v16f32(ptr %src, ptr %dst) {
 ; CHECK-LABEL: uitofp_v16i8_to_v16f32:
 ; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh16:
+; CHECK-NEXT: Lloh14:
 ; CHECK-NEXT: adrp x9, lCPI9_0@PAGE
-; CHECK-NEXT: Lloh17:
+; CHECK-NEXT: Lloh15:
 ; CHECK-NEXT: adrp x10, lCPI9_1@PAGE
-; CHECK-NEXT: Lloh18:
+; CHECK-NEXT: Lloh16:
 ; CHECK-NEXT: adrp x11, lCPI9_2@PAGE
-; CHECK-NEXT: Lloh19:
+; CHECK-NEXT: Lloh17:
 ; CHECK-NEXT: adrp x12, lCPI9_3@PAGE
 ; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh20:
+; CHECK-NEXT: Lloh18:
 ; CHECK-NEXT: ldr q0, [x9, lCPI9_0@PAGEOFF]
-; CHECK-NEXT: Lloh21:
+; CHECK-NEXT: Lloh19:
 ; CHECK-NEXT: ldr q1, [x10, lCPI9_1@PAGEOFF]
-; CHECK-NEXT: Lloh22:
+; CHECK-NEXT: Lloh20:
 ; CHECK-NEXT: ldr q2, [x11, lCPI9_2@PAGEOFF]
-; CHECK-NEXT: Lloh23:
+; CHECK-NEXT: Lloh21:
 ; CHECK-NEXT: ldr q3, [x12, lCPI9_3@PAGEOFF]
 ; CHECK-NEXT: LBB9_1: ; %loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -654,10 +627,10 @@
 ; CHECK-NEXT: b.eq LBB9_1
 ; CHECK-NEXT: ; %bb.2: ; %exit
 ; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh19, Lloh23
-; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh22
 ; CHECK-NEXT: .loh AdrpLdr Lloh17, Lloh21
 ; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh20
+; CHECK-NEXT: .loh AdrpLdr Lloh15, Lloh19
+; CHECK-NEXT: .loh AdrpLdr Lloh14, Lloh18
 entry:
   br label %loop
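
For reference: the fold above rewrites a shufflevector of two tbl2 intrinsic results into a single tbl4 whose mask indexes all four source registers; mask entries taken from the second tbl2 are offset by 32 because its sources become registers 2 and 3 of the tbl4. Below is a minimal IR sketch of the shape of the transformation. The mask constants and the shuffle mask are illustrative assumptions (the actual constants in the tests above are elided in this copy of the patch), not values taken from the patch itself.

; Before: two tbl2 lookups whose low halves are concatenated by a shuffle.
declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)

define <16 x i8> @before(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
  %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b,
          <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28,
                     i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d,
          <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28,
                     i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  %s = shufflevector <16 x i8> %t1, <16 x i8> %t2,
          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                      i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <16 x i8> %s
}

; After the fold: one tbl4 over { a, b, c, d }. Mask entries that came from
; %t2 are the second mask's constants plus 32 (e.g. 48 = 16 + 32), matching
; the 0,4,...,28,32,...,60 constant pools checked in the updated tests.
define <16 x i8> @after(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
  %r = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d,
          <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28,
                     i8 32, i8 36, i8 40, i8 44, i8 48, i8 52, i8 56, i8 60>)
  ret <16 x i8> %r
}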