Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11513,6 +11513,138 @@
                        Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
 }
 
+static SDValue tryHandleMaskWithSplats(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  SDValue V0 = Op.getOperand(0);
+  SDValue V1 = Op.getOperand(1);
+  int NumElts = VT.getVectorNumElements();
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+  // DifferentLaneMap keeps <lane, consecutive lane count> pairs of the mask.
+  SmallMapVector<int, int, 8> DifferentLaneMap;
+  int ConsecutiveLaneCount = 0;
+  int PrevLane = -1;
+  for (int i = 0; i < NumElts; ++i) {
+    int Lane = Mask[i];
+    if (PrevLane != Lane) {
+      ConsecutiveLaneCount = 0;
+      PrevLane = Lane;
+    }
+
+    if (Lane == UndefMaskElem)
+      return SDValue();
+
+    // Keep each different lane and its consecutive count.
+    DifferentLaneMap[Lane] = ++ConsecutiveLaneCount;
+  }
+
+  int NumDifferentLanes = DifferentLaneMap.size();
+  if (NumDifferentLanes == 2) {
+    SmallVector<std::pair<SDValue, SDValue>, 2> Lanes;
+    bool canUseVECTOR_CONCAT = true;
+    for (auto Pair : DifferentLaneMap) {
+      // Check that all different lanes have the same run length.
+      if (Pair.second != NumElts / NumDifferentLanes)
+        canUseVECTOR_CONCAT = false;
+      // Keep the source vector and its lane.
+      SDValue SrcVec = Pair.first < NumElts ? V0 : V1;
+      int Lane = Pair.first < NumElts ? Pair.first : Pair.first - NumElts;
+      Lanes.push_back(
+          std::make_pair(SrcVec, DAG.getConstant(Lane, dl, MVT::i64)));
+    }
+
+    // If the mask consists of two splats which have the same length, try to
+    // generate DUPs and a concat_vectors. For example,
+    //
+    //   t2: v8i16,ch = CopyFromReg t0, Register:v8i16 %0
+    //   t4: v8i16,ch = CopyFromReg t0, Register:v8i16 %1
+    //   t5: v8i16 = vector_shuffle<0,0,0,0,8,8,8,8> t2, t4
+    // ==>
+    //   t2: v8i16,ch = CopyFromReg t0, Register:v8i16 %0
+    //   t12: v4i16 = AArch64ISD::DUPLANE16 t2, Constant:i64<0>
+    //   t4: v8i16,ch = CopyFromReg t0, Register:v8i16 %1
+    //   t13: v4i16 = AArch64ISD::DUPLANE16 t4, Constant:i64<0>
+    //   t14: v8i16 = concat_vectors t12, t13
+    if (canUseVECTOR_CONCAT) {
+      EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+      if (DAG.getTargetLoweringInfo().isTypeLegal(SubVT) && SubVT.isVector() &&
+          SubVT.getVectorNumElements() >= 4) {
+        unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
+        SDValue DUP1 =
+            DAG.getNode(Opcode, dl, SubVT, Lanes[0].first, Lanes[0].second);
+        SDValue DUP2 =
+            DAG.getNode(Opcode, dl, SubVT, Lanes[1].first, Lanes[1].second);
+        SDValue CONCAT_VECTORS =
+            DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
+        return CONCAT_VECTORS;
+      }
+    }
+  }
+
+  // From here, DifferentLaneMap keeps <lane, total lane count> pairs of the
+  // mask.
+  DifferentLaneMap.clear();
+
+  for (int i = 0; i < NumElts; ++i) {
+    int Lane = Mask[i];
+    if (Lane == UndefMaskElem)
+      return SDValue();
+    ++DifferentLaneMap[Lane];
+  }
+
+  int DUPCandidateLane = -1;
+  int DUPLaneCount = 0;
+  for (auto Pair : DifferentLaneMap) {
+    if (Pair.second > DUPLaneCount) {
+      DUPCandidateLane = Pair.first;
+      DUPLaneCount = Pair.second;
+    }
+  }
+
+  // Let's try to generate a DUP and INSs. For example,
+  //
+  //   t2: v8f16,ch = CopyFromReg t0, Register:v8f16 %0
+  //   t4: v8f16,ch = CopyFromReg t0, Register:v8f16 %1
+  //   t5: v8f16 = vector_shuffle<0,0,0,0,0,8,1,15> t2, t4
+  // ==>
+  //   t2: v8f16,ch = CopyFromReg t0, Register:v8f16 %0
+  //   t4: v8f16,ch = CopyFromReg t0, Register:v8f16 %1
+  //   t12: f16 = extract_vector_elt t2, Constant:i64<0>
+  //   t13: v8f16 = AArch64ISD::DUP t12
+  //   t14: f16 = extract_vector_elt t4, Constant:i64<0>
+  //   t16: v8f16 = insert_vector_elt t13, t14, Constant:i64<5>
+  //   t18: f16 = extract_vector_elt t2, Constant:i64<1>
+  //   t20: v8f16 = insert_vector_elt t16, t18, Constant:i64<6>
+  //   t22: f16 = extract_vector_elt t4, Constant:i64<7>
+  //   t23: v8f16 = insert_vector_elt t20, t22, Constant:i64<7>
+  if (DUPLaneCount > NumElts / 2 &&
+      DAG.getTargetLoweringInfo().isTypeLegal(VT.getVectorElementType())) {
+    // Create the DUP.
+    int SrcLane = (DUPCandidateLane >= NumElts) ? DUPCandidateLane - NumElts
+                                                : DUPCandidateLane;
+    SDValue SrcVec = (DUPCandidateLane >= NumElts) ? V1 : V0;
+    SDValue SrcElt =
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(),
+                    SrcVec, DAG.getConstant(SrcLane, dl, MVT::i64));
+    SDValue NewVec = DAG.getNode(AArch64ISD::DUP, dl, VT, SrcElt);
+    // Create the INSs.
+    for (int i = 0; i < NumElts; ++i) {
+      if (Mask[i] != DUPCandidateLane) {
+        SrcLane = (Mask[i] >= NumElts) ? Mask[i] - NumElts : Mask[i];
+        SDValue SrcVec = (Mask[i] >= NumElts) ? V1 : V0;
+        SDValue SrcElt =
+            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                        VT.getVectorElementType(), SrcVec,
+                        DAG.getConstant(SrcLane, dl, MVT::i64));
+        NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVec, SrcElt,
+                             DAG.getConstant(i, dl, MVT::i64));
+      }
+    }
+    return NewVec;
+  }
+
+  return SDValue();
+}
+
 // Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
 // but we don't have an appropriate instruction,
 // so custom-lower it as ZIP1-with-zeros.
@@ -11708,6 +11840,9 @@
                                  dl);
   }
 
+  if (SDValue NewSD = tryHandleMaskWithSplats(Op, DAG))
+    return NewSD;
+
   return GenerateTBL(Op, ShuffleMask, DAG);
 }
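
The two code paths above are selected purely from the shape of the shuffle mask. Below is a minimal standalone sketch of that classification, with no LLVM dependencies; the names classifyMask and MaskKind are hypothetical, used only for illustration.

// A standalone sketch of the mask classification above (plain C++, not
// LLVM code; classifyMask and MaskKind are hypothetical names).
#include <cstdio>
#include <map>
#include <vector>

enum class MaskKind { TwoSplatConcat, DupWithInserts, None };

// Mirrors tryHandleMaskWithSplats: two equal-length runs of two distinct
// lanes lower to DUP+DUP+CONCAT_VECTORS; a single lane covering more than
// half of the mask lowers to one DUP plus per-element INSs.
static MaskKind classifyMask(const std::vector<int> &Mask) {
  int NumElts = static_cast<int>(Mask.size());
  std::map<int, int> RunLen; // lane -> length of its latest consecutive run
  int Run = 0, Prev = -1;
  for (int Lane : Mask) {
    if (Lane < 0)
      return MaskKind::None; // bail out on undef lanes, as the patch does
    if (Lane != Prev) {
      Run = 0;
      Prev = Lane;
    }
    RunLen[Lane] = ++Run;
  }

  if (RunLen.size() == 2) {
    bool EqualHalves = true;
    for (const auto &P : RunLen)
      EqualHalves = EqualHalves && P.second == NumElts / 2;
    if (EqualHalves)
      return MaskKind::TwoSplatConcat;
  }

  std::map<int, int> UseCount; // lane -> total number of uses in the mask
  for (int Lane : Mask)
    ++UseCount[Lane];
  for (const auto &P : UseCount)
    if (P.second > NumElts / 2)
      return MaskKind::DupWithInserts;
  return MaskKind::None;
}

int main() {
  // <0,0,0,0,8,8,8,8>: two equal-length splats -> DUPs + CONCAT_VECTORS.
  std::printf("%d\n", static_cast<int>(classifyMask({0, 0, 0, 0, 8, 8, 8, 8})));
  // <0,0,0,0,0,8,1,15>: lane 0 dominates -> DUP + INS for the other lanes.
  std::printf("%d\n", static_cast<int>(classifyMask({0, 0, 0, 0, 0, 8, 1, 15})));
}

Note that the run-length pass deliberately rejects masks such as <0,8,0,8,...>, where the two lanes interleave rather than form two contiguous halves; this matches the canUseVECTOR_CONCAT check in the patch.
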
Index: llvm/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll
+++ llvm/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll
@@ -1,35 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
 
 ; rdar://13214163 - Make sure we generate a correct lookup table for the TBL
 ; instruction when the element size of the vector is not 8 bits. We were
 ; getting both the endianness wrong and the element indexing wrong.
 define <8 x i16> @foo(<8 x i16> %a) nounwind readnone {
-; CHECK: .section __TEXT,__literal16,16byte_literals
-; CHECK: .p2align 4
-; CHECK:lCPI0_0:
-; CHECK: .byte 0 ; 0x0
-; CHECK: .byte 1 ; 0x1
-; CHECK: .byte 0 ; 0x0
-; CHECK: .byte 1 ; 0x1
-; CHECK: .byte 0 ; 0x0
-; CHECK: .byte 1 ; 0x1
-; CHECK: .byte 0 ; 0x0
-; CHECK: .byte 1 ; 0x1
-; CHECK: .byte 8 ; 0x8
-; CHECK: .byte 9 ; 0x9
-; CHECK: .byte 8 ; 0x8
-; CHECK: .byte 9 ; 0x9
-; CHECK: .byte 8 ; 0x8
-; CHECK: .byte 9 ; 0x9
-; CHECK: .byte 8 ; 0x8
-; CHECK: .byte 9 ; 0x9
-; CHECK: .section __TEXT,__text,regular,pure_instructions
-; CHECK: .globl _foo
-; CHECK: .p2align 2
-; CHECK:_foo: ; @foo
-; CHECK: adrp [[BASE:x[0-9]+]], lCPI0_0@PAGE
-; CHECK: ldr q[[REG:[0-9]+]], [[[BASE]], lCPI0_0@PAGEOFF]
-; CHECK: tbl.16b v0, { v0 }, v[[REG]]
-; CHECK: ret
+; CHECK-LABEL: foo:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    dup.4h v1, v0[4]
+; CHECK-NEXT:    dup.4h v0, v0[0]
+; CHECK-NEXT:    mov.d v0[1], v1[0]
+; CHECK-NEXT:    ret
   %val = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %val
 }
Index: llvm/test/CodeGen/AArch64/shuffles.ll
===================================================================
--- llvm/test/CodeGen/AArch64/shuffles.ll
+++ llvm/test/CodeGen/AArch64/shuffles.ll
@@ -170,11 +170,9 @@
 define <8 x i16> @test_shuf9(<8 x i16> %a, <8 x i16> %b)
 ; CHECK-LABEL: test_shuf9:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI13_0
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI13_0]
-; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    dup v1.4h, v1.h[0]
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ret
 {
   %r = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8>
@@ -184,9 +182,9 @@
 define <16 x i8> @test_shuf10(<16 x i8> %a, <16 x i8> %b)
 ; CHECK-LABEL: test_shuf10:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI14_0
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI14_0]
-; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    dup v1.8b, v0.b[8]
+; CHECK-NEXT:    dup v0.8b, v0.b[0]
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ret
 {
   %r = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
@@ -196,11 +194,9 @@
 define <8 x half> @test_shuf11(<8 x half> %a, <8 x half> %b)
 ; CHECK-LABEL: test_shuf11:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI15_0
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI15_0]
-; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    dup v1.4h, v1.h[0]
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ret
 {
   %r = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8>
@@ -210,11 +206,11 @@
 define <8 x half> @test_shuf12(<8 x half> %a, <8 x half> %b)
 ; CHECK-LABEL: test_shuf12:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI16_0
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI16_0]
-; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    dup v2.8h, v0.h[0]
+; CHECK-NEXT:    mov v2.h[5], v1.h[0]
+; CHECK-NEXT:    mov v2.h[6], v0.h[1]
+; CHECK-NEXT:    mov v2.h[7], v1.h[7]
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
 {
   %r = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 1, i32 15>
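
The TBL sequences removed above each paid for an adrp/ldr pair to load the byte-index table from the constant pool, plus kill copies to satisfy the tbl register-sequence constraint; the DUP/INS forms need no constant-pool load at all. On a patched tree, the updated assertions can be regenerated with utils/update_llc_test_checks.py (as the NOTE line records), or inspected directly via the tests' RUN lines, e.g. llc < llvm/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll -mtriple=arm64-apple-ios7.0.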