Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14783,14 +14783,82 @@
   // Either node could be a DUP, but it's not worth doing both of them (you'd
   // just as well use the non-high version) so look for a corresponding extract
   // operation on the other "wing".
+  SDValue OldDUP;
+  SDValue ExtractedDUP;
   if (isEssentiallyExtractHighSubvector(LHS)) {
+    OldDUP = RHS;
     RHS = tryExtendDUPToExtractHigh(RHS, DAG);
-    if (!RHS.getNode())
+    ExtractedDUP = RHS;
+    if (!RHS)
       return SDValue();
   } else if (isEssentiallyExtractHighSubvector(RHS)) {
+    OldDUP = LHS;
     LHS = tryExtendDUPToExtractHigh(LHS, DAG);
-    if (!LHS.getNode())
+    ExtractedDUP = LHS;
+    if (!LHS)
       return SDValue();
+  } else {
+    return SDValue();
+  }
+
+  // Now that the DUP has been extended, reuse the extended value in the
+  // low (non-high) versions of long ops that still use the old DUP value.
+  for (SDNode::use_iterator UI = OldDUP.getNode()->use_begin(),
+                            UE = OldDUP.getNode()->use_end();
+       UI != UE; ++UI) {
+    SDNode *User = *UI;
+    unsigned UserIID = getIntrinsicID(User);
+    // Skip users that are not long operations handled by this combine.
+    if (User->getOpcode() != ISD::ABDU && User->getOpcode() != ISD::ABDS &&
+        User->getOpcode() != ISD::INTRINSIC_WO_CHAIN) {
+      continue;
+    }
+    if (User->getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+      switch (UserIID) {
+      case Intrinsic::aarch64_neon_smull:
+      case Intrinsic::aarch64_neon_umull:
+      case Intrinsic::aarch64_neon_pmull:
+      case Intrinsic::aarch64_neon_sqdmull:
+        break;
+      default:
+        continue;
+      }
+    }
+
+    SDLoc dl(User);
+    MVT VT = OldDUP.getSimpleValueType();
+    SDValue LowExtracted =
+        DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtractedDUP.getOperand(0),
+                    DAG.getConstant(0, dl, MVT::i64));
+
+    // Patch the operands, but skip the user when the high version of the
+    // long op can be selected for it instead.
+    SDValue UserLHS =
+        User->getOperand((UserIID == Intrinsic::not_intrinsic) ? 0 : 1);
+    SDValue UserRHS =
+        User->getOperand((UserIID == Intrinsic::not_intrinsic) ? 1 : 2);
+    if (UserLHS == OldDUP) {
+      if (isEssentiallyExtractHighSubvector(UserRHS) ||
+          !UserRHS.getValueType().is64BitVector()) {
+        continue;
+      }
+      UserLHS = LowExtracted;
+    } else {
+      if (isEssentiallyExtractHighSubvector(UserLHS) ||
+          !UserLHS.getValueType().is64BitVector()) {
+        continue;
+      }
+      UserRHS = LowExtracted;
+    }
+
+    if (UserIID == Intrinsic::not_intrinsic) {
+      DCI.CombineTo(User, DAG.getNode(User->getOpcode(), dl,
+                                      User->getValueType(0), UserLHS, UserRHS));
+    } else {
+      DCI.CombineTo(User, DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
+                                      User->getValueType(0),
+                                      User->getOperand(0), UserLHS, UserRHS));
+    }
   }
 
   if (IID == Intrinsic::not_intrinsic)
Index: llvm/test/CodeGen/AArch64/aarch64-combine-long-op-dup-noduplicate.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/aarch64-combine-long-op-dup-noduplicate.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -asm-verbose=0 -mtriple=aarch64-linux-gnu -mattr=+neon | FileCheck %s
+; When the high and low versions of a long operation (e.g. umull, sabdl) use
+; the same vdup vector, the vdup is extended for the high version; the extended
+; vector should be reused by the low version so that no duplicate dup is emitted.
+;
+
+define <8 x i16> @test_umull_combine_constant(<16 x i8> %x) {
+; CHECK-LABEL: test_umull_combine_constant:
+; CHECK: movi v2.16b, #33
+; CHECK-NEXT: umull v1.8h, v0.8b, v2.8b
+; CHECK-NEXT: umlal2 v1.8h, v0.16b, v2.16b
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %lowx = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mul1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %lowx, <8 x i8> <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>)
+  %highx = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mul2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %highx, <8 x i8> <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>)
+  %res = add <8 x i16> %mul1, %mul2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_umull_combine_vdup(<16 x i8> %x, <16 x i8> %y, i8 %c) {
+; CHECK-LABEL: test_umull_combine_vdup:
+; CHECK: dup v2.16b, w0
+; CHECK-NEXT: umull v1.8h, v0.8b, v2.8b
+; CHECK-NEXT: umlal2 v1.8h, v0.16b, v2.16b
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %lowx = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %dup.i = insertelement <8 x i8> poison, i8 %c, i32 0
+  %dup = shufflevector <8 x i8> %dup.i, <8 x i8> poison, <8 x i32> zeroinitializer
+  %highx = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mul1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %lowx, <8 x i8> %dup)
+  %mul2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %highx, <8 x i8> %dup)
+  %res = add <8 x i16> %mul1, %mul2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_vabdl_combine_vdup(<16 x i8> %x, i8 %c) {
+; CHECK-LABEL: test_vabdl_combine_vdup:
+; CHECK: dup v2.16b, w0
+; CHECK-NEXT: sabdl v1.8h, v0.8b, v2.8b
+; CHECK-NEXT: sabal2 v1.8h, v0.16b, v2.16b
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %lowx = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %dup.i = insertelement <8 x i8> poison, i8 %c, i32 0
+  %dup = shufflevector <8 x i8> %dup.i, <8 x i8> poison, <8 x i32> zeroinitializer
+  %sabd1 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %lowx, <8 x i8> %dup)
+  %zext1 = zext <8 x i8> %sabd1 to <8 x i16>
+  %highx = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %sabd2 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %highx, <8 x i8> %dup)
+  %zext2 = zext <8 x i8> %sabd2 to <8 x i16>
+  %res = add <8 x i16> %zext1, %zext2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_umull_combine_constant2(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: test_umull_combine_constant2:
+; CHECK: movi v2.16b, #33
+; CHECK-NEXT: movi v3.16b, #119
+; CHECK-NEXT: umull v4.8h, v0.8b, v2.8b
+; CHECK-NEXT: umull2 v5.8h, v1.16b, v3.16b
+; CHECK-NEXT: umlsl v4.8h, v1.8b, v3.8b
+; CHECK-NEXT: umlsl2 v5.8h, v0.16b, v2.16b
+; CHECK-NEXT: add v0.8h, v4.8h, v5.8h
+; CHECK-NEXT: ret
+entry:
+  %lowx = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mul1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %lowx, <8 x i8> <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>)
+  %lowy = shufflevector <16 x i8> %y, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mul2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %lowy, <8 x i8> <i8 119, i8 119, i8 119, i8 119, i8 119, i8 119, i8 119, i8 119>)
+  %sub1 = sub <8 x i16> %mul1, %mul2
+  %highx = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mul3 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %highx, <8 x i8> <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>)
+  %highy = shufflevector <16 x i8> %y, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mul4 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %highy, <8 x i8> <i8 119, i8 119, i8 119, i8 119, i8 119, i8 119, i8 119, i8 119>)
+  %sub2 = sub <8 x i16> %mul3, %mul4
+  %res = sub <8 x i16> %sub1, %sub2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_umull_combine_vdup2(<16 x i8> %x, <16 x i8> %y, <8 x i8> %coeffs) {
+; CHECK-LABEL: test_umull_combine_vdup2:
+; CHECK: dup v3.16b, v2.b[0]
+; CHECK-NEXT: dup v2.16b, v2.b[1]
+; CHECK-NEXT: umull v4.8h, v0.8b, v3.8b
+; CHECK-NEXT: umull2 v5.8h, v1.16b, v2.16b
+; CHECK-NEXT: umlsl v4.8h, v1.8b, v2.8b
+; CHECK-NEXT: umlsl2 v5.8h, v0.16b, v3.16b
+; CHECK-NEXT: add v0.8h, v4.8h, v5.8h
+; CHECK-NEXT: ret
+  %lowx = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %dup1 = shufflevector <8 x i8> %coeffs, <8 x i8> poison, <8 x i32> zeroinitializer
+  %mul1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %lowx, <8 x i8> %dup1)
+  %dup2 = shufflevector <8 x i8> %coeffs, <8 x i8> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %lowy = shufflevector <16 x i8> %y, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mul2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %lowy, <8 x i8> %dup2)
+  %sub1 = sub <8 x i16> %mul1, %mul2
+  %highx = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mul3 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %highx, <8 x i8> %dup1)
+  %highy = shufflevector <16 x i8> %y, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mul4 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %highy, <8 x i8> %dup2)
+  %sub2 = sub <8 x i16> %mul3, %mul4
+  %res = sub <8 x i16> %sub1, %sub2
+  ret <8 x i16> %res
+}
+
+
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)