diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14783,14 +14783,88 @@
   // Either node could be a DUP, but it's not worth doing both of them (you'd
   // just as well use the non-high version) so look for a corresponding extract
   // operation on the other "wing".
+  SDValue OldDUP;
+  SDValue ExtractedDUP;
   if (isEssentiallyExtractHighSubvector(LHS)) {
+    OldDUP = RHS;
     RHS = tryExtendDUPToExtractHigh(RHS, DAG);
     if (!RHS.getNode())
       return SDValue();
+    ExtractedDUP = RHS;
   } else if (isEssentiallyExtractHighSubvector(RHS)) {
+    OldDUP = LHS;
     LHS = tryExtendDUPToExtractHigh(LHS, DAG);
     if (!LHS.getNode())
       return SDValue();
+    ExtractedDUP = LHS;
+  } else {
+    return SDValue();
+  }
+
+  // After extending the DUP, reuse the extended value in the non-high
+  // versions of long ops that have been using the old (narrow) DUP value.
+  for (SDNode::use_iterator UI = OldDUP.getNode()->use_begin(),
+                            UE = OldDUP.getNode()->use_end();
+       UI != UE; ++UI) {
+    SDNode *User = *UI;
+    unsigned UserIID = getIntrinsicID(User);
+    // Skip users that are not long operations we can rewrite.
+    if (User->getOpcode() != ISD::ABDU && User->getOpcode() != ISD::ABDS &&
+        User->getOpcode() != ISD::INTRINSIC_WO_CHAIN) {
+      continue;
+    }
+    if (User->getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+      switch (UserIID) {
+      case Intrinsic::aarch64_neon_smull:
+      case Intrinsic::aarch64_neon_umull:
+      case Intrinsic::aarch64_neon_pmull:
+      case Intrinsic::aarch64_neon_sqdmull:
+        break;
+      default:
+        continue;
+      }
+    }
+
+    SDLoc dl(User);
+    MVT NarrowTy = OldDUP.getSimpleValueType();
+    SDValue LowExtracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
+                                       ExtractedDUP.getOperand(0),
+                                       DAG.getConstant(0, dl, MVT::i64));
+
+    // Rewrite the user's operands, but skip the rewrite when the high
+    // version of the long op can be selected for that user instead.
+    SDValue UserLHS =
+        User->getOperand((UserIID == Intrinsic::not_intrinsic) ? 0 : 1);
+    SDValue UserRHS =
+        User->getOperand((UserIID == Intrinsic::not_intrinsic) ? 1 : 2);
+    if (UserLHS == OldDUP) {
+      if (isEssentiallyExtractHighSubvector(UserRHS)) {
+        continue;
+      }
+      if (!UserRHS.getValueType().is64BitVector()) {
+        continue;
+      }
+      UserLHS = LowExtracted;
+    } else {
+      if (isEssentiallyExtractHighSubvector(UserLHS)) {
+        continue;
+      }
+      if (!UserLHS.getValueType().is64BitVector()) {
+        continue;
+      }
+      UserRHS = LowExtracted;
+    }
+
+    SDValue NewNode;
+    if (UserIID == Intrinsic::not_intrinsic) {
+      NewNode = DAG.getNode(User->getOpcode(), SDLoc(User),
+                            User->getValueType(0), UserLHS, UserRHS);
+    } else {
+      NewNode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(User),
+                            User->getValueType(0), User->getOperand(0), UserLHS,
+                            UserRHS);
+    }
+    DCI.CombineTo(User, NewNode);
   }
 
   if (IID == Intrinsic::not_intrinsic)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-long-op-dup-noduplicate.ll b/llvm/test/CodeGen/AArch64/aarch64-combine-long-op-dup-noduplicate.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-long-op-dup-noduplicate.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+neon | FileCheck %s
+; When the high and low versions of a long operation (e.g. umull, sabdl) use the
+; same vdup vector, the vdup is extended for the high version and the extended
+; vector should be reused by the low version so the vdup is not duplicated.
+;
+
+define <8 x i16> @test_umull_combine_constant(<16 x i8> %x) {
+; CHECK-LABEL: test_umull_combine_constant:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.16b, #33
+; CHECK-NEXT:    umull v1.8h, v0.8b, v2.8b
+; CHECK-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %1 = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %1, <8 x i8> <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>)
+  %3 = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %3, <8 x i8> <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>)
+  %res = sub <8 x i16> %2, %4
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_umull_combine_vdup(<16 x i8> %x, <16 x i8> %y, <8 x i8> %coeffs) {
+; CHECK-LABEL: test_umull_combine_vdup:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    dup v2.16b, v2.b[0]
+; CHECK-NEXT:    umull v1.8h, v0.8b, v2.8b
+; CHECK-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %1 = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = shufflevector <8 x i8> %coeffs, <8 x i8> poison, <8 x i32> zeroinitializer
+  %3 = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %1, <8 x i8> %2)
+  %5 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %3, <8 x i8> %2)
+  %res = sub <8 x i16> %4, %5
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_umull_combine_constant2(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: test_umull_combine_constant2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.16b, #33
+; CHECK-NEXT:    movi v3.16b, #119
+; CHECK-NEXT:    umull v4.8h, v0.8b, v2.8b
+; CHECK-NEXT:    umull2 v5.8h, v1.16b, v3.16b
+; CHECK-NEXT:    umlsl v4.8h, v1.8b, v3.8b
+; CHECK-NEXT:    umlsl2 v5.8h, v0.16b, v2.16b
+; CHECK-NEXT:    add v0.8h, v4.8h, v5.8h
+; CHECK-NEXT:    ret
+  %1 = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %1, <8 x i8> <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>)
+  %3 = shufflevector <16 x i8> %y, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %4 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %3, <8 x i8> <i8 119, i8 119, i8 119, i8 119, i8 119, i8 119, i8 119, i8 119>)
+  %sub1 = sub <8 x i16> %2, %4
+  %5 = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %5, <8 x i8> <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>)
+  %7 = shufflevector <16 x i8> %y, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %8 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %7, <8 x i8> <i8 119, i8 119, i8 119, i8 119, i8 119, i8 119, i8 119, i8 119>)
+  %sub2 = sub <8 x i16> %6, %8
+  %res = sub <8 x i16> %sub1, %sub2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_umull_combine_vdup2(<16 x i8> %x, <16 x i8> %y, <8 x i8> %coeffs) {
+; CHECK-LABEL: test_umull_combine_vdup2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    dup v3.16b, v2.b[0]
+; CHECK-NEXT:    dup v2.16b, v2.b[1]
+; CHECK-NEXT:    umull v4.8h, v0.8b, v3.8b
+; CHECK-NEXT:    umull2 v5.8h, v2.16b, v1.16b
+; CHECK-NEXT:    umlsl v4.8h, v2.8b, v1.8b
+; CHECK-NEXT:    umlsl2 v5.8h, v3.16b, v0.16b
+; CHECK-NEXT:    add v0.8h, v4.8h, v5.8h
+; CHECK-NEXT:    ret
+  %1 = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = shufflevector <8 x i8> %coeffs, <8 x i8> poison, <8 x i32> zeroinitializer
+  %3 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %1, <8 x i8> %2)
+  %4 = shufflevector <8 x i8> %coeffs, <8 x i8> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %5 = shufflevector <16 x i8> %y, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %6 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %4, <8 x i8> %5)
+  %sub1 = sub <8 x i16> %3, %6
+  %7 = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %8 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %2, <8 x i8> %7)
+  %9 = shufflevector <16 x i8> %y, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %10 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %4, <8 x i8> %9)
+  %sub2 = sub <8 x i16> %8, %10
+  %res = sub <8 x i16> %sub1, %sub2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_vabdl_combine_vdup(<16 x i8> %x, <8 x i8> %coeffs) {
+; CHECK-LABEL: test_vabdl_combine_vdup:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    dup v2.16b, v1.b[0]
+; CHECK-NEXT:    sabdl v1.8h, v0.8b, v2.8b
+; CHECK-NEXT:    sabal2 v1.8h, v0.16b, v2.16b
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %1 = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = shufflevector <8 x i8> %coeffs, <8 x i8> poison, <8 x i32> zeroinitializer
+  %3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %1, <8 x i8> %2)
+  %4 = zext <8 x i8> %3 to <8 x i16>
+  %5 = shufflevector <16 x i8> %x, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %5, <8 x i8> %2)
+  %7 = zext <8 x i8> %6 to <8 x i16>
+  %res = add <8 x i16> %7, %4
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
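
For reference, a minimal source-level sketch of the pattern this change targets (illustrative only; the function and variable names below are not part of the patch). Both the low-half and the high-half widening multiply consume the same splatted coefficient; the high-half umull2 already triggers the existing extend-the-DUP combine, and with this change the resulting 16-byte dup/movi is also reused by the low-half umull instead of materialising a second, narrow dup, as the CHECK lines of test_umull_combine_constant and test_umull_combine_vdup expect.

#include <arm_neon.h>

// Illustrative only: widen both halves of `x` against the same splatted
// coefficient, so the low-half umull and the high-half umull2 can share a
// single 16-byte dup of that coefficient after the combine.
uint16x8_t mull_both_halves(uint8x16_t x, uint8_t coeff) {
  uint8x8_t c = vdup_n_u8(coeff);               // narrow splat of the coefficient
  uint16x8_t lo = vmull_u8(vget_low_u8(x), c);  // selects umull (low half)
  uint16x8_t hi = vmull_u8(vget_high_u8(x), c); // selects umull2 (high half)
  return vsubq_u16(lo, hi);                     // keep both results live
}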