diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4511,6 +4511,24 @@
     } else {
       report_fatal_error("Unexpected type for AArch64 NEON intrinic");
     }
+  }
+  case Intrinsic::aarch64_neon_pmull64: {
+    // Matches the general convention of using "v1" types to represent scalar
+    // integer operations in vector registers. In this way, an i64 load feeding
+    // this intrinsic is selected as a load into a SIMD&FP register (ldr d)
+    // whose result pmull consumes directly, instead of a GPR load plus a fmov.
+    SDValue Op1 = Op.getOperand(1);
+    SDValue Op2 = Op.getOperand(2);
+    if (Op1.getNode()->getOpcode() == ISD::LOAD && Op1.getNode()->getValueType(0) == MVT::i64) {
+      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op1);
+    }
+    if (Op2.getNode()->getOpcode() == ISD::LOAD && Op2.getNode()->getValueType(0) == MVT::i64) {
+      Op2 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op2);
+    }
+    return DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
+        DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), Op1,
+        Op2);
+  }
   case Intrinsic::aarch64_neon_smax:
     return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
diff --git a/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+v8r,+neon,+crc,+crypto -o - %s| FileCheck %s --check-prefixes=CHECK,CHECK-SDAG
+
+define void @func(ptr %0, i64 %1, i64 %2) {
+; CHECK-LABEL: func:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subs x8, x2, x1
+; CHECK-NEXT:    b.ls .LBB0_3
+; CHECK-NEXT:  // %bb.1:
+; CHECK-NEXT:    add x9, x0, #8
+; CHECK-NEXT:    lsl x10, x1, #4
+; CHECK-NEXT:  .LBB0_2: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr d0, [x9, x10]
+; CHECK-NEXT:    subs x8, x8, #1
+; CHECK-NEXT:    ldr d1, [x9]
+; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT:    stur q0, [x9, #-8]
+; CHECK-NEXT:    add x9, x9, #16
+; CHECK-NEXT:    b.ne .LBB0_2
+; CHECK-NEXT:  .LBB0_3:
+; CHECK-NEXT:    ret
+  %4 = icmp ugt i64 %2, %1
+  br i1 %4, label %5, label %7
+
+5:
+  %6 = sub i64 %2, %1
+  br label %8
+
+7:
+  ret void
+
+8:
+  %9 = phi i64 [ %18, %8 ], [ 0, %5 ]
+  %10 = phi i64 [ %12, %8 ], [ %1, %5 ]
+  %11 = getelementptr inbounds <2 x i64>, ptr %0, i64 %9
+  %12 = add nuw i64 %10, 1
+  %13 = getelementptr inbounds <2 x i64>, ptr %0, i64 %9, i64 1
+  %14 = load i64, ptr %13, align 8
+  %15 = getelementptr inbounds <2 x i64>, ptr %0, i64 %10, i64 1
+  %16 = load i64, ptr %15, align 8
+  %17 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %14, i64 %16)
+  store <16 x i8> %17, ptr %11, align 16
+  %18 = add i64 %9, 1
+  %19 = icmp eq i64 %18, %6
+  br i1 %19, label %7, label %8
+}
+
+declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-SDAG: {{.*}}