diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5546,6 +5546,10 @@
 def DUPv8i8lane  : SIMDDup8FromElement <0, ".8b", v8i8, V64>;
 def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;
 
+def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)),
+                                    V64:$Rm),
+          (PMULLv2i64 V128:$Rn, (v2f64 (DUPv2i64lane (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), V64:$Rm, dsub), (i64 0))))>;
+
 // DUP from a 64-bit register to a 64-bit register is just a copy
 def : Pat<(v1i64 (AArch64dup (i64 GPR64:$Rn))),
           (COPY_TO_REGCLASS GPR64:$Rn, FPR64)>;
diff --git a/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
--- a/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
+++ b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
+; RUN: llc -O3 -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
 
 ; Tests that scalar i64 arguments of llvm.aarch64.neon.pmull64 are
 ; loaded into SIMD registers, as opposed to being loaded into GPR followed by a mov.
@@ -60,4 +60,36 @@
   ret void
 }
 
+define void @test4(ptr %0, ptr %1) {
+; CHECK-LABEL: test4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    mov w9, #56824
+; CHECK-NEXT:    mov w10, #61186
+; CHECK-NEXT:    movk w9, #40522, lsl #16
+; CHECK-NEXT:    movk w10, #29710, lsl #16
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    fmov d3, x10
+; CHECK-NEXT:    fmov d2, x8
+; CHECK-NEXT:    pmull v0.1q, v0.1d, v3.1d
+; CHECK-NEXT:    pmull v1.1q, v2.1d, v1.1d
+; CHECK-NEXT:    ldr q2, [x0]
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
+  %3 = load <2 x i64>, ptr %1
+  %4 = extractelement <2 x i64> %3, i64 1
+  %5 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %4, i64 2655706616)
+  %6 = load <2 x i64>, ptr %0
+  %7 = extractelement <2 x i64> %3, i64 0
+  %8 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %7, i64 1947135746)
+  %9 = xor <16 x i8> %8, %5
+  %10 = bitcast <16 x i8> %9 to <2 x i64>
+  %11 = xor <2 x i64> %6, %10
+  store <2 x i64> %11, ptr %1
+  ret void
+}
+
 declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
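
For context (not part of the patch): source code of the shape exercised by @test4 is typically written with the ACLE vmull_p64 intrinsic, which Clang lowers to llvm.aarch64.neon.pmull64. The sketch below is illustrative only; the helper name and the 64-bit constant are made up, and it assumes a compiler invocation with the AES feature enabled (e.g. -march=armv8-a+aes), which vmull_p64 requires.

// Hypothetical C helper, not taken from the patch. The first pmull64 operand
// is the high lane of a vector already live in a SIMD register, the second is
// a scalar constant; the added TableGen pattern is intended to let the backend
// select PMULL2 with a DUP of the scalar operand rather than moving the high
// lane through a GPR.
#include <arm_neon.h>

static inline uint64x2_t clmul_high_xor(uint64x2_t acc, uint64x2_t v) {
  // Carry-less multiply of the high lane of v by an arbitrary constant.
  poly128_t prod = vmull_p64((poly64_t)vgetq_lane_u64(v, 1),
                             (poly64_t)0x9e3779b97f4a7c15ULL);
  // Fold the 128-bit product back into the accumulator.
  return veorq_u64(acc, vreinterpretq_u64_p128(prod));
}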