Index: llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s | FileCheck %s --check-prefixes=CHECK
+
+; Test that PMULL2 is codegen'ed when only one (of the two) operands
+; is already in the higher half of a register.
+;
+; Codegen is more efficient when it avoids unnecessary moves across lanes, i.e., when user code executes the {pmull, pmull2} instructions
+; on the {lower, higher} halves of the same SIMD register.
+define void @test1(ptr %0, ptr %1) {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x1]
+; CHECK-NEXT:    mov w8, #56824
+; CHECK-NEXT:    mov w9, #61186
+; CHECK-NEXT:    movk w8, #40522, lsl #16
+; CHECK-NEXT:    movk w9, #29710, lsl #16
+; CHECK-NEXT:    mov x10, v0.d[1]
+; CHECK-NEXT:    fmov d2, x9
+; CHECK-NEXT:    mov x11, v1.d[1]
+; CHECK-NEXT:    fmov d3, x8
+; CHECK-NEXT:    fmov d4, x10
+; CHECK-NEXT:    pmull v0.1q, v0.1d, v2.1d
+; CHECK-NEXT:    fmov d5, x11
+; CHECK-NEXT:    pmull v1.1q, v1.1d, v2.1d
+; CHECK-NEXT:    pmull v2.1q, v4.1d, v3.1d
+; CHECK-NEXT:    pmull v3.1q, v5.1d, v3.1d
+; CHECK-NEXT:    ldp q4, q5, [x0]
+; CHECK-NEXT:    eor v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v3.16b
+; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT:    eor v1.16b, v5.16b, v1.16b
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %3 = load <2 x i64>, ptr %1
+  %4 = getelementptr inbounds <2 x i64>, ptr %1, i64 1
+  %5 = load <2 x i64>, ptr %4
+  %6 = extractelement <2 x i64> %3, i64 1
+  %7 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %6, i64 2655706616)
+  %8 = extractelement <2 x i64> %5, i64 1
+  %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %8, i64 2655706616)
+  %10 = load <2 x i64>, ptr %0
+  %11 = getelementptr inbounds i8, ptr %0, i64 16
+  %12 = load <2 x i64>, ptr %11
+  %13 = extractelement <2 x i64> %3, i64 0
+  %14 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %13, i64 1947135746)
+  %15 = extractelement <2 x i64> %5, i64 0
+  %16 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %15, i64 1947135746)
+  %17 = xor <16 x i8> %14, %7
+  %18 = bitcast <16 x i8> %17 to <2 x i64>
+  %19 = xor <16 x i8> %16, %9
+  %20 = bitcast <16 x i8> %19 to <2 x i64>
+  %21 = xor <2 x i64> %10, %18
+  %22 = xor <2 x i64> %12, %20
+  store <2 x i64> %21, ptr %1
+  store <2 x i64> %22, ptr %4
+  ret void
+}
+
+define void @test2(ptr %0, <2 x i64> %1, <2 x i64> %2) {
+; CHECK-LABEL: test2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+  %4 = extractelement <2 x i64> %1, i64 1
+  %5 = extractelement <2 x i64> %2, i64 0
+  %6 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %4, i64 %5)
+  store <16 x i8> %6, ptr %0, align 16
+  ret void
+}
+
+declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
Index: llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
===================================================================
--- llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
+++ llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
@@ -31,6 +31,27 @@
 ; CHECK-NEXT:    add x8, x0, x1, lsl #4
 ; CHECK-NEXT:    ldr d0, [x8, #8]
 ; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT:    str q0, [x8]
+; CHECK-NEXT:    ret
+  %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
+  %6 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
+  %7 = load i64, ptr %6, align 8
+  %8 = extractelement <2 x i64> %3, i64 1
+  %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %8, i64 %7)
+  store <16 x i8> %9, ptr %5, align 16
+  ret void
+}
+
+; test3 clones test2, but swaps lhs with rhs, to test that the non-extract
+; operand is canonicalized to the rhs.
+define void @test3(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
+; CHECK-LABEL: test3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    add x8, x0, x1, lsl #4
+; CHECK-NEXT:    ldr d0, [x8, #8]
+; CHECK-NEXT:    fmov d1, x9
 ; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
 ; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    ret
@@ -43,8 +64,8 @@
   ret void
 }
 
-define void @test3(ptr %0, i64 %1, i64 %2, i64 %3) {
-; CHECK-LABEL: test3:
+define void @test4(ptr %0, i64 %1, i64 %2, i64 %3) {
+; CHECK-LABEL: test4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add x8, x0, x1, lsl #4
 ; CHECK-NEXT:    fmov d0, x3
@@ -60,4 +81,19 @@
   ret void
 }
 
+define void @test5(ptr %0, <2 x i64> %1, i64 %2) {
+; CHECK-LABEL: test5:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov d0, x1
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+  %4 = extractelement <2 x i64> %1, i64 1
+  %5 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %4, i64 %2)
+  store <16 x i8> %5, ptr %0, align 16
+  ret void
+}
+
 declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
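
Note on the user-code pattern: the comment in aarch64-pmull2.ll above refers to
source code that carry-less-multiplies both 64-bit halves of one SIMD register.
As a minimal sketch (not part of the patch), the C below uses the ACLE
intrinsics vmull_p64 and vmull_high_p64 from arm_neon.h to express that
pattern; the helper name clmul_fold and the xor-fold shape are illustrative
assumptions, not taken from the tests.

    /* Build with, e.g.: clang -O2 --target=aarch64-linux-gnu -march=armv8-a+aes */
    #include <arm_neon.h>

    /* Hypothetical helper: one polynomial multiply per 64-bit half of the
       same 128-bit register. vmull_high_p64 lowers to pmull2 and needs no
       cross-lane mov; vmull_p64 on the low lanes lowers to pmull. */
    static inline uint8x16_t clmul_fold(poly64x2_t data, poly64x2_t consts) {
      poly128_t lo = vmull_p64(vgetq_lane_p64(data, 0),
                               vgetq_lane_p64(consts, 0));  /* pmull  */
      poly128_t hi = vmull_high_p64(data, consts);          /* pmull2 */
      /* Combine the two products, as the eor/xor chains in test1 do. */
      return veorq_u8(vreinterpretq_u8_p128(lo), vreinterpretq_u8_p128(hi));
    }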