diff --git a/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll b/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
+
+; User code intends to execute {pmull, pmull2} instructions on the {lower, higher} half of the same vector registers directly.
+; Test that PMULL2 is generated for the higher-half operands.
+; The suboptimal code generation fails to use the higher-half contents in place; instead, it moves the higher-lane contents to the lower lane
+; to make use of PMULL everywhere, generating unnecessary moves.
+define void @test1(ptr %0, ptr %1) {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x1]
+; CHECK-NEXT:    mov w8, #56824
+; CHECK-NEXT:    mov w9, #61186
+; CHECK-NEXT:    movk w8, #40522, lsl #16
+; CHECK-NEXT:    movk w9, #29710, lsl #16
+; CHECK-NEXT:    mov x10, v0.d[1]
+; CHECK-NEXT:    fmov d2, x9
+; CHECK-NEXT:    mov x11, v1.d[1]
+; CHECK-NEXT:    fmov d3, x8
+; CHECK-NEXT:    fmov d4, x10
+; CHECK-NEXT:    pmull v0.1q, v0.1d, v2.1d
+; CHECK-NEXT:    fmov d5, x11
+; CHECK-NEXT:    pmull v1.1q, v1.1d, v2.1d
+; CHECK-NEXT:    pmull v2.1q, v4.1d, v3.1d
+; CHECK-NEXT:    pmull v3.1q, v5.1d, v3.1d
+; CHECK-NEXT:    eor v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v3.16b
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %3 = load <2 x i64>, ptr %1
+  %4 = getelementptr inbounds <2 x i64>, ptr %1, i64 1
+  %5 = load <2 x i64>, ptr %4
+  %6 = extractelement <2 x i64> %3, i64 1
+  %7 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %6, i64 2655706616)
+  %8 = extractelement <2 x i64> %5, i64 1
+  %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %8, i64 2655706616)
+  %10 = load <2 x i64>, ptr %0
+  %11 = getelementptr inbounds i8, ptr %0, i64 16
+  %12 = load <2 x i64>, ptr %11
+  %13 = extractelement <2 x i64> %3, i64 0
+  %14 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %13, i64 1947135746)
+  %15 = extractelement <2 x i64> %5, i64 0
+  %16 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %15, i64 1947135746)
+  %17 = xor <16 x i8> %14, %7
+  %18 = xor <16 x i8> %16, %9
+  store <16 x i8> %17, ptr %1
+  store <16 x i8> %18, ptr %4
+  ret void
+}
+
+; One operand is the higher half of a SIMD register, and the other operand is the lower half of another SIMD register.
+; Tests that codegen doesn't generate unnecessary moves.
+define void @test2(ptr %0, <2 x i64> %1, <2 x i64> %2) {
+; CHECK-LABEL: test2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+  %4 = extractelement <2 x i64> %1, i64 1
+  %5 = extractelement <2 x i64> %2, i64 0
+  %6 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %4, i64 %5)
+  store <16 x i8> %6, ptr %0, align 16
+  ret void
+}
+
+declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
diff --git a/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
--- a/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
+++ b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
@@ -1,9 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
 
-; Tests that scalar i64 arguments of llvm.aarch64.neon.pmull64 are
-; loaded into SIMD registers, as opposed to being loaded into GPR followed by a mov.
-
+; Both operands are in scalar form.
+; Tests that both operands are loaded into SIMD registers directly, as opposed to being loaded into GPRs followed by fmovs.
 define void @test1(ptr %0, i64 %1, i64 %2) {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0:
@@ -12,7 +11,7 @@
 ; CHECK-NEXT:    ldr d0, [x8, #8]
 ; CHECK-NEXT:    ldr d1, [x9, #8]
 ; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
-; CHECK-NEXT:    str q0, [x9]
+; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
   %4 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
   %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
@@ -20,10 +19,12 @@
   %7 = getelementptr inbounds <2 x i64>, ptr %0, i64 %2, i64 1
   %8 = load i64, ptr %7, align 8
   %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %6, i64 %8)
-  store <16 x i8> %9, ptr %4, align 16
+  store <16 x i8> %9, ptr %0, align 16
   ret void
 }
 
+; Operand %8 is the higher half of a v2i64, and operand %7 is a scalar load.
+; Tests that the scalar operand is loaded into a SIMD register directly, as opposed to being loaded into a GPR followed by an fmov.
 define void @test2(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0:
@@ -31,18 +32,20 @@
 ; CHECK-NEXT:    add x8, x0, x1, lsl #4
 ; CHECK-NEXT:    ldr d0, [x8, #8]
 ; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
 ; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    ret
   %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
   %6 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
   %7 = load i64, ptr %6, align 8
   %8 = extractelement <2 x i64> %3, i64 1
-  %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %7, i64 %8)
+  %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %8, i64 %7)
   store <16 x i8> %9, ptr %5, align 16
   ret void
 }
 
+; Operand %7 is a scalar load, and operand %3 is an input parameter of function `test3`.
+; Tests that %7 is loaded into a SIMD register directly.
 define void @test3(ptr %0, i64 %1, i64 %2, i64 %3) {
 ; CHECK-LABEL: test3:
 ; CHECK:       // %bb.0:
@@ -60,4 +63,21 @@
   ret void
 }
 
+; Operand %4 is the higher half of a v2i64, and operand %2 is an i64 input parameter.
+; Tests that %2 is duplicated into the proper SIMD lane directly for optimal codegen.
+define void @test4(ptr %0, <2 x i64> %1, i64 %2) {
+; CHECK-LABEL: test4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov d0, x1
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+  %4 = extractelement <2 x i64> %1, i64 1
+  %5 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %4, i64 %2)
+  store <16 x i8> %5, ptr %0, align 16
+  ret void
+}
+
 declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
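
For reference, not part of the patch: a minimal C sketch of the user-level pattern that test1 in aarch64-pmull2.ll models, written with the ACLE intrinsics vmull_p64 and vmull_high_p64 from arm_neon.h. The function name and the vector-of-keys signature are illustrative assumptions, and it assumes compilation for AArch64 with the +aes (crypto) feature, e.g. clang -O2 --target=aarch64-linux-gnu -march=armv8-a+aes.

#include <arm_neon.h>

// Carry-less multiply both 64-bit lanes of v against the matching lanes of k,
// then fold the two 128-bit products together, mirroring the pmull/pmull2 + eor
// sequence the tests above are concerned with.
static inline uint8x16_t clmul_both_halves(poly64x2_t v, poly64x2_t k) {
  // PMULL: multiplies the low (lane 0) halves of its operands.
  uint8x16_t lo =
      vreinterpretq_u8_p128(vmull_p64(vgetq_lane_p64(v, 0), vgetq_lane_p64(k, 0)));
  // PMULL2: multiplies the high (lane 1) halves in place; optimal codegen should
  // not bounce the high lanes through GPRs first.
  uint8x16_t hi = vreinterpretq_u8_p128(vmull_high_p64(v, k));
  return veorq_u8(lo, hi);
}

With the improved lowering the tests aim for, the second multiply should select pmull2 on the v2i64 registers directly, rather than the mov-to-GPR plus fmov sequence that the autogenerated checks in test1 currently record.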