diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4186,7 +4186,8 @@
   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
 }
 
-static bool isOperandOfHigherHalf(SDValue &Op) {
+// Returns true if Op is an extract of the higher half of a vector.
+static bool isOperandOfExtractHigherHalf(SDValue &Op) {
   SDNode *OpNode = Op.getNode();
   if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
     return false;
@@ -4200,10 +4201,6 @@
   return VT.isFixedLengthVector() && VT.getVectorNumElements() == 2;
 }
 
-static bool areOperandsOfHigherHalf(SDValue &Op1, SDValue &Op2) {
-  return isOperandOfHigherHalf(Op1) && isOperandOfHigherHalf(Op2);
-}
-
 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                    bool isSigned) {
   EVT VT = N->getValueType(0);
@@ -4545,27 +4542,52 @@
     }
   }
   case Intrinsic::aarch64_neon_pmull64: {
-    SDValue Op1 = Op.getOperand(1);
-    SDValue Op2 = Op.getOperand(2);
+    SDValue LHS = Op.getOperand(1);
+    SDValue RHS = Op.getOperand(2);
+
+    bool isLHSHigherHalf = isOperandOfExtractHigherHalf(LHS);
+    bool isRHSHigherHalf = isOperandOfExtractHigherHalf(RHS);
 
-    // If both operands are higher half of two source SIMD & FP registers,
-    // ISel could make use of tablegen patterns to emit PMULL2. So do not
-    // legalize i64 to v1i64.
-    if (areOperandsOfHigherHalf(Op1, Op2))
+    // When both operands are higher halves of source registers, ISel can use
+    // the following pattern to emit PMULL2 directly.
+    //
+    // def : Pat<(int_aarch64_neon_pmull64
+    //               (extractelt (v2i64 V128:$Rn), (i64 1)),
+    //               (extractelt (v2i64 V128:$Rm), (i64 1))),
+    //           (PMULLv2i64 V128:$Rn, V128:$Rm)>;
+    if (isLHSHigherHalf && isRHSHigherHalf)
       return SDValue();
 
-    // As a general convention, use "v1" types to represent scalar integer
-    // operations in vector registers. This helps ISel to make use of
-    // tablegen patterns and generate a load into SIMD & FP registers directly.
-    if (Op1.getValueType() == MVT::i64)
-      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op1);
-    if (Op2.getValueType() == MVT::i64)
-      Op2 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op2);
+    // Intrinsic aarch64_neon_pmull64 is commutative. If exactly one operand
+    // extracts the higher half of a vector, canonicalize it to the left so
+    // that the following pattern applies.
+    //
+    // def : Pat<(int_aarch64_neon_pmull64
+    //               (extractelt (v2i64 V128:$Rn), (i64 1)),
+    //               GPR64:$Rm),
+    //           (PMULLv2i64 V128:$Rn, (v2i64 (DUPv2i64gpr GPR64:$Rm)))>;
+    if (isRHSHigherHalf && !isLHSHigherHalf) {
+      std::swap(LHS, RHS);
+      std::swap(isLHSHigherHalf, isRHSHigherHalf);
+    }
+
+    // If neither operand is an extraction of the higher half, use the
+    // lower-half form of the instruction (i.e., PMULLv1i64).
+    const bool pmullOverLowerHalf = !isLHSHigherHalf;
+
+    // When the lower-half form of the instruction is used, use "v1" types to
+    // represent scalar integer operations. This helps ISel generate a load
+    // into SIMD & FP registers directly rather than a load into a GPR
+    // followed by a mov.
+    if (LHS.getValueType() == MVT::i64 && pmullOverLowerHalf)
+      LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, LHS);
+    if (RHS.getValueType() == MVT::i64 && pmullOverLowerHalf)
+      RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, RHS);
 
     return DAG.getNode(
         ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
-        DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), Op1,
-        Op2);
+        DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), LHS,
+        RHS);
   }
   case Intrinsic::aarch64_neon_smax:
     return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
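For illustration, here is a minimal IR sketch (not part of the patch; the function name @pmull_one_high is made up) of the case this lowering change targets, where exactly one operand comes from the high half of a vector. With the canonicalization above plus the new pattern below, this shape is expected to select pmull2 against a dup of the scalar operand instead of moving the high lane out through a GPR; test5 in pmull-ldr-merge.ll covers the same shape.

  define <16 x i8> @pmull_one_high(<2 x i64> %v, i64 %k) {
    ; Only the first operand is a high-half extract; the second stays a plain i64.
    %hi = extractelement <2 x i64> %v, i64 1
    %r = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %hi, i64 %k)
    ret <16 x i8> %r
  }
  declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)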
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5546,6 +5546,10 @@
 def DUPv8i8lane  : SIMDDup8FromElement <0, ".8b", v8i8, V64>;
 def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;
 
+def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)),
+                                    GPR64:$Rm),
+          (PMULLv2i64 V128:$Rn, (v2i64 (DUPv2i64gpr GPR64:$Rm)))>;
+
 // DUP from a 64-bit register to a 64-bit register is just a copy
 def : Pat<(v1i64 (AArch64dup (i64 GPR64:$Rn))),
           (COPY_TO_REGCLASS GPR64:$Rn, FPR64)>;
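The pattern above only matches when the high-half extract is the first operand of the intrinsic. The commuted form, sketched below with an illustrative function name, is what the std::swap canonicalization in AArch64ISelLowering.cpp rewrites so that the same pattern still fires; test3 in pmull-ldr-merge.ll exercises this end to end.

  define <16 x i8> @pmull_commuted(i64 %k, <2 x i64> %v) {
    ; High-half extract appears as the second operand; lowering swaps it to the first.
    %hi = extractelement <2 x i64> %v, i64 1
    %r = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %k, i64 %hi)
    ret <16 x i8> %r
  }
  declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)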
diff --git a/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll b/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s | FileCheck %s --check-prefixes=CHECK
+
+; Test that PMULL2 is codegen'ed when only one (of two) operands
+; is already in the higher half of a SIMD register.
+;
+; This is a big win (it saves multiple moves), since user code typically runs the
+; {pmull, pmull2} pair on the {lower, higher} half of the same SIMD register.
+define void @test(ptr %0, ptr %1) {
+; CHECK-LABEL: test:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w9, #61186
+; CHECK-NEXT:    mov w8, #56824
+; CHECK-NEXT:    movk w9, #29710, lsl #16
+; CHECK-NEXT:    movk w8, #40522, lsl #16
+; CHECK-NEXT:    ldp q0, q2, [x1]
+; CHECK-NEXT:    fmov d3, x9
+; CHECK-NEXT:    dup v1.2d, x8
+; CHECK-NEXT:    pmull2 v4.1q, v0.2d, v1.2d
+; CHECK-NEXT:    pmull v0.1q, v0.1d, v3.1d
+; CHECK-NEXT:    pmull2 v1.1q, v2.2d, v1.2d
+; CHECK-NEXT:    pmull v2.1q, v2.1d, v3.1d
+; CHECK-NEXT:    ldp q3, q5, [x0]
+; CHECK-NEXT:    eor v0.16b, v0.16b, v4.16b
+; CHECK-NEXT:    eor v1.16b, v2.16b, v1.16b
+; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    eor v1.16b, v5.16b, v1.16b
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %3 = load <2 x i64>, ptr %1
+  %4 = getelementptr inbounds <2 x i64>, ptr %1, i64 1
+  %5 = load <2 x i64>, ptr %4
+  %6 = extractelement <2 x i64> %3, i64 1
+  %7 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %6, i64 2655706616)
+  %8 = extractelement <2 x i64> %5, i64 1
+  %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %8, i64 2655706616)
+  %10 = load <2 x i64>, ptr %0
+  %11 = getelementptr inbounds i8, ptr %0, i64 16
+  %12 = load <2 x i64>, ptr %11
+  %13 = extractelement <2 x i64> %3, i64 0
+  %14 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %13, i64 1947135746)
+  %15 = extractelement <2 x i64> %5, i64 0
+  %16 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %15, i64 1947135746)
+  %17 = xor <16 x i8> %14, %7
+  %18 = bitcast <16 x i8> %17 to <2 x i64>
+  %19 = xor <16 x i8> %16, %9
+  %20 = bitcast <16 x i8> %19 to <2 x i64>
+  %21 = xor <2 x i64> %10, %18
+  %22 = xor <2 x i64> %12, %20
+  store <2 x i64> %21, ptr %1
+  store <2 x i64> %22, ptr %4
+  ret void
+}
+
+declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
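Reduced to its essence (an illustrative sketch, not an additional test), the shape the test above cares about is one <2 x i64> value whose low and high lanes both feed pmull64 with the same constant. With this change the high-lane call becomes a pmull2 against a dup of the constant, so the high lane never leaves the SIMD register file.

  define <16 x i8> @pmull_both_halves(<2 x i64> %v) {
    ; Low lane keeps the lower-half pmull; high lane now maps to pmull2.
    %lo = extractelement <2 x i64> %v, i64 0
    %hi = extractelement <2 x i64> %v, i64 1
    %p0 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %lo, i64 2655706616)
    %p1 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %hi, i64 2655706616)
    %r = xor <16 x i8> %p0, %p1
    ret <16 x i8> %r
  }
  declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)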
diff --git a/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
--- a/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
+++ b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
@@ -27,26 +27,45 @@
 define void @test2(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, v0.d[1]
 ; CHECK-NEXT:    add x8, x0, x1, lsl #4
-; CHECK-NEXT:    ldr d0, [x8, #8]
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT:    ldr x9, [x8, #8]
+; CHECK-NEXT:    dup v1.2d, x9
+; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    ret
   %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
   %6 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
   %7 = load i64, ptr %6, align 8
   %8 = extractelement <2 x i64> %3, i64 1
-  %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %7, i64 %8)
+  %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %8, i64 %7)
   store <16 x i8> %9, ptr %5, align 16
   ret void
 }
 
-define void @test3(ptr %0, i64 %1, i64 %2, i64 %3) {
+; test3 clones test2 but swaps lhs with rhs, to test that the non-extract
+; operand is canonicalized to the rhs.
+define void @test3(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
 ; CHECK-LABEL: test3:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add x8, x0, x1, lsl #4
+; CHECK-NEXT:    ldr x9, [x8, #8]
+; CHECK-NEXT:    dup v1.2d, x9
+; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
+; CHECK-NEXT:    str q0, [x8]
+; CHECK-NEXT:    ret
+  %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
+  %6 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
+  %7 = load i64, ptr %6, align 8
+  %8 = extractelement <2 x i64> %3, i64 1
+  %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %7, i64 %8)
+  store <16 x i8> %9, ptr %5, align 16
+  ret void
+}
+
+define void @test4(ptr %0, i64 %1, i64 %2, i64 %3) {
+; CHECK-LABEL: test4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, x1, lsl #4
 ; CHECK-NEXT:    fmov d0, x3
 ; CHECK-NEXT:    ldr d1, [x8, #8]
 ; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
@@ -60,4 +79,17 @@
   ret void
 }
 
+define void @test5(ptr %0, <2 x i64> %1, i64 %2) {
+; CHECK-LABEL: test5:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v1.2d, x1
+; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+  %4 = extractelement <2 x i64> %1, i64 1
+  %5 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %4, i64 %2)
+  store <16 x i8> %5, ptr %0, align 16
+  ret void
+}
+
 declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)