diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4186,6 +4186,24 @@
   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
 }
 
+static bool isOperandOfHigherHalf(SDValue &Op) {
+  SDNode *OpNode = Op.getNode();
+  if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return false;
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
+  if (!C || C->getZExtValue() != 1)
+    return false;
+
+  EVT VT = OpNode->getOperand(0).getValueType();
+
+  return VT.isFixedLengthVector() && VT.getVectorNumElements() == 2;
+}
+
+static bool areOperandsOfHigherHalf(SDValue &Op1, SDValue &Op2) {
+  return isOperandOfHigherHalf(Op1) && isOperandOfHigherHalf(Op2);
+}
+
 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                    bool isSigned) {
   EVT VT = N->getValueType(0);
@@ -4526,6 +4544,29 @@
       report_fatal_error("Unexpected type for AArch64 NEON intrinic");
     }
   }
+  case Intrinsic::aarch64_neon_pmull64: {
+    SDValue Op1 = Op.getOperand(1);
+    SDValue Op2 = Op.getOperand(2);
+
+    // If both operands are the higher half of two source SIMD & FP registers,
+    // ISel could make use of tablegen patterns to emit PMULL2. So do not
+    // legalize i64 to v1i64.
+    if (areOperandsOfHigherHalf(Op1, Op2))
+      return SDValue();
+
+    // As a general convention, use "v1" types to represent scalar integer
+    // operations in vector registers. This helps ISel to make use of
+    // tablegen patterns and generate a load into SIMD & FP registers directly.
+    if (Op1.getValueType() == MVT::i64)
+      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op1);
+    if (Op2.getValueType() == MVT::i64)
+      Op2 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op2);
+
+    return DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
+        DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), Op1,
+        Op2);
+  }
   case Intrinsic::aarch64_neon_smax:
     return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), Op.getOperand(1),
                        Op.getOperand(2));
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
--- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
@@ -2507,9 +2507,9 @@
 define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
 ; CHECK-LABEL: test_vmull_p64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov d0, x0
-; CHECK-NEXT:    fmov d1, x1
-; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT:    fmov d0, x1
+; CHECK-NEXT:    fmov d1, x0
+; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
 ; CHECK-NEXT:    mov x1, v0.d[1]
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -2925,9 +2925,9 @@
 define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
 ; CHECK-LABEL: test_pmull_64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d0, x0
-; CHECK-NEXT:    fmov d1, x1
-; CHECK-NEXT:    pmull.1q v0, v0, v1
+; CHECK-NEXT:    fmov d0, x1
+; CHECK-NEXT:    fmov d1, x0
+; CHECK-NEXT:    pmull.1q v0, v1, v0
 ; CHECK-NEXT:    ret
   %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
   ret <16 x i8> %val
diff --git a/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
+
+; Tests that scalar i64 arguments of llvm.aarch64.neon.pmull64 are
+; loaded into SIMD registers, as opposed to being loaded into a GPR followed by a mov.
+
+define void @test1(ptr %0, i64 %1, i64 %2) {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, x2, lsl #4
+; CHECK-NEXT:    add x9, x0, x1, lsl #4
+; CHECK-NEXT:    ldr d0, [x8, #8]
+; CHECK-NEXT:    ldr d1, [x9, #8]
+; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT:    str q0, [x9]
+; CHECK-NEXT:    ret
+  %4 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
+  %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
+  %6 = load i64, ptr %5, align 8
+  %7 = getelementptr inbounds <2 x i64>, ptr %0, i64 %2, i64 1
+  %8 = load i64, ptr %7, align 8
+  %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %6, i64 %8)
+  store <16 x i8> %9, ptr %4, align 16
+  ret void
+}
+
+define void @test2(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
+; CHECK-LABEL: test2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    add x8, x0, x1, lsl #4
+; CHECK-NEXT:    ldr d0, [x8, #8]
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT:    str q0, [x8]
+; CHECK-NEXT:    ret
+  %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
+  %6 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
+  %7 = load i64, ptr %6, align 8
+  %8 = extractelement <2 x i64> %3, i64 1
+  %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %7, i64 %8)
+  store <16 x i8> %9, ptr %5, align 16
+  ret void
+}
+
+define void @test3(ptr %0, i64 %1, i64 %2, i64 %3) {
+; CHECK-LABEL: test3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, x1, lsl #4
+; CHECK-NEXT:    fmov d0, x3
+; CHECK-NEXT:    ldr d1, [x8, #8]
+; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT:    str q0, [x8]
+; CHECK-NEXT:    ret
+  %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
+  %6 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
+  %7 = load i64, ptr %6, align 8
+  %8 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %7, i64 %3)
+  store <16 x i8> %8, ptr %5, align 16
+  ret void
+}
+
+declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
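
For reference, and not part of the patch above: a minimal IR sketch of the other case the lowering change handles, where both pmull64 operands are extracted from lane 1 of v2i64 values. Here areOperandsOfHigherHalf returns true, lowering returns SDValue(), and the existing tablegen patterns are expected to select a single pmull2 on the original vector registers. The function name and the assembly in the comments are illustrative assumptions, not autogenerated CHECK lines.

; Hypothetical example, written in the style of pmull-ldr-merge.ll; not included in this patch.
; Assumed codegen with this change (exact register numbers may differ):
;   pmull2 v0.1q, v0.2d, v1.2d
;   ret
define <16 x i8> @test_pmull2_both_high(<2 x i64> %a, <2 x i64> %b) {
  ; Lane 1 of each v2i64 argument is the "higher half" that isOperandOfHigherHalf matches.
  %ah = extractelement <2 x i64> %a, i64 1
  %bh = extractelement <2 x i64> %b, i64 1
  %r = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %ah, i64 %bh)
  ret <16 x i8> %r
}

declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)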