diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4186,7 +4186,8 @@
   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
 }
 
-static bool isOperandOfHigherHalf(SDValue &Op) {
+// Returns true if Op extracts the higher half of the vector's elements.
+static bool isOperandOfExtractHigherHalf(SDValue &Op) {
   SDNode *OpNode = Op.getNode();
   if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
     return false;
@@ -4200,10 +4201,6 @@
   return VT.isFixedLengthVector() && VT.getVectorNumElements() == 2;
 }
 
-static bool areOperandsOfHigherHalf(SDValue &Op1, SDValue &Op2) {
-  return isOperandOfHigherHalf(Op1) && isOperandOfHigherHalf(Op2);
-}
-
 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                    bool isSigned) {
   EVT VT = N->getValueType(0);
@@ -4545,27 +4542,45 @@
     }
   }
   case Intrinsic::aarch64_neon_pmull64: {
-    SDValue Op1 = Op.getOperand(1);
-    SDValue Op2 = Op.getOperand(2);
+    SDValue LHS = Op.getOperand(1);
+    SDValue RHS = Op.getOperand(2);
 
-    // If both operands are higher half of two source SIMD & FP registers,
-    // ISel could make use of tablegen patterns to emit PMULL2. So do not
-    // legalize i64 to v1i64.
-    if (areOperandsOfHigherHalf(Op1, Op2))
+    bool isLHSHigherHalf = isOperandOfExtractHigherHalf(LHS);
+    bool isRHSHigherHalf = isOperandOfExtractHigherHalf(RHS);
+
+    // When both operands are the higher halves of their source registers,
+    // ISel can use the following pattern to select PMULL2 directly.
+    //
+    //   def : Pat<(int_aarch64_neon_pmull64
+    //                 (extractelt (v2i64 V128:$Rn), (i64 1)),
+    //                 (extractelt (v2i64 V128:$Rm), (i64 1))),
+    //             (PMULLv2i64 V128:$Rn, V128:$Rm)>;
+    if (isLHSHigherHalf && isRHSHigherHalf)
       return SDValue();
 
+    // Intrinsic aarch64_neon_pmull64 is commutative.
+    // If exactly one operand extracts the higher half of a vector,
+    // canonicalize that operand to the left.
+    if (isRHSHigherHalf && !isLHSHigherHalf) {
+      std::swap(LHS, RHS);
+      std::swap(isLHSHigherHalf, isRHSHigherHalf);
+    }
+
     // As a general convention, use "v1" types to represent scalar integer
-    // operations in vector registers. This helps ISel to make use of
-    // tablegen patterns and generate a load into SIMD & FP registers directly.
-    if (Op1.getValueType() == MVT::i64)
-      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op1);
-    if (Op2.getValueType() == MVT::i64)
-      Op2 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op2);
+    // operations. This helps ISel generate a load straight into SIMD & FP
+    // registers rather than a load into a GPR followed by a mov.
+    //
+    // If the operand is an extract of the higher half, use that register
+    // as-is (i.e., do not move its higher half into another register's lower half).
+    if (LHS.getValueType() == MVT::i64 && !isLHSHigherHalf)
+      LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, LHS);
+    if (RHS.getValueType() == MVT::i64 && !isRHSHigherHalf)
+      RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, RHS);
 
     return DAG.getNode(
         ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
-        DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), Op1,
-        Op2);
+        DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), LHS,
+        RHS);
   }
   case Intrinsic::aarch64_neon_smax:
     return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5546,6 +5546,10 @@
 def DUPv8i8lane  : SIMDDup8FromElement <0, ".8b", v8i8, V64>;
 def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;
 
+def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)),
+                                    V64:$Rm),
+          (PMULLv2i64 V128:$Rn, (v2f64 (DUPv2i64lane (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), V64:$Rm, dsub), (i64 0))))>;
+
 // DUP from a 64-bit register to a 64-bit register is just a copy
 def : Pat<(v1i64 (AArch64dup (i64 GPR64:$Rn))),
           (COPY_TO_REGCLASS GPR64:$Rn, FPR64)>;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll b/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
@@ -9,27 +9,24 @@
 define void @test(ptr %0, ptr %1) {
 ; CHECK-LABEL: test:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x1]
 ; CHECK-NEXT:    mov w8, #56824
 ; CHECK-NEXT:    mov w9, #61186
 ; CHECK-NEXT:    movk w8, #40522, lsl #16
 ; CHECK-NEXT:    movk w9, #29710, lsl #16
-; CHECK-NEXT:    mov x10, v0.d[1]
-; CHECK-NEXT:    fmov d2, x9
-; CHECK-NEXT:    mov x11, v1.d[1]
-; CHECK-NEXT:    fmov d3, x8
-; CHECK-NEXT:    fmov d4, x10
-; CHECK-NEXT:    pmull v0.1q, v0.1d, v2.1d
-; CHECK-NEXT:    fmov d5, x11
-; CHECK-NEXT:    pmull v1.1q, v1.1d, v2.1d
-; CHECK-NEXT:    pmull v2.1q, v4.1d, v3.1d
-; CHECK-NEXT:    pmull v3.1q, v5.1d, v3.1d
-; CHECK-NEXT:    ldp q4, q5, [x0]
-; CHECK-NEXT:    eor v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    eor v1.16b, v5.16b, v1.16b
-; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    dup v0.2d, v0.d[0]
+; CHECK-NEXT:    pmull v4.1q, v2.1d, v1.1d
+; CHECK-NEXT:    pmull v1.1q, v3.1d, v1.1d
+; CHECK-NEXT:    pmull2 v2.1q, v2.2d, v0.2d
+; CHECK-NEXT:    pmull2 v0.1q, v3.2d, v0.2d
+; CHECK-NEXT:    ldp q3, q5, [x0]
+; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v1.16b, v3.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v5.16b, v0.16b
+; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
   %3 = load <2 x i64>, ptr %1
   %4 = getelementptr inbounds <2 x i64>, ptr %1, i64 1
diff --git a/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
--- a/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
+++ b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
@@ -27,11 +27,10 @@
 define void @test2(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, v0.d[1]
 ; CHECK-NEXT:    add x8, x0, x1, lsl #4
-; CHECK-NEXT:    ldr d0, [x8, #8]
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT:    ldr d1, [x8, #8]
+; CHECK-NEXT:    dup v1.2d, v1.d[0]
+; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    ret
   %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
@@ -48,11 +47,10 @@
 define void @test3(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
 ; CHECK-LABEL: test3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, v0.d[1]
 ; CHECK-NEXT:    add x8, x0, x1, lsl #4
-; CHECK-NEXT:    ldr d0, [x8, #8]
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT:    ldr d1, [x8, #8]
+; CHECK-NEXT:    dup v1.2d, v1.d[0]
+; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    ret
   %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
@@ -84,10 +82,9 @@
 define void @test5(ptr %0, <2 x i64> %1, i64 %2) {
 ; CHECK-LABEL: test5:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, v0.d[1]
-; CHECK-NEXT:    fmov d0, x1
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT:    fmov d1, x1
+; CHECK-NEXT:    dup v1.2d, v1.d[0]
+; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
   %4 = extractelement <2 x i64> %1, i64 1
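
A minimal standalone reproducer for the single-higher-half case, distilled from
@test5 above (the function name is made up for illustration):

  define void @pmull64_high_by_scalar(ptr %dst, <2 x i64> %v, i64 %s) {
    %hi = extractelement <2 x i64> %v, i64 1
    %prod = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %hi, i64 %s)
    store <16 x i8> %prod, ptr %dst
    ret void
  }

  declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)

With the new TableGen pattern this should select to a dup of the scalar operand
followed by pmull2 (as in the test5 CHECK lines above), instead of moving the
higher half of %v through a GPR and using pmull on the lower halves.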