Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4186,7 +4186,7 @@
   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
 }
 
-static bool isOperandOfHigherHalf(SDValue &Op) {
+static bool isOperandOfExtractHigherHalf(SDValue &Op) {
   SDNode *OpNode = Op.getNode();
   if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
     return false;
@@ -4200,10 +4200,6 @@
   return VT.isFixedLengthVector() && VT.getVectorNumElements() == 2;
 }
 
-static bool areOperandsOfHigherHalf(SDValue &Op1, SDValue &Op2) {
-  return isOperandOfHigherHalf(Op1) && isOperandOfHigherHalf(Op2);
-}
-
 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                    bool isSigned) {
   EVT VT = N->getValueType(0);
@@ -4545,27 +4541,52 @@
     }
   }
   case Intrinsic::aarch64_neon_pmull64: {
-    SDValue Op1 = Op.getOperand(1);
-    SDValue Op2 = Op.getOperand(2);
+    SDValue LHS = Op.getOperand(1);
+    SDValue RHS = Op.getOperand(2);
+
+    bool isLHSHigherHalf = isOperandOfExtractHigherHalf(LHS);
+    bool isRHSHigherHalf = isOperandOfExtractHigherHalf(RHS);
 
-    // If both operands are higher half of two source SIMD & FP registers,
-    // ISel could make use of tablegen patterns to emit PMULL2. So do not
-    // legalize i64 to v1i64.
-    if (areOperandsOfHigherHalf(Op1, Op2))
+    // When both operands are the higher halves of their source registers,
+    // ISel can use the following tablegen pattern to emit PMULL2 directly.
+    //
+    // def : Pat<(int_aarch64_neon_pmull64
+    //               (extractelt (v2i64 V128:$Rn), (i64 1)),
+    //               (extractelt (v2i64 V128:$Rm), (i64 1))),
+    //           (PMULLv2i64 V128:$Rn, V128:$Rm)>;
+    if (isLHSHigherHalf && isRHSHigherHalf)
       return SDValue();
 
-    // As a general convention, use "v1" types to represent scalar integer
-    // operations in vector registers. This helps ISel to make use of
-    // tablegen patterns and generate a load into SIMD & FP registers directly.
-    if (Op1.getValueType() == MVT::i64)
-      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op1);
-    if (Op2.getValueType() == MVT::i64)
-      Op2 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op2);
+    // Intrinsic aarch64_neon_pmull64 is commutative. If exactly one operand
+    // extracts the higher half of a vector, canonicalize it to the left.
+    if (isRHSHigherHalf && !isLHSHigherHalf) {
+      std::swap(LHS, RHS);
+      std::swap(isLHSHigherHalf, isRHSHigherHalf);
+    }
+
+    // As a general convention, vectorize scalar operands. This helps ISel to
+    // make use of tablegen patterns (for example, generate a load into SIMD &
+    // FP registers directly).
+    if (isLHSHigherHalf) {
+      assert(!isRHSHigherHalf && "Expect only one operand to be higher half");
+      RHS = DAG.getNode(
+          ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+          DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
+                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, RHS),
+                      DAG.getConstant(0, dl, MVT::i64)),
+          DAG.getConstant(1, dl, MVT::i64));
+    } else {
+      if (LHS.getValueType() == MVT::i64)
+        LHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, LHS);
+      if (RHS.getValueType() == MVT::i64)
+        RHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, RHS);
+    }
 
     return DAG.getNode(
         ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
-        DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), Op1,
-        Op2);
+        DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), LHS,
+        RHS);
   }
   case Intrinsic::aarch64_neon_smax:
     return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5598,6 +5598,10 @@
 def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
           (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
+
+def : Pat<(v2i64 (AArch64duplane64 (v2i64 (scalar_to_vector (i64 GPR64:$Rn))), (i64 0))),
+          (v2i64 (DUPv2i64gpr GPR64:$Rn))>;
+
 // If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
 // instruction even if the types don't match: we just have to remap the lane
 // carefully. N.b. this trick only applies to truncations.
Index: llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
+++ llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
@@ -1,33 +1,28 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
 
-; Test that PMULL2 are codegen'ed when only one (of two) operands
-; are in higher-half register already.
-;
-; Codegen is more efficient by getting rid of unnecessary moves across lanes, when user code intends to execute {pmull, pmull2} instruction
-; on {lower, higher} half of the same SIMD register.
+; User code intends to execute {pmull, pmull2} instructions on the {lower, higher} halves of the same vector register directly.
+; Test that PMULL2 is generated for the higher-half operands.
+; The suboptimal codegen fails to use the higher-half contents in place; instead, it moves the higher-lane contents to the lower lane
+; to make use of PMULL everywhere, which generates unnecessary moves.
 define void @test1(ptr %0, ptr %1) {
 ; CHECK-LABEL: test1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x1]
-; CHECK-NEXT: mov w8, #56824
 ; CHECK-NEXT: mov w9, #61186
-; CHECK-NEXT: movk w8, #40522, lsl #16
+; CHECK-NEXT: mov w8, #56824
 ; CHECK-NEXT: movk w9, #29710, lsl #16
-; CHECK-NEXT: mov x10, v0.d[1]
-; CHECK-NEXT: fmov d2, x9
-; CHECK-NEXT: mov x11, v1.d[1]
-; CHECK-NEXT: fmov d3, x8
-; CHECK-NEXT: fmov d4, x10
-; CHECK-NEXT: pmull v0.1q, v0.1d, v2.1d
-; CHECK-NEXT: fmov d5, x11
-; CHECK-NEXT: pmull v1.1q, v1.1d, v2.1d
-; CHECK-NEXT: pmull v2.1q, v4.1d, v3.1d
-; CHECK-NEXT: pmull v3.1q, v5.1d, v3.1d
-; CHECK-NEXT: ldp q4, q5, [x0]
-; CHECK-NEXT: eor v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: eor v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT: movk w8, #40522, lsl #16
+; CHECK-NEXT: ldp q0, q2, [x1]
+; CHECK-NEXT: fmov d3, x9
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: pmull2 v4.1q, v0.2d, v1.2d
+; CHECK-NEXT: pmull v0.1q, v0.1d, v3.1d
+; CHECK-NEXT: pmull2 v1.1q, v2.2d, v1.2d
+; CHECK-NEXT: pmull v2.1q, v2.1d, v3.1d
+; CHECK-NEXT: ldp q3, q5, [x0]
+; CHECK-NEXT: eor v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: eor v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
 ; CHECK-NEXT: eor v1.16b, v5.16b, v1.16b
 ; CHECK-NEXT: stp q0, q1, [x1]
 ; CHECK-NEXT: ret
@@ -56,12 +51,13 @@
   ret void
 }
 
+; One operand is the higher half of a SIMD register, and the other operand is the lower half of another SIMD register.
+; Tests that codegen doesn't generate unnecessary moves.
 define void @test2(ptr %0, <2 x i64> %1, <2 x i64> %2) {
 ; CHECK-LABEL: test2:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT: dup v1.2d, v1.d[0]
+; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
   %4 = extractelement <2 x i64> %1, i64 1
Index: llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
===================================================================
--- llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
+++ llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
@@ -1,9 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
 
-; Tests that scalar i64 arguments of llvm.aarch64.neon.pmull64 are
-; loaded into SIMD registers, as opposed to being loaded into GPR followed by a mov.
-
+; Both operands are in scalar form.
+; Tests that both operands are loaded into SIMD registers directly, as opposed to being loaded into a GPR followed by a fmov.
 define void @test1(ptr %0, i64 %1, i64 %2) {
 ; CHECK-LABEL: test1:
 ; CHECK: // %bb.0:
@@ -24,14 +23,15 @@
   ret void
 }
 
+; Operand %8 is the higher half of a v2i64, and operand %7 is a scalar load.
+; Tests that the higher-half operand is used in place by PMULL2 instead of being moved down to the lower half through a GPR.
 define void @test2(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
 ; CHECK-LABEL: test2:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x9, v0.d[1]
 ; CHECK-NEXT: add x8, x0, x1, lsl #4
-; CHECK-NEXT: ldr d0, [x8, #8]
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT: ldr x9, [x8, #8]
+; CHECK-NEXT: dup v1.2d, x9
+; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT: str q0, [x8]
 ; CHECK-NEXT: ret
   %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
@@ -48,11 +48,10 @@
 define void @test3(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
 ; CHECK-LABEL: test3:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x9, v0.d[1]
 ; CHECK-NEXT: add x8, x0, x1, lsl #4
-; CHECK-NEXT: ldr d0, [x8, #8]
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT: ldr x9, [x8, #8]
+; CHECK-NEXT: dup v1.2d, x9
+; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT: str q0, [x8]
 ; CHECK-NEXT: ret
   %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
@@ -64,6 +63,8 @@
   ret void
 }
 
+; Operand %7 is a scalar load, and operand %3 is an input parameter of `test4`.
+; Test that %7 is loaded into a SIMD register and %3 is fmov'ed, for optimal codegen.
 define void @test4(ptr %0, i64 %1, i64 %2, i64 %3) {
 ; CHECK-LABEL: test4:
 ; CHECK: // %bb.0:
@@ -81,13 +82,13 @@
   ret void
 }
 
+; Operand %4 is the higher half of a v2i64, and operand %2 is an i64 input parameter.
+; Test that %2 is duplicated into a SIMD register directly, for optimal codegen.
 define void @test5(ptr %0, <2 x i64> %1, i64 %2) {
 ; CHECK-LABEL: test5:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: fmov d0, x1
-; CHECK-NEXT: fmov d1, x8
-; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT: dup v1.2d, x1
+; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
   %4 = extractelement <2 x i64> %1, i64 1
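
A minimal reduced example of the single-higher-half case this patch targets (not part of the patch; the function name and the exact register assignment below are illustrative assumptions). With the lowering above, the scalar operand is expected to be broadcast with DUP so that the higher half of %a is consumed in place by PMULL2, instead of being moved down through a GPR:

; Hypothetical reduction, compiled with: llc -mtriple=aarch64-linux-gnu -mattr=+aes
declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)

define <16 x i8> @pmull_hi_scalar(<2 x i64> %a, i64 %b) {
  ; Exactly one operand (%hi) extracts the higher half; %b stays scalar.
  %hi = extractelement <2 x i64> %a, i64 1
  %r = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %hi, i64 %b)
  ret <16 x i8> %r
}

; Expected codegen, analogous to test5 above (assumed, not autogenerated):
;   dup    v1.2d, x0
;   pmull2 v0.1q, v0.2d, v1.2d
;   ret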