Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -291,6 +291,8 @@
   SMULL,
   UMULL,
 
+  PMULL,
+
   // Reciprocal estimates and steps.
   FRECPE,
   FRECPS,
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2243,6 +2243,7 @@
     MAKE_CASE(AArch64ISD::ST4LANEpost)
     MAKE_CASE(AArch64ISD::SMULL)
     MAKE_CASE(AArch64ISD::UMULL)
+    MAKE_CASE(AArch64ISD::PMULL)
     MAKE_CASE(AArch64ISD::FRECPE)
     MAKE_CASE(AArch64ISD::FRECPS)
     MAKE_CASE(AArch64ISD::FRSQRTE)
@@ -4186,7 +4187,7 @@
   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
 }
 
-static bool isOperandOfHigherHalf(SDValue &Op) {
+static bool isOperandOfExtractHigherHalf(SDValue &Op) {
   SDNode *OpNode = Op.getNode();
   if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
     return false;
@@ -4200,10 +4201,6 @@
   return VT.isFixedLengthVector() && VT.getVectorNumElements() == 2;
 }
 
-static bool areOperandsOfHigherHalf(SDValue &Op1, SDValue &Op2) {
-  return isOperandOfHigherHalf(Op1) && isOperandOfHigherHalf(Op2);
-}
-
 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                    bool isSigned) {
   EVT VT = N->getValueType(0);
@@ -4545,27 +4542,48 @@
     }
   }
   case Intrinsic::aarch64_neon_pmull64: {
-    SDValue Op1 = Op.getOperand(1);
-    SDValue Op2 = Op.getOperand(2);
-
-    // If both operands are higher half of two source SIMD & FP registers,
-    // ISel could make use of tablegen patterns to emit PMULL2. So do not
-    // legalize i64 to v1i64.
-    if (areOperandsOfHigherHalf(Op1, Op2))
-      return SDValue();
+    SDValue LHS = Op.getOperand(1);
+    SDValue RHS = Op.getOperand(2);
+
+    const bool isLHSHigherHalf = isOperandOfExtractHigherHalf(LHS);
+    const bool isRHSHigherHalf = isOperandOfExtractHigherHalf(RHS);
+
+    // 'aarch64_neon_pmull64' takes i64 parameters, but the pmull instruction
+    // operates on SIMD registers. Canonicalize the operands to vector types
+    // so that ISel can generate more efficient code.
+    auto TryVectorizeOperand = [&dl, &DAG](SDValue N, bool HigherHalf,
+                                           bool Dup) -> SDValue {
+      // If the operand is a higher-half extract itself, canonicalize it to
+      // extract_high_v2i64.
+      if (HigherHalf)
+        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
+                           N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
+
+      // If this operand is not a higher half but the other operand is, dup it.
+      //
+      // FIXME: Use DUPLANE64 when N is itself an extract of the lower half.
+      // Note that `Dup` means the other operand is a higher-half extract; in
+      // most cases the two operands presumably refer to the same lane number
+      // (or N is not in a SIMD register yet).
+      if (Dup)
+        return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
+
+      // As a general rule, use v1i64 to represent i64 for pmull64. This helps
+      // ISel generate SIMD instructions where applicable; for example, it can
+      // emit a SIMD load as opposed to a GPR load followed by a fmov.
+      if (N.getValueType() == MVT::i64)
+        N = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
+
+      return N;
+    };
 
-    // As a general convention, use "v1" types to represent scalar integer
-    // operations in vector registers. This helps ISel to make use of
-    // tablegen patterns and generate a load into SIMD & FP registers directly.
-    if (Op1.getValueType() == MVT::i64)
-      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op1);
-    if (Op2.getValueType() == MVT::i64)
-      Op2 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op2);
+    LHS = TryVectorizeOperand(LHS, isLHSHigherHalf, isRHSHigherHalf);
+    RHS = TryVectorizeOperand(RHS, isRHSHigherHalf, isLHSHigherHalf);
 
     return DAG.getNode(
         ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
-        DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), Op1,
-        Op2);
+        DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), LHS,
+        RHS);
   }
   case Intrinsic::aarch64_neon_smax:
     return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
@@ -15536,6 +15554,15 @@
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
 }
 
+static SDValue tryCombinePMULL64Intrinsic(SDNode *N,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+  return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
+                     N->getOperand(1), N->getOperand(2));
+}
+
 // AArch64 high-vector "long" operations are formed by performing the non-high
 // version on an extract_subvector of each operand which gets the high half:
 //
@@ -16624,6 +16651,10 @@
     return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2));
   case Intrinsic::aarch64_neon_pmull:
+    return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2));
+  case Intrinsic::aarch64_neon_pmull64:
+    return tryCombinePMULL64Intrinsic(N, DCI, DAG);
   case Intrinsic::aarch64_neon_sqdmull:
     return tryCombineLongOpWithDup(IID, N, DCI, DAG);
   case Intrinsic::aarch64_neon_sqshl:
@@ -19707,6 +19738,7 @@
     return performUADDVCombine(N, DAG);
   case AArch64ISD::SMULL:
   case AArch64ISD::UMULL:
+  case AArch64ISD::PMULL:
     return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -117,6 +117,8 @@
    ComplexPattern<v4i16, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>;
 def extract_high_v4i32 :
    ComplexPattern<v2i32, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>;
+def extract_high_v2i64 :
+   ComplexPattern<v1i64, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>;
 def extract_high_dup_v8i16 :
    BinOpFrag<(extract_subvector (v8i16 (AArch64duplane16 (v8i16 node:$LHS),
                                                          node:$RHS)), (i64 4))>;
@@ -6502,24 +6504,27 @@
 }
 
 multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
-                                      Intrinsic IntOp> {
+                                      SDPatternOperator OpNode = null_frag> {
   def v8i8   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
                                             V128, V64, V64,
                                             asm, ".8h", ".8b", ".8b",
-      [(set (v8i16 V128:$Rd), (IntOp (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+      [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
   def v16i8  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
                                             V128, V128, V128,
                                             asm#"2", ".8h", ".16b", ".16b", []>;
   let Predicates = [HasAES] in {
     def v1i64  : BaseSIMDDifferentThreeVector<U, 0b110, opc,
                                               V128, V64, V64,
-                                              asm, ".1q", ".1d", ".1d", []>;
+                                              asm, ".1q", ".1d", ".1d",
+      [(set (v16i8 V128:$Rd), (OpNode (v1i64 V64:$Rn), (v1i64 V64:$Rm)))]>;
     def v2i64  : BaseSIMDDifferentThreeVector<U, 0b111, opc,
                                               V128, V128, V128,
-                                              asm#"2", ".1q", ".2d", ".2d", []>;
+                                              asm#"2", ".1q", ".2d", ".2d",
+      [(set (v16i8 V128:$Rd), (OpNode (extract_high_v2i64 (v2i64 V128:$Rn)),
+                                      (extract_high_v2i64 (v2i64 V128:$Rm))))]>;
   }
 
-  def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))),
+  def : Pat<(v8i16 (OpNode (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))),
                            (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm))))),
       (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
 }
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -670,6 +670,7 @@
 def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
                                            SDTCisSameAs<1, 2>]>;
+def AArch64pmull    : SDNode<"AArch64ISD::PMULL", SDT_AArch64mull, [SDNPCommutative]>;
 def AArch64smull    : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull,
                              [SDNPCommutative]>;
 def AArch64umull    : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull,
                              [SDNPCommutative]>;
@@ -5218,7 +5219,7 @@
 defm SUBHN  : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>;
 defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
 defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
-defm PMULL  : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>;
+defm PMULL  : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>;
 defm SABAL  : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", AArch64sabd>;
 defm SABDL  : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
                                                     AArch64sabd>;
@@ -5296,13 +5297,6 @@
 defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_smull,
      SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
 
-// Patterns for 64-bit pmull
-def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
-          (PMULLv1i64 V64:$Rn, V64:$Rm)>;
-def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)),
-                                    (extractelt (v2i64 V128:$Rm), (i64 1))),
-          (PMULLv2i64 V128:$Rn, V128:$Rm)>;
-
 // CodeGen patterns for addhn and subhn instructions, which can actually be
 // written in LLVM IR without too much difficulty.
 
Index: llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
+++ llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
@@ -1,34 +1,26 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
 
-; Test that PMULL2 are codegen'ed when only one (of two) operands
-; are in higher-half register already.
-;
-; Codegen is more efficient by getting rid of unnecessary moves across lanes, when user code intends to execute {pmull, pmull2} instruction
-; on {lower, higher} half of the same SIMD register.
+; User code intends to execute {pmull, pmull2} instructions on the {lower, higher} halves of the same vector register.
+; Test that PMULL2 is generated for the higher-half operands.
+; Previously, codegen did not use the higher-half contents in place; it moved the higher-lane contents to the lower lane
+; so that PMULL could be used everywhere, which generated unnecessary cross-lane moves.
 define void @test1(ptr %0, ptr %1) {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x1]
-; CHECK-NEXT:    mov w8, #56824
 ; CHECK-NEXT:    mov w9, #61186
-; CHECK-NEXT:    movk w8, #40522, lsl #16
+; CHECK-NEXT:    mov w8, #56824
 ; CHECK-NEXT:    movk w9, #29710, lsl #16
-; CHECK-NEXT:    mov x10, v0.d[1]
-; CHECK-NEXT:    fmov d2, x9
-; CHECK-NEXT:    mov x11, v1.d[1]
-; CHECK-NEXT:    fmov d3, x8
-; CHECK-NEXT:    fmov d4, x10
-; CHECK-NEXT:    pmull v0.1q, v0.1d, v2.1d
-; CHECK-NEXT:    fmov d5, x11
-; CHECK-NEXT:    pmull v1.1q, v1.1d, v2.1d
-; CHECK-NEXT:    pmull v2.1q, v4.1d, v3.1d
-; CHECK-NEXT:    pmull v3.1q, v5.1d, v3.1d
-; CHECK-NEXT:    ldp q4, q5, [x0]
-; CHECK-NEXT:    eor v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    eor v1.16b, v5.16b, v1.16b
+; CHECK-NEXT:    movk w8, #40522, lsl #16
+; CHECK-NEXT:    ldp q0, q1, [x1]
+; CHECK-NEXT:    fmov d3, x9
+; CHECK-NEXT:    dup v2.2d, x8
+; CHECK-NEXT:    pmull2 v4.1q, v0.2d, v2.2d
+; CHECK-NEXT:    pmull v0.1q, v0.1d, v3.1d
+; CHECK-NEXT:    pmull2 v2.1q, v1.2d, v2.2d
+; CHECK-NEXT:    pmull v1.1q, v1.1d, v3.1d
+; CHECK-NEXT:    eor v0.16b, v0.16b, v4.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
   %3 = load <2 x i64>, ptr %1
@@ -46,22 +38,20 @@
   %15 = extractelement <2 x i64> %5, i64 0
   %16 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %15, i64 1947135746)
   %17 = xor <16 x i8> %14, %7
-  %18 = bitcast <16 x i8> %17 to <2 x i64>
-  %19 = xor <16 x i8> %16, %9
-  %20 = bitcast <16 x i8> %19 to <2 x i64>
-  %21 = xor <2 x i64> %10, %18
-  %22 = xor <2 x i64> %12, %20
-  store <2 x i64> %21, ptr %1
-  store <2 x i64> %22, ptr %4
+  %18 = xor <16 x i8> %16, %9
+  store <16 x i8> %17, ptr %1
+  store <16 x i8> %18, ptr %4
   ret void
 }
 
+; One operand is the higher half of a SIMD register and the other operand is the lower half of another SIMD register.
+; Tests that codegen doesn't generate unnecessary cross-lane moves.
 define void @test2(ptr %0, <2 x i64> %1, <2 x i64> %2) {
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, v0.d[1]
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    dup v1.2d, x8
+; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
   %4 = extractelement <2 x i64> %1, i64 1
Index: llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
===================================================================
--- llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
+++ llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
@@ -1,9 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
 
-; Tests that scalar i64 arguments of llvm.aarch64.neon.pmull64 are
-; loaded into SIMD registers, as opposed to being loaded into GPR followed by a mov.
-
+; Both operands are in scalar form.
+; Tests that both operands are loaded directly into SIMD registers, as opposed to being loaded into GPRs followed by a fmov.
 define void @test1(ptr %0, i64 %1, i64 %2) {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0:
@@ -12,7 +11,7 @@
 ; CHECK-NEXT:    ldr d0, [x8, #8]
 ; CHECK-NEXT:    ldr d1, [x9, #8]
 ; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
-; CHECK-NEXT:    str q0, [x9]
+; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
   %4 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
   %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
@@ -20,18 +19,19 @@
   %7 = getelementptr inbounds <2 x i64>, ptr %0, i64 %2, i64 1
   %8 = load i64, ptr %7, align 8
   %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %6, i64 %8)
-  store <16 x i8> %9, ptr %4, align 16
+  store <16 x i8> %9, ptr %0, align 16
   ret void
 }
 
+; Operand %8 is the higher half of a v2i64, and operand %7 is a scalar load.
+; Tests that the scalar operand is loaded directly into a SIMD register, as opposed to being loaded into a GPR followed by a fmov.
 define void @test2(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, v0.d[1]
 ; CHECK-NEXT:    add x8, x0, x1, lsl #4
-; CHECK-NEXT:    ldr d0, [x8, #8]
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT:    add x9, x8, #8
+; CHECK-NEXT:    ld1r { v1.2d }, [x9]
+; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    ret
   %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
@@ -43,30 +43,11 @@
   ret void
 }
 
-; test3 clones test2, but swaps lhs with rhs, to test that non-extract
-; operand will be canonicalized to the rhs.
-define void @test3(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
+; Operand %7 is a scalar load, and operand %3 is an i64 parameter of function `test3`.
+; Tests that %7 is loaded directly into a SIMD register.
+define void @test3(ptr %0, i64 %1, i64 %2, i64 %3) {
 ; CHECK-LABEL: test3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, v0.d[1]
-; CHECK-NEXT:    add x8, x0, x1, lsl #4
-; CHECK-NEXT:    ldr d0, [x8, #8]
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
-; CHECK-NEXT:    str q0, [x8]
-; CHECK-NEXT:    ret
-  %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
-  %6 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
-  %7 = load i64, ptr %6, align 8
-  %8 = extractelement <2 x i64> %3, i64 1
-  %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %7, i64 %8)
-  store <16 x i8> %9, ptr %5, align 16
-  ret void
-}
-
-define void @test4(ptr %0, i64 %1, i64 %2, i64 %3) {
-; CHECK-LABEL: test4:
-; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add x8, x0, x1, lsl #4
 ; CHECK-NEXT:    fmov d0, x3
 ; CHECK-NEXT:    ldr d1, [x8, #8]
@@ -81,13 +62,13 @@
   ret void
 }
 
-define void @test5(ptr %0, <2 x i64> %1, i64 %2) {
-; CHECK-LABEL: test5:
+; Operand %4 is the higher half of a v2i64, and operand %2 is an i64 parameter.
+; Tests that %2 is duplicated directly into the proper lane of a SIMD register for optimal codegen.
+define void @test4(ptr %0, <2 x i64> %1, i64 %2) {
+; CHECK-LABEL: test4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, v0.d[1]
-; CHECK-NEXT:    fmov d0, x1
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT:    dup v1.2d, x1
+; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
   %4 = extractelement <2 x i64> %1, i64 1