Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -291,6 +291,8 @@
   SMULL,
   UMULL,
 
+  PMULL,
+
   // Reciprocal estimates and steps.
   FRECPE,
   FRECPS,
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2243,6 +2243,7 @@
     MAKE_CASE(AArch64ISD::ST4LANEpost)
     MAKE_CASE(AArch64ISD::SMULL)
     MAKE_CASE(AArch64ISD::UMULL)
+    MAKE_CASE(AArch64ISD::PMULL)
     MAKE_CASE(AArch64ISD::FRECPE)
     MAKE_CASE(AArch64ISD::FRECPS)
     MAKE_CASE(AArch64ISD::FRSQRTE)
@@ -4186,7 +4187,7 @@
   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
 }
 
-static bool isOperandOfHigherHalf(SDValue &Op) {
+static bool isOperandOfExtractHigherHalf(SDValue &Op) {
   SDNode *OpNode = Op.getNode();
   if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
     return false;
@@ -4200,10 +4201,6 @@
   return VT.isFixedLengthVector() && VT.getVectorNumElements() == 2;
 }
 
-static bool areOperandsOfHigherHalf(SDValue &Op1, SDValue &Op2) {
-  return isOperandOfHigherHalf(Op1) && isOperandOfHigherHalf(Op2);
-}
-
 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                    bool isSigned) {
   EVT VT = N->getValueType(0);
@@ -4545,27 +4542,48 @@
     }
   }
   case Intrinsic::aarch64_neon_pmull64: {
-    SDValue Op1 = Op.getOperand(1);
-    SDValue Op2 = Op.getOperand(2);
-
-    // If both operands are higher half of two source SIMD & FP registers,
-    // ISel could make use of tablegen patterns to emit PMULL2. So do not
-    // legalize i64 to v1i64.
-    if (areOperandsOfHigherHalf(Op1, Op2))
-      return SDValue();
+    SDValue LHS = Op.getOperand(1);
+    SDValue RHS = Op.getOperand(2);
+
+    const bool isLHSHigherHalf = isOperandOfExtractHigherHalf(LHS);
+    const bool isRHSHigherHalf = isOperandOfExtractHigherHalf(RHS);
+
+    // 'aarch64_neon_pmull64' takes i64 parameters, but the pmull instruction
+    // operates on SIMD registers. Canonicalize the operands to vector types
+    // so that ISel can generate more efficient code.
+    auto TryVectorizeOperand = [&dl, &DAG](SDValue N, bool HigherHalf,
+                                           bool Dup) -> SDValue {
+      // If the operand is a higher-half extract itself, canonicalize it to
+      // extract_high_v2i64.
+      if (HigherHalf)
+        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
+                           N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
+
+      // If this operand is not a higher half but the other operand is, dup it.
+      //
+      // FIXME: Use DUPLANE64 when N is itself an extract of the lower half.
+      // Note that `Dup` means the other operand is a higher-half extract; in
+      // most cases the two operands presumably refer to the same lane number
+      // (or N is not in a SIMD register yet).
+      if (Dup)
+        return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
+
+      // As a general rule, use v1i64 to represent i64 for pmull64. This helps
+      // ISel generate SIMD instructions where applicable; for example, it can
+      // emit a SIMD load as opposed to a GPR load followed by a fmov.
+      if (N.getValueType() == MVT::i64)
+        N = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
+
+      return N;
+    };
 
-    // As a general convention, use "v1" types to represent scalar integer
-    // operations in vector registers. This helps ISel to make use of
-    // tablegen patterns and generate a load into SIMD & FP registers directly.
-    if (Op1.getValueType() == MVT::i64)
-      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op1);
-    if (Op2.getValueType() == MVT::i64)
-      Op2 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op2);
+    LHS = TryVectorizeOperand(LHS, isLHSHigherHalf, isRHSHigherHalf);
+    RHS = TryVectorizeOperand(RHS, isRHSHigherHalf, isLHSHigherHalf);
 
     return DAG.getNode(
         ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
-        DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), Op1,
-        Op2);
+        DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), LHS,
+        RHS);
   }
   case Intrinsic::aarch64_neon_smax:
     return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
@@ -15536,6 +15554,15 @@
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
 }
 
+static SDValue tryCombinePMULL64Intrinsic(SDNode *N,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+  return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
+                     N->getOperand(1), N->getOperand(2));
+}
+
 // AArch64 high-vector "long" operations are formed by performing the non-high
 // version on an extract_subvector of each operand which gets the high half:
 //
@@ -16624,6 +16651,10 @@
     return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2));
   case Intrinsic::aarch64_neon_pmull:
+    return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2));
+  case Intrinsic::aarch64_neon_pmull64:
+    return tryCombinePMULL64Intrinsic(N, DCI, DAG);
   case Intrinsic::aarch64_neon_sqdmull:
     return tryCombineLongOpWithDup(IID, N, DCI, DAG);
   case Intrinsic::aarch64_neon_sqshl:
@@ -19707,6 +19738,7 @@
     return performUADDVCombine(N, DAG);
   case AArch64ISD::SMULL:
   case AArch64ISD::UMULL:
+  case AArch64ISD::PMULL:
     return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -117,6 +117,8 @@
    ComplexPattern<v4i16, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>;
 def extract_high_v4i32 :
    ComplexPattern<v2i32, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>;
+def extract_high_v2i64 :
+   ComplexPattern<v1i64, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>;
 def extract_high_dup_v8i16 :
    BinOpFrag<(extract_subvector (v8i16 (AArch64duplane16 (v8i16 node:$LHS),
                                                          node:$RHS)), (i64 4))>;
@@ -6502,24 +6504,27 @@
 }
 
 multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
-                                      Intrinsic IntOp> {
+                                      SDPatternOperator OpNode = null_frag> {
   def v8i8   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
                                             V128, V64, V64,
                                             asm, ".8h", ".8b", ".8b",
-      [(set (v8i16 V128:$Rd), (IntOp (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+      [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
   def v16i8  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
                                             V128, V128, V128,
                                             asm#"2", ".8h", ".16b", ".16b", []>;
   let Predicates = [HasAES] in {
     def v1i64  : BaseSIMDDifferentThreeVector<U, 0b110, opc,
                                               V128, V64, V64,
-                                              asm, ".1q", ".1d", ".1d", []>;
+                                              asm, ".1q", ".1d", ".1d",
+      [(set (v16i8 V128:$Rd), (OpNode (v1i64 V64:$Rn), (v1i64 V64:$Rm)))]>;
     def v2i64  : BaseSIMDDifferentThreeVector<U, 0b111, opc,
                                               V128, V128, V128,
-                                              asm#"2", ".1q", ".2d", ".2d", []>;
+                                              asm#"2", ".1q", ".2d", ".2d",
+      [(set (v16i8 V128:$Rd), (OpNode (extract_high_v2i64 (v2i64 V128:$Rn)),
+                                      (extract_high_v2i64 (v2i64 V128:$Rm))))]>;
   }
 
-  def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))),
+  def : Pat<(v8i16 (OpNode (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))),
                            (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm))))),
       (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
 }
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -670,6 +670,7 @@
 def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
                                            SDTCisSameAs<1, 2>]>;
+def AArch64pmull    : SDNode<"AArch64ISD::PMULL", SDT_AArch64mull, [SDNPCommutative]>;
 def AArch64smull    : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull,
                              [SDNPCommutative]>;
 def AArch64umull    : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull,
                              [SDNPCommutative]>;
@@ -5218,7 +5219,7 @@
 defm SUBHN  : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>;
 defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
 defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
-defm PMULL  : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>;
+defm PMULL  : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>;
 defm SABAL  : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", AArch64sabd>;
 defm SABDL  : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
                                                     AArch64sabd>;
@@ -5296,13 +5297,6 @@
 defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_smull,
      SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
 
-// Patterns for 64-bit pmull
-def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
-          (PMULLv1i64 V64:$Rn, V64:$Rm)>;
-def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)),
-                                    (extractelt (v2i64 V128:$Rm), (i64 1))),
-          (PMULLv2i64 V128:$Rn, V128:$Rm)>;
-
 // CodeGen patterns for addhn and subhn instructions, which can actually be
 // written in LLVM IR without too much difficulty.
 
Index: llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
+++ llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
@@ -1,34 +1,26 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
 
-; Test that PMULL2 are codegen'ed when only one (of two) operands
-; are in higher-half register already.
-;
-; Codegen is more efficient by getting rid of unnecessary moves across lanes, when user code intends to execute {pmull, pmull2} instruction
-; on {lower, higher} half of the same SIMD register.
+; User code intends to execute {pmull, pmull2} instructions on the {lower, higher} halves of the same vector register.
+; Test that PMULL2 is generated for the higher-half operands.
+; Previously, codegen did not use the higher-half contents in place; it moved the higher-lane contents to the lower lane
+; so that PMULL could be used everywhere, which generated unnecessary cross-lane moves.
 define void @test1(ptr %0, ptr %1) {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x1]
-; CHECK-NEXT:    mov w8, #56824
 ; CHECK-NEXT:    mov w9, #61186
-; CHECK-NEXT:    movk w8, #40522, lsl #16
+; CHECK-NEXT:    mov w8, #56824
 ; CHECK-NEXT:    movk w9, #29710, lsl #16
-; CHECK-NEXT:    mov x10, v0.d[1]
-; CHECK-NEXT:    fmov d2, x9
-; CHECK-NEXT:    mov x11, v1.d[1]
-; CHECK-NEXT:    fmov d3, x8
-; CHECK-NEXT:    fmov d4, x10
-; CHECK-NEXT:    pmull v0.1q, v0.1d, v2.1d
-; CHECK-NEXT:    fmov d5, x11
-; CHECK-NEXT:    pmull v1.1q, v1.1d, v2.1d
-; CHECK-NEXT:    pmull v2.1q, v4.1d, v3.1d
-; CHECK-NEXT:    pmull v3.1q, v5.1d, v3.1d
-; CHECK-NEXT:    ldp q4, q5, [x0]
-; CHECK-NEXT:    eor v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    eor v1.16b, v5.16b, v1.16b
+; CHECK-NEXT:    movk w8, #40522, lsl #16
+; CHECK-NEXT:    ldp q0, q1, [x1]
+; CHECK-NEXT:    fmov d3, x9
+; CHECK-NEXT:    dup v2.2d, x8
+; CHECK-NEXT:    pmull2 v4.1q, v0.2d, v2.2d
+; CHECK-NEXT:    pmull v0.1q, v0.1d, v3.1d
+; CHECK-NEXT:    pmull2 v2.1q, v1.2d, v2.2d
+; CHECK-NEXT:    pmull v1.1q, v1.1d, v3.1d
+; CHECK-NEXT:    eor v0.16b, v0.16b, v4.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
   %3 = load <2 x i64>, ptr %1
@@ -46,22 +38,20 @@
   %15 = extractelement <2 x i64> %5, i64 0
   %16 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %15, i64 1947135746)
   %17 = xor <16 x i8> %14, %7
-  %18 = bitcast <16 x i8> %17 to <2 x i64>
-  %19 = xor <16 x i8> %16, %9
-  %20 = bitcast <16 x i8> %19 to <2 x i64>
-  %21 = xor <2 x i64> %10, %18
-  %22 = xor <2 x i64> %12, %20
-  store <2 x i64> %21, ptr %1
-  store <2 x i64> %22, ptr %4
+  %18 = xor <16 x i8> %16, %9
+  store <16 x i8> %17, ptr %1
+  store <16 x i8> %18, ptr %4
   ret void
 }
 
+; One operand is the higher half of a SIMD register and the other operand is the lower half of another SIMD register.
+; Tests that codegen doesn't generate unnecessary cross-lane moves.
 define void @test2(ptr %0, <2 x i64> %1, <2 x i64> %2) {
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, v0.d[1]
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    dup v1.2d, x8
+; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
   %4 = extractelement <2 x i64> %1, i64 1
Index: llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
===================================================================
--- llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
+++ llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
@@ -1,9 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
 
-; Tests that scalar i64 arguments of llvm.aarch64.neon.pmull64 are
-; loaded into SIMD registers, as opposed to being loaded into GPR followed by a mov.
-
+; Both operands are in scalar form.
+; Tests that both operands are loaded directly into SIMD registers, as opposed to being loaded into GPRs followed by a fmov.
 define void @test1(ptr %0, i64 %1, i64 %2) {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0:
@@ -12,7 +11,7 @@
 ; CHECK-NEXT:    ldr d0, [x8, #8]
 ; CHECK-NEXT:    ldr d1, [x9, #8]
 ; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
-; CHECK-NEXT:    str q0, [x9]
+; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
   %4 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
   %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
@@ -20,18 +19,19 @@
   %7 = getelementptr inbounds <2 x i64>, ptr %0, i64 %2, i64 1
   %8 = load i64, ptr %7, align 8
   %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %6, i64 %8)
-  store <16 x i8> %9, ptr %4, align 16
+  store <16 x i8> %9, ptr %0, align 16
   ret void
 }
 
+; Operand %8 is the higher half of a v2i64, and operand %7 is a scalar load.
+; Tests that the scalar operand is loaded directly into a SIMD register, as opposed to being loaded into a GPR followed by a fmov.
 define void @test2(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, v0.d[1]
 ; CHECK-NEXT:    add x8, x0, x1, lsl #4
-; CHECK-NEXT:    ldr d0, [x8, #8]
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT:    add x9, x8, #8
+; CHECK-NEXT:    ld1r { v1.2d }, [x9]
+; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    ret
   %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
@@ -43,30 +43,11 @@
   ret void
 }
 
-; test3 clones test2, but swaps lhs with rhs, to test that non-extract
-; operand will be canonicalized to the rhs.
-define void @test3(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
+; Operand %7 is a scalar load, and operand %3 is an i64 parameter of function `test3`.
+; Tests that %7 is loaded directly into a SIMD register.
+define void @test3(ptr %0, i64 %1, i64 %2, i64 %3) {
 ; CHECK-LABEL: test3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, v0.d[1]
-; CHECK-NEXT:    add x8, x0, x1, lsl #4
-; CHECK-NEXT:    ldr d0, [x8, #8]
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
-; CHECK-NEXT:    str q0, [x8]
-; CHECK-NEXT:    ret
-  %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
-  %6 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
-  %7 = load i64, ptr %6, align 8
-  %8 = extractelement <2 x i64> %3, i64 1
-  %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %7, i64 %8)
-  store <16 x i8> %9, ptr %5, align 16
-  ret void
-}
-
-define void @test4(ptr %0, i64 %1, i64 %2, i64 %3) {
-; CHECK-LABEL: test4:
-; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add x8, x0, x1, lsl #4
 ; CHECK-NEXT:    fmov d0, x3
 ; CHECK-NEXT:    ldr d1, [x8, #8]
@@ -81,13 +62,13 @@
   ret void
 }
 
-define void @test5(ptr %0, <2 x i64> %1, i64 %2) {
-; CHECK-LABEL: test5:
+; Operand %4 is the higher half of a v2i64, and operand %2 is an i64 parameter.
+; Tests that %2 is duplicated directly into the proper lane of a SIMD register for optimal codegen.
+define void @test4(ptr %0, <2 x i64> %1, i64 %2) {
+; CHECK-LABEL: test4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, v0.d[1]
-; CHECK-NEXT:    fmov d0, x1
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT:    dup v1.2d, x1
+; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
   %4 = extractelement <2 x i64> %1, i64 1