Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4186,7 +4186,7 @@
   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
 }
 
-static bool isOperandOfHigherHalf(SDValue &Op) {
+static bool isOperandOfExtractHigherHalf(SDValue &Op) {
   SDNode *OpNode = Op.getNode();
   if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
     return false;
@@ -4200,10 +4200,6 @@
   return VT.isFixedLengthVector() && VT.getVectorNumElements() == 2;
 }
 
-static bool areOperandsOfHigherHalf(SDValue &Op1, SDValue &Op2) {
-  return isOperandOfHigherHalf(Op1) && isOperandOfHigherHalf(Op2);
-}
-
 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                    bool isSigned) {
   EVT VT = N->getValueType(0);
@@ -4545,27 +4541,52 @@
     }
   }
   case Intrinsic::aarch64_neon_pmull64: {
-    SDValue Op1 = Op.getOperand(1);
-    SDValue Op2 = Op.getOperand(2);
+    SDValue LHS = Op.getOperand(1);
+    SDValue RHS = Op.getOperand(2);
+
+    bool isLHSHigherHalf = isOperandOfExtractHigherHalf(LHS);
+    bool isRHSHigherHalf = isOperandOfExtractHigherHalf(RHS);
 
-    // If both operands are higher half of two source SIMD & FP registers,
-    // ISel could make use of tablegen patterns to emit PMULL2. So do not
-    // legalize i64 to v1i64.
-    if (areOperandsOfHigherHalf(Op1, Op2))
+    // When both operands are the higher halves of their source registers,
+    // ISel can use the following tablegen pattern to emit PMULL2 directly.
+    //
+    // def : Pat<(int_aarch64_neon_pmull64
+    //               (extractelt (v2i64 V128:$Rn), (i64 1)),
+    //               (extractelt (v2i64 V128:$Rm), (i64 1))),
+    //           (PMULLv2i64 V128:$Rn, V128:$Rm)>;
+    if (isLHSHigherHalf && isRHSHigherHalf)
       return SDValue();
 
-    // As a general convention, use "v1" types to represent scalar integer
-    // operations in vector registers. This helps ISel to make use of
-    // tablegen patterns and generate a load into SIMD & FP registers directly.
-    if (Op1.getValueType() == MVT::i64)
-      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op1);
-    if (Op2.getValueType() == MVT::i64)
-      Op2 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op2);
+    // Intrinsic aarch64_neon_pmull64 is commutative. If exactly one operand
+    // extracts the higher half of a vector, canonicalize it to the left.
+    if (isRHSHigherHalf && !isLHSHigherHalf) {
+      std::swap(LHS, RHS);
+      std::swap(isLHSHigherHalf, isRHSHigherHalf);
+    }
+
+    // As a general convention, vectorize scalar operands. This helps ISel to
+    // make use of tablegen patterns (for example, generate a load into SIMD &
+    // FP registers directly).
+    if (isLHSHigherHalf) {
+      assert(!isRHSHigherHalf && "Expect only one operand to be higher half");
+      RHS = DAG.getNode(
+          ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+          DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
+                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, RHS),
+                      DAG.getConstant(0, dl, MVT::i64)),
+          DAG.getConstant(1, dl, MVT::i64));
+    } else {
+      if (LHS.getValueType() == MVT::i64)
+        LHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, LHS);
+      if (RHS.getValueType() == MVT::i64)
+        RHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, RHS);
+    }
 
     return DAG.getNode(
         ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
-        DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), Op1,
-        Op2);
+        DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), LHS,
+        RHS);
   }
   case Intrinsic::aarch64_neon_smax:
     return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5598,6 +5598,10 @@
 def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
           (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
+
+def : Pat<(v2i64 (AArch64duplane64 (v2i64 (scalar_to_vector (i64 GPR64:$Rn))), (i64 0))),
+          (v2i64 (DUPv2i64gpr GPR64:$Rn))>;
+
 // If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
 // instruction even if the types don't match: we just have to remap the lane
 // carefully. N.b. this trick only applies to truncations.
Index: llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
+++ llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
@@ -1,33 +1,28 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
 
-; Test that PMULL2 are codegen'ed when only one (of two) operands
-; are in higher-half register already.
-;
-; Codegen is more efficient by getting rid of unnecessary moves across lanes, when user code intends to execute {pmull, pmull2} instruction
-; on {lower, higher} half of the same SIMD register.
+; User code intends to execute {pmull, pmull2} instructions on the {lower, higher} halves of the same vector register directly.
+; Test that PMULL2 is generated for the higher-half operands.
+; The suboptimal codegen fails to use the higher-half contents in place; instead, it moves the higher-lane contents to the lower lane
+; to make use of PMULL everywhere, which generates unnecessary moves.
 define void @test1(ptr %0, ptr %1) {
 ; CHECK-LABEL: test1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x1]
-; CHECK-NEXT: mov w8, #56824
 ; CHECK-NEXT: mov w9, #61186
-; CHECK-NEXT: movk w8, #40522, lsl #16
+; CHECK-NEXT: mov w8, #56824
 ; CHECK-NEXT: movk w9, #29710, lsl #16
-; CHECK-NEXT: mov x10, v0.d[1]
-; CHECK-NEXT: fmov d2, x9
-; CHECK-NEXT: mov x11, v1.d[1]
-; CHECK-NEXT: fmov d3, x8
-; CHECK-NEXT: fmov d4, x10
-; CHECK-NEXT: pmull v0.1q, v0.1d, v2.1d
-; CHECK-NEXT: fmov d5, x11
-; CHECK-NEXT: pmull v1.1q, v1.1d, v2.1d
-; CHECK-NEXT: pmull v2.1q, v4.1d, v3.1d
-; CHECK-NEXT: pmull v3.1q, v5.1d, v3.1d
-; CHECK-NEXT: ldp q4, q5, [x0]
-; CHECK-NEXT: eor v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: eor v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT: movk w8, #40522, lsl #16
+; CHECK-NEXT: ldp q0, q2, [x1]
+; CHECK-NEXT: fmov d3, x9
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: pmull2 v4.1q, v0.2d, v1.2d
+; CHECK-NEXT: pmull v0.1q, v0.1d, v3.1d
+; CHECK-NEXT: pmull2 v1.1q, v2.2d, v1.2d
+; CHECK-NEXT: pmull v2.1q, v2.1d, v3.1d
+; CHECK-NEXT: ldp q3, q5, [x0]
+; CHECK-NEXT: eor v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: eor v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
 ; CHECK-NEXT: eor v1.16b, v5.16b, v1.16b
 ; CHECK-NEXT: stp q0, q1, [x1]
 ; CHECK-NEXT: ret
@@ -56,12 +51,13 @@
   ret void
 }
 
+; One operand is the higher half of a SIMD register, and the other operand is the lower half of another SIMD register.
+; Tests that codegen doesn't generate unnecessary moves.
 define void @test2(ptr %0, <2 x i64> %1, <2 x i64> %2) {
 ; CHECK-LABEL: test2:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT: dup v1.2d, v1.d[0]
+; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
   %4 = extractelement <2 x i64> %1, i64 1
Index: llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
===================================================================
--- llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
+++ llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
@@ -1,9 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
 
-; Tests that scalar i64 arguments of llvm.aarch64.neon.pmull64 are
-; loaded into SIMD registers, as opposed to being loaded into GPR followed by a mov.
-
+; Both operands are in scalar form.
+; Tests that both operands are loaded into SIMD registers directly, as opposed to being loaded into a GPR followed by a fmov.
 define void @test1(ptr %0, i64 %1, i64 %2) {
 ; CHECK-LABEL: test1:
 ; CHECK: // %bb.0:
@@ -24,14 +23,15 @@
   ret void
 }
 
+; Operand %8 is the higher half of a v2i64, and operand %7 is a scalar load.
+; Tests that the higher-half operand is used in place by PMULL2 instead of being moved down to the lower half through a GPR.
 define void @test2(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
 ; CHECK-LABEL: test2:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x9, v0.d[1]
 ; CHECK-NEXT: add x8, x0, x1, lsl #4
-; CHECK-NEXT: ldr d0, [x8, #8]
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT: ldr x9, [x8, #8]
+; CHECK-NEXT: dup v1.2d, x9
+; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT: str q0, [x8]
 ; CHECK-NEXT: ret
   %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
@@ -48,11 +48,10 @@
 define void @test3(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
 ; CHECK-LABEL: test3:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x9, v0.d[1]
 ; CHECK-NEXT: add x8, x0, x1, lsl #4
-; CHECK-NEXT: ldr d0, [x8, #8]
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT: ldr x9, [x8, #8]
+; CHECK-NEXT: dup v1.2d, x9
+; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT: str q0, [x8]
 ; CHECK-NEXT: ret
   %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
@@ -64,6 +63,8 @@
   ret void
 }
 
+; Operand %7 is a scalar load, and operand %3 is an input parameter of `test4`.
+; Test that %7 is loaded into a SIMD register and %3 is fmov'ed, for optimal codegen.
 define void @test4(ptr %0, i64 %1, i64 %2, i64 %3) {
 ; CHECK-LABEL: test4:
 ; CHECK: // %bb.0:
@@ -81,13 +82,13 @@
   ret void
 }
 
+; Operand %4 is the higher half of a v2i64, and operand %2 is an i64 input parameter.
+; Test that %2 is duplicated into a SIMD register directly, for optimal codegen.
 define void @test5(ptr %0, <2 x i64> %1, i64 %2) {
 ; CHECK-LABEL: test5:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: fmov d0, x1
-; CHECK-NEXT: fmov d1, x8
-; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT: dup v1.2d, x1
+; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
   %4 = extractelement <2 x i64> %1, i64 1
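
A minimal reduced example of the single-higher-half case this patch targets (not part of the patch; the function name and the exact register assignment below are illustrative assumptions). With the lowering above, the scalar operand is expected to be broadcast with DUP so that the higher half of %a is consumed in place by PMULL2, instead of being moved down through a GPR:

; Hypothetical reduction, compiled with: llc -mtriple=aarch64-linux-gnu -mattr=+aes
declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)

define <16 x i8> @pmull_hi_scalar(<2 x i64> %a, i64 %b) {
  ; Exactly one operand (%hi) extracts the higher half; %b stays scalar.
  %hi = extractelement <2 x i64> %a, i64 1
  %r = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %hi, i64 %b)
  ret <16 x i8> %r
}

; Expected codegen, analogous to test5 above (assumed, not autogenerated):
;   dup    v1.2d, x0
;   pmull2 v0.1q, v0.2d, v1.2d
;   ret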