Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -7384,9 +7384,206 @@
   return false;
 }
 
+/// Generates a v2i64 multiply routine for NEON.
+/// Uses one of two versions of the 64-bit multiply algorithm.
+/// Multiplications take a maximum of 11 cycles, possibly fewer depending on
+/// whether an operand is constant or interleaved beforehand.
+/// See the implementation for details.
+static SDValue LowerNEONv2i64MUL(SDValue &Op, SelectionDAG &DAG,
+                                 const EVT &VT) {
+  // The simplest way to efficiently do a v2i64 multiply on NEON is to use the
+  // exact same method as used for SSE, referred to here as "ssemul":
+  //   vshrn.i64 topHi, top, #32     @ v2i32 topHi = top >> 32;
+  //   vmovn.i64 topLo, top          @ v2i32 topLo = top & 0xFFFFFFFF;
+  //   vshrn.i64 botHi, bot, #32     @ v2i32 botHi = bot >> 32;
+  //   vmovn.i64 botLo, bot          @ v2i32 botLo = bot & 0xFFFFFFFF;
+  //   vmull.u32 ret64, topHi, botLo @ v2i64 ret64 = (v2i64)topHi * botLo;
+  //   vmlal.u32 ret64, topLo, botHi @ ret64 += (v2i64)topLo * botHi;
+  //   vshl.i64  ret64, ret64, #32   @ ret64 <<= 32;
+  //   vmlal.u32 ret64, topLo, botLo @ ret64 += (v2i64)topLo * botLo;
+  // However, a second method, "twomul", is four bytes shorter with the same
+  // timing (11 cycles):
+  //   vmovn.i64  topLo, top         @ v2i32 topLo = top & 0xFFFFFFFF;
+  //   vmovn.i64  botLo, bot         @ v2i32 botLo = bot & 0xFFFFFFFF;
+  //                                 @ v4i32 bot32 = (v4i32)bot;
+  //   vrev64.32  botRe, bot         @ v4i32 botRe = (v4i32) {
+  //                                 @   bot32[1], bot32[0],
+  //                                 @   bot32[3], bot32[2]
+  //                                 @ };
+  //   vmul.i32   botRe, botRe, top  @ botRe *= (v4i32)top;
+  //   vpaddl.u32 botRe, botRe       @ botRe = (v2i64) {
+  //                                 @   (u64)botRe[0] + botRe[1],
+  //                                 @   (u64)botRe[2] + botRe[3]
+  //                                 @ };
+  //   vshl.i64   top, botRe, #32    @ top = botRe << 32;
+  //   vmlal.u32  top, topLo, botLo  @ top += (v2i64)topLo * botLo;
+  // However, ssemul can often be simplified, in which case it becomes the
+  // better option. This can happen in one or more of the following ways:
+  //  - The high or low bits of an operand are known to be zero.
+  //  - One or more operands can be interleaved into high and low bits
+  //    beforehand, i.e. changing a vld1q.64 to vld2.32 or pre-swapping a
+  //    constant. This avoids wasting two cycles on vshrn/vmovn. While pointer
+  //    loading has yet to be implemented, constant swapping is mostly
+  //    complete.
+  // In the interleaved case, the sequence looks like this:
+  //   vld2.32   {topLo, topHi}, [topPtr]
+  //   vld2.32   {botLo, botHi}, [botPtr]
+  //   vmull.u32 ret64, topHi, botLo
+  //   vmlal.u32 ret64, topLo, botHi
+  //   vshl.i64  ret64, ret64, #32
+  //   vmlal.u32 ret64, topLo, botLo
+  // This optimization only works if the pointer hasn't already been loaded
+  // and is only used for the multiply, as swapping back removes the benefit.
+  SDLoc DL(Op);
+  SDValue Top = Op.getOperand(0);
+  SDValue Bot = Op.getOperand(1);
+
+  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
+  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
+
+  bool TopLoIsZero, BotLoIsZero, TopHiIsZero, BotHiIsZero;
+  bool TopIsInterleaved = false, BotIsInterleaved = false;
+  std::pair<SDValue, SDValue> TopPair, BotPair;
+  // If we have a constant operand, and we only use that operand once,
+  // we can interleave the bits at compile time to avoid vshrn and vmovn.
+  // The goal is a 0, 2, 1, 3 shuffle of the operand reinterpreted as a v4i32.
+  // TODO: Make this use one load instead of two.
+  // Currently, this saves one cycle, but it can save two if we can
+  // combine the loads together.
+  // TODO: Detect if we use a constant multiple times, but only for
+  // multiplication.
+  if (ISD::isBuildVectorOfConstantSDNodes(Top.getOperand(0).getNode()) &&
+      Top.getOperand(0).hasOneUse()) {
+    // Bitcast to v4i32.
+    SDValue TopRaw = DAG.getBitcast(MVT::v4i32, Top.getOperand(0));
+    SmallVector<uint32_t, 4> RawValues;
+    // Gather up all of the values.
+    for (int i = 0; i < 4; i++) {
+      RawValues.push_back(uint32_t(TopRaw.getConstantOperandVal(i)));
+    }
+    // Might as well do the zero calculations while we have easy integers.
+    TopLoIsZero = (RawValues[0] == 0 && RawValues[2] == 0);
+    TopHiIsZero = (RawValues[1] == 0 && RawValues[3] == 0);
+
+    SmallVector<SDValue, 4> Reordered;
+    // 0, 2, 1, 3 puts the low bits first and the high bits last.
+    Reordered.push_back(DAG.getConstant(RawValues[0], DL, MVT::i32));
+    Reordered.push_back(DAG.getConstant(RawValues[2], DL, MVT::i32));
+    Reordered.push_back(DAG.getConstant(RawValues[1], DL, MVT::i32));
+    Reordered.push_back(DAG.getConstant(RawValues[3], DL, MVT::i32));
+    // Create the vector.
+    SDValue TopInterleaved = DAG.getBuildVector(MVT::v4i32, DL, Reordered);
+    // Split it into two halves, topLo and topHi.
+    TopPair = DAG.SplitVector(TopInterleaved, DL, MVT::v2i32, MVT::v2i32);
+    // Mark that we don't need to separate these later.
+    TopIsInterleaved = true;
+  } else {
+    KnownBits TopKnown = DAG.computeKnownBits(Top);
+    TopLoIsZero = LowerBitsMask.isSubsetOf(TopKnown.Zero);
+    TopHiIsZero = UpperBitsMask.isSubsetOf(TopKnown.Zero);
+  }
+
+  if (ISD::isBuildVectorOfConstantSDNodes(Bot.getOperand(0).getNode()) &&
+      Bot.getOperand(0).hasOneUse()) {
+    SDValue BotRaw = DAG.getBitcast(MVT::v4i32, Bot.getOperand(0));
+    SmallVector<uint32_t, 4> RawValues;
+    for (int i = 0; i < 4; i++) {
+      RawValues.push_back(uint32_t(BotRaw.getConstantOperandVal(i)));
+    }
+    BotLoIsZero = (RawValues[0] == 0 && RawValues[2] == 0);
+    BotHiIsZero = (RawValues[1] == 0 && RawValues[3] == 0);
+    SmallVector<SDValue, 4> Reordered;
+    // 0, 2, 1, 3 puts the low bits first and the high bits last.
+    Reordered.push_back(DAG.getConstant(RawValues[0], DL, MVT::i32));
+    Reordered.push_back(DAG.getConstant(RawValues[2], DL, MVT::i32));
+    Reordered.push_back(DAG.getConstant(RawValues[1], DL, MVT::i32));
+    Reordered.push_back(DAG.getConstant(RawValues[3], DL, MVT::i32));
+    SDValue BotInterleaved = DAG.getBuildVector(MVT::v4i32, DL, Reordered);
+    BotPair = DAG.SplitVector(BotInterleaved, DL, MVT::v2i32, MVT::v2i32);
+    BotIsInterleaved = true;
+  } else {
+    KnownBits BotKnown = DAG.computeKnownBits(Bot);
+    BotLoIsZero = LowerBitsMask.isSubsetOf(BotKnown.Zero);
+    BotHiIsZero = UpperBitsMask.isSubsetOf(BotKnown.Zero);
+  }
+
+  SDValue Zero = DAG.getConstant(0, DL, VT);
+  SDValue C32 = DAG.getConstant(32, DL, MVT::i32);
+
+  // Either way, we need the low bits and TopLo * BotLo.
+  // vmovn.i64
+  SDValue TopLo =
+      TopIsInterleaved ? TopPair.first
+                       : DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32, Top);
+  // vmovn.i64
+  SDValue BotLo =
+      BotIsInterleaved ? BotPair.first
+                       : DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32, Bot);
+
+  SDValue TopLoBotLo = Zero;
+  if (!TopLoIsZero && !BotLoIsZero)
+    // vmull.u32
+    TopLoBotLo = DAG.getNode(ARMISD::VMULLu, DL, VT, TopLo, BotLo);
+
+  // Don't go any further if we are only multiplying low bits.
+  // This is needed until further optimizations can be made.
+  if (TopHiIsZero && BotHiIsZero)
+    return TopLoBotLo;
+
+  // The following block is the twomul routine. We only use it when we
+  // can't tell if any of the words are zero, and we are not interleaving.
+  if (!TopIsInterleaved && !BotIsInterleaved &&
+      !TopLoIsZero && !BotLoIsZero && !TopHiIsZero && !BotHiIsZero) {
+    Bot = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, Bot);
+    // vrev64.32
+    SDValue BotRev = DAG.getNode(ARMISD::VREV64, DL, MVT::v4i32, Bot);
+    Top = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, Top);
+    // vmul.i32
+    SDValue CrossMul = DAG.getNode(ISD::MUL, DL, MVT::v4i32, BotRev, Top);
+    // vpaddl.u32
+    SDValue Merged = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, MVT::v2i64,
+        DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL, MVT::i32), CrossMul);
+    // vshl.i64
+    SDValue Shifted = DAG.getNode(ARMISD::VSHL, DL, VT, Merged, C32);
+    return DAG.getNode(ISD::ADD, DL, VT, Shifted, TopLoBotLo);
+  }
+
+  // This is the ssemul routine.
+  SDValue TopLoBotHi = Zero;
+  if (!TopLoIsZero && !BotHiIsZero) {
+    // vshrn.i64
+    SDValue BotHi =
+        BotIsInterleaved
+            ? BotPair.second
+            : DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32,
+                          DAG.getNode(ARMISD::VSHRu, DL, VT, Bot, C32));
+    // vmlal.u32 (merges with add below)
+    TopLoBotHi = DAG.getNode(ARMISD::VMULLu, DL, VT, TopLo, BotHi);
+  }
+  SDValue TopHiBotLo = Zero;
+  if (!TopHiIsZero && !BotLoIsZero) {
+    // vshrn.i64
+    SDValue TopHi =
+        TopIsInterleaved
+            ? TopPair.second
+            : DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32,
+                          DAG.getNode(ARMISD::VSHRu, DL, VT, Top, C32));
+    // vmlal.u32 (merges with add below)
+    TopHiBotLo = DAG.getNode(ARMISD::VMULLu, DL, VT, TopHi, BotLo);
+  }
+
+  // (optimized to vmlal.u32)
+  SDValue High = DAG.getNode(ISD::ADD, DL, VT, TopLoBotHi, TopHiBotLo);
+  // vshl.i64
+  SDValue Shifted = DAG.getNode(ARMISD::VSHL, DL, VT, High, C32);
+  // (optimized to vmlal.u32)
+  return DAG.getNode(ISD::ADD, DL, VT, TopLoBotLo, Shifted);
+}
+
 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
   // Multiplications are only custom-lowered for 128-bit vectors so that
-  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
+  // VMULL can be detected.
   EVT VT = Op.getValueType();
   assert(VT.is128BitVector() && VT.isInteger() &&
          "unexpected type for custom-lowering ISD::MUL");
@@ -7420,12 +7617,11 @@
   }
 
   if (!NewOpc) {
-    if (VT == MVT::v2i64)
-      // Fall through to expand this. It is not legal.
-      return SDValue();
-    else
-      // Other vector multiplications are legal.
-      return Op;
+    if (VT == MVT::v2i64) {
+      return LowerNEONv2i64MUL(Op, DAG, VT);
+    }
+    // Other vector multiplications are legal.
+    return Op;
   }
 }
 
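Note (not part of the patch): for reference, the two sequences described in the LowerNEONv2i64MUL comment correspond roughly to the following NEON intrinsics. This is only a sketch to make the algorithms easier to check; the function names are illustrative, and the lowering builds the equivalent DAG nodes directly rather than calling intrinsics.

#include <arm_neon.h>

/* "twomul": used when nothing is known about either operand. */
static uint64x2_t twomul(uint64x2_t top, uint64x2_t bot) {
  uint32x2_t top_lo = vmovn_u64(top);                           /* vmovn.i64  */
  uint32x2_t bot_lo = vmovn_u64(bot);                           /* vmovn.i64  */
  uint32x4_t bot_re = vrev64q_u32(vreinterpretq_u32_u64(bot));  /* vrev64.32  */
  uint32x4_t cross = vmulq_u32(bot_re, vreinterpretq_u32_u64(top)); /* vmul.i32 */
  uint64x2_t ret = vpaddlq_u32(cross);                          /* vpaddl.u32 */
  ret = vshlq_n_u64(ret, 32);                                   /* vshl.i64   */
  return vmlal_u32(ret, top_lo, bot_lo);                        /* vmlal.u32  */
}

/* "ssemul": the SSE-style split into 32-bit halves. */
static uint64x2_t ssemul(uint64x2_t top, uint64x2_t bot) {
  uint32x2_t top_hi = vshrn_n_u64(top, 32);                     /* vshrn.i64  */
  uint32x2_t top_lo = vmovn_u64(top);                           /* vmovn.i64  */
  uint32x2_t bot_hi = vshrn_n_u64(bot, 32);                     /* vshrn.i64  */
  uint32x2_t bot_lo = vmovn_u64(bot);                           /* vmovn.i64  */
  uint64x2_t ret = vmull_u32(top_hi, bot_lo);                   /* vmull.u32  */
  ret = vmlal_u32(ret, top_lo, bot_hi);                         /* vmlal.u32  */
  ret = vshlq_n_u64(ret, 32);                                   /* vshl.i64   */
  return vmlal_u32(ret, top_lo, bot_lo);                        /* vmlal.u32  */
}

Both forms compute each 64-bit lane modulo 2^64, since the cross products only contribute to the shifted-in high half.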
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -487,7 +487,7 @@
   static const CostTblEntry CostTbl[] = {
     // Division.
     // These costs are somewhat random. Choose a cost of 20 to indicate that
-    // vectorizing devision (added function call) is going to be very expensive.
+    // vectorizing division (added function call) is going to be very expensive.
     // Double registers types.
     { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
     { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
@@ -522,12 +522,19 @@
     { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
     { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
     { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
-    // Multiplication.
+    // Multiplication. Not sure what exact value to use here, but the main
+    // thing is that it is cheaper to multiply if we are already in a vector.
+    { ISD::MUL,  MVT::v2i64, 8 },
   };
 
-  if (ST->hasNEON())
+  if (ST->hasNEON()) {
+    // Multiply by constant is a little cheaper.
+    if (ISDOpcode == ISD::MUL && LT.second == MVT::v2i64 &&
+        Op2Info == TargetTransformInfo::OK_UniformConstantValue)
+      return LT.first * 7;
     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
       return LT.first * Entry->Cost;
+  }
 
   int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                            Opd1PropInfo, Opd2PropInfo);
Index: test/Analysis/CostModel/ARM/mult.ll
===================================================================
--- test/Analysis/CostModel/ARM/mult.ll
+++ test/Analysis/CostModel/ARM/mult.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=cortex-a9 | FileCheck %s
+
+define <2 x i8> @mul_v2_i8(<2 x i8> %a, <2 x i8> %b) {
+; CHECK-LABEL: 'mul_v2_i8'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul <2 x i8> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i8> %1
+;
+
+  %1 = mul <2 x i8> %a, %b
+  ret <2 x i8> %1
+}
+define <2 x i16> @mul_v2_i16(<2 x i16> %a, <2 x i16> %b) {
+; CHECK-LABEL: 'mul_v2_i16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul <2 x i16> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i16> %1
+;
+
+  %1 = mul <2 x i16> %a, %b
+  ret <2 x i16> %1
+}
+define <2 x i32> @mul_v2_i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: 'mul_v2_i32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul <2 x i32> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
+;
+
+  %1 = mul <2 x i32> %a, %b
+  ret <2 x i32> %1
+}
+define <2 x i64> @mul_v2_i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: 'mul_v2_i64'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %1 = mul <2 x i64> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %1
+;
+
+  %1 = mul <2 x i64> %a, %b
+  ret <2 x i64> %1
+}
+define <4 x i8> @mul_v4_i8(<4 x i8> %a, <4 x i8> %b) {
+; CHECK-LABEL: 'mul_v4_i8'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul <4 x i8> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %1
+;
+
+  %1 = mul <4 x i8> %a, %b
+  ret <4 x i8> %1
+}
+define <4 x i16> @mul_v4_i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: 'mul_v4_i16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul <4 x i16> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %1
+;
+
+  %1 = mul <4 x i16> %a, %b
+  ret <4 x i16> %1
+}
+define <4 x i32> @mul_v4_i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: 'mul_v4_i32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = mul <4 x i32> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %1
+;
+
+  %1 = mul <4 x i32> %a, %b
+  ret <4 x i32> %1
+}
+define <4 x i64> @mul_v4_i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: 'mul_v4_i64'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %1 = mul <4 x i64> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %1
+;
+
+  %1 = mul <4 x i64> %a, %b
+  ret <4 x i64> %1
+}
+define <8 x i8> @mul_v8_i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: 'mul_v8_i8'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul <8 x i8> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %1
+;
+
+  %1 = mul <8 x i8> %a, %b
+  ret <8 x i8> %1
+}
+define <8 x i16> @mul_v8_i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: 'mul_v8_i16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = mul <8 x i16> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %1
+;
+
+  %1 = mul <8 x i16> %a, %b
+  ret <8 x i16> %1
+}
+define <8 x i32> @mul_v8_i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: 'mul_v8_i32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %1 = mul <8 x i32> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %1
+;
+
+  %1 = mul <8 x i32> %a, %b
+  ret <8 x i32> %1
+}
+define <8 x i64> @mul_v8_i64(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: 'mul_v8_i64'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %1 = mul <8 x i64> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %1
+;
+
+  %1 = mul <8 x i64> %a, %b
+  ret <8 x i64> %1
+}
+define <16 x i8> @mul_v16_i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: 'mul_v16_i8'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul <16 x i8> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %1
+;
+
+  %1 = mul <16 x i8> %a, %b
+  ret <16 x i8> %1
+}
+define <16 x i16> @mul_v16_i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: 'mul_v16_i16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %1 = mul <16 x i16> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %1
+;
+
+  %1 = mul <16 x i16> %a, %b
+  ret <16 x i16> %1
+}
+define <16 x i32> @mul_v16_i32(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: 'mul_v16_i32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %1 = mul <16 x i32> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %1
+;
+
+  %1 = mul <16 x i32> %a, %b
+  ret <16 x i32> %1
+}
+define <16 x i64> @mul_v16_i64(<16 x i64> %a, <16 x i64> %b) {
+; CHECK-LABEL: 'mul_v16_i64'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %1 = mul <16 x i64> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i64> %1
+;
+
+  %1 = mul <16 x i64> %a, %b
+  ret <16 x i64> %1
+}
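Note (not part of the patch): the TargetTransformInfo change above charges a uniform-constant v2i64 multiply slightly less (7 vs. 8) because a constant operand can be pre-interleaved at compile time. As a rough, non-authoritative sketch of that pre-interleaved ssemul form in NEON intrinsics; the constant below is purely illustrative and is not taken from any of the tests:

#include <arm_neon.h>

/* Multiply each 64-bit lane by a hypothetical constant C = (C_HI << 32) | C_LO.
   The constant halves are split at compile time, so no vshrn/vmovn is needed
   for that operand. */
#define C_LO 0x00000005u
#define C_HI 0x00000003u

static uint64x2_t mul_by_const(uint64x2_t top) {
  uint32x2_t c_lo = vdup_n_u32(C_LO);
  uint32x2_t c_hi = vdup_n_u32(C_HI);
  uint32x2_t top_lo = vmovn_u64(top);           /* vmovn.i64 */
  uint32x2_t top_hi = vshrn_n_u64(top, 32);     /* vshrn.i64 */
  uint64x2_t ret = vmull_u32(top_lo, c_hi);     /* vmull.u32 */
  ret = vmlal_u32(ret, top_hi, c_lo);           /* vmlal.u32 */
  ret = vshlq_n_u64(ret, 32);                   /* vshl.i64  */
  return vmlal_u32(ret, top_lo, c_lo);          /* vmlal.u32 */
}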
Index: test/CodeGen/ARM/vmul.ll
===================================================================
--- test/CodeGen/ARM/vmul.ll
+++ test/CodeGen/ARM/vmul.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
 
 define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
@@ -36,6 +37,156 @@
   ret <2 x float> %tmp3
 }
 
+
+define <2 x i64> @vmuli64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+; CHECK-LABEL: vmuli64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
+; CHECK-NEXT:    vrev64.32 q10, q8
+; CHECK-NEXT:    vmovn.i64 d16, q8
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vmul.i32 q10, q10, q9
+; CHECK-NEXT:    vmovn.i64 d17, q9
+; CHECK-NEXT:    vpaddl.u32 q10, q10
+; CHECK-NEXT:    vshl.i64 q9, q10, #32
+; CHECK-NEXT:    vmlal.u32 q9, d17, d16
+; CHECK-NEXT:    vmov r0, r1, d18
+; CHECK-NEXT:    vmov r2, r3, d19
+; CHECK-NEXT:    bx lr
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %tmp3 = mul <2 x i64> %tmp1, %tmp2
+  ret <2 x i64> %tmp3
+}
+
+
+define <2 x i64> @vmuli64_lo_lo(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+; The equivalent of pmuludq for ARM. Codegen needs improvement.
+; TODO: The mask is not required.
+; The important thing for now is a single multiply.
+; CHECK-LABEL: vmuli64_lo_lo:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i64 q8, #0xffffffff
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
+; CHECK-NEXT:    vand q9, q9, q8
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
+; CHECK-NEXT:    vand q8, q10, q8
+; CHECK-NEXT:    vmovn.i64 d18, q9
+; CHECK-NEXT:    vmovn.i64 d16, q8
+; CHECK-NEXT:    vmull.u32 q8, d16, d18
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %tmp1mask = and <2 x i64> %tmp1, <i64 4294967295, i64 4294967295>
+  %tmp2mask = and <2 x i64> %tmp2, <i64 4294967295, i64 4294967295>
+  %ret = mul nuw <2 x i64> %tmp1mask, %tmp2mask
+  ret <2 x i64> %ret
+}
+
+
+define <2 x i64> @vmuli64_hi_all(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+; FIXME: The mask is not required.
+; The important thing for now is a single multiply.
+; CHECK-LABEL: vmuli64_hi_all:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i64 q8, #0xffffffff00000000
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vand q8, q9, q8
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r1]
+; CHECK-NEXT:    vmovn.i64 d18, q10
+; CHECK-NEXT:    vshrn.i64 d16, q8, #32
+; CHECK-NEXT:    vmull.u32 q8, d16, d18
+; CHECK-NEXT:    vshl.i64 q8, q8, #32
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %tmp1mask = and <2 x i64> %tmp1, <i64 -4294967296, i64 -4294967296>
+  %ret = mul nuw <2 x i64> %tmp1mask, %tmp2
+  ret <2 x i64> %ret
+}
+
+
+define <2 x i64> @vmuli64_lo_all(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+; FIXME: The mask is not required.
+; The important thing for now is two multiplies.
+; CHECK-LABEL: vmuli64_lo_all:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i64 q9, #0xffffffff
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
+; CHECK-NEXT:    vand q9, q10, q9
+; CHECK-NEXT:    vshrn.i64 d20, q8, #32
+; CHECK-NEXT:    vmovn.i64 d18, q9
+; CHECK-NEXT:    vmovn.i64 d16, q8
+; CHECK-NEXT:    vmull.u32 q10, d18, d20
+; CHECK-NEXT:    vshl.i64 q10, q10, #32
+; CHECK-NEXT:    vmlal.u32 q10, d18, d16
+; CHECK-NEXT:    vmov r0, r1, d20
+; CHECK-NEXT:    vmov r2, r3, d21
+; CHECK-NEXT:    bx lr
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %tmp1mask = and <2 x i64> %tmp1, <i64 4294967295, i64 4294967295>
+  %ret = mul nuw <2 x i64> %tmp1mask, %tmp2
+  ret <2 x i64> %ret
+}
+
+
+define <2 x i64> @vmuli64_constant(<2 x i64> %A) nounwind {
+; CHECK-LABEL: vmuli64_constant:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vldr d18, .LCPI8_0
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vmovn.i64 d19, q8
+; CHECK-NEXT:    vmull.u32 q10, d19, d18
+; CHECK-NEXT:    vldr d18, .LCPI8_1
+; CHECK-NEXT:    vshrn.i64 d16, q8, #32
+; CHECK-NEXT:    vmlal.u32 q10, d16, d18
+; CHECK-NEXT:    vshl.i64 q8, q10, #32
+; CHECK-NEXT:    vmlal.u32 q8, d19, d18
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 3
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI8_0:
+; CHECK-NEXT:    .long 287 @ 0x11f
+; CHECK-NEXT:    .long 287 @ 0x11f
+; CHECK-NEXT:  .LCPI8_1:
+; CHECK-NEXT:    .long 1912275952 @ 0x71fb03f0
+; CHECK-NEXT:    .long 1912275952 @ 0x71fb03f0
+  %ret = mul <2 x i64> %A, <i64 1234567889904, i64 1234567889904>
+  ret <2 x i64> %ret
+}
+
+
+define <2 x i64> @vmuli64_lo_constant(<2 x i64> %A) nounwind {
+; CHECK-LABEL: vmuli64_lo_constant:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vldr d18, .LCPI9_0
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vshrn.i64 d19, q8, #32
+; CHECK-NEXT:    vmovn.i64 d16, q8
+; CHECK-NEXT:    vmull.u32 q10, d19, d18
+; CHECK-NEXT:    vshl.i64 q10, q10, #32
+; CHECK-NEXT:    vmlal.u32 q10, d16, d18
+; CHECK-NEXT:    vmov r0, r1, d20
+; CHECK-NEXT:    vmov r2, r3, d21
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 3
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI9_0:
+; CHECK-NEXT:    .long 1234 @ 0x4d2
+; CHECK-NEXT:    .long 1234 @ 0x4d2
+  %ret = mul <2 x i64> %A, <i64 1234, i64 1234>
+  ret <2 x i64> %ret
+}
+
+
 define <8 x i8> @vmulp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: vmulp8:
 ;CHECK: vmul.p8
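Note (not part of the patch): the LowerNEONv2i64MUL comment also mentions an interleaved-load form using vld2.32 that is not implemented yet. A rough intrinsics sketch of what that path could look like, assuming the two i64 values are stored little-endian and the pointer is only used by the multiply; the function and variable names are illustrative:

#include <arm_neon.h>

/* Hypothetical future form: vld2.32 delivers the low and high 32-bit words of
   each operand already de-interleaved, saving the vshrn/vmovn pair. */
static uint64x2_t mul_interleaved(const uint32_t *top_ptr,
                                  const uint32_t *bot_ptr) {
  uint32x2x2_t t = vld2_u32(top_ptr);  /* t.val[0] = low words, t.val[1] = high */
  uint32x2x2_t b = vld2_u32(bot_ptr);  /* vld2.32 {botLo, botHi}, [botPtr]      */
  uint64x2_t ret = vmull_u32(t.val[1], b.val[0]);  /* topHi * botLo    */
  ret = vmlal_u32(ret, t.val[0], b.val[1]);        /* += topLo * botHi */
  ret = vshlq_n_u64(ret, 32);                      /* vshl.i64         */
  return vmlal_u32(ret, t.val[0], b.val[0]);       /* += topLo * botLo */
}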