Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -7384,9 +7384,206 @@
   return false;
 }
 
+/// Generates a v2i64 multiply routine for NEON.
+/// Uses one of two versions of the 64-bit multiply algorithm.
+/// Multiplications take a maximum of 11 cycles, possibly fewer depending on
+/// whether an operand is constant or interleaved beforehand.
+/// See the implementation for details.
+static SDValue LowerNEONv2i64MUL(SDValue &Op, SelectionDAG &DAG,
+                                 const EVT &VT) {
+  // The simplest way to efficiently do a v2i64 multiply on NEON is to use the
+  // exact same method as used for SSE, referred to here as "ssemul":
+  //   vshrn.i64 topHi, top, #32     @ v2i32 topHi = top >> 32;
+  //   vmovn.i64 topLo, top          @ v2i32 topLo = top & 0xFFFFFFFF;
+  //   vshrn.i64 botHi, bot, #32     @ v2i32 botHi = bot >> 32;
+  //   vmovn.i64 botLo, bot          @ v2i32 botLo = bot & 0xFFFFFFFF;
+  //   vmull.u32 ret64, topHi, botLo @ v2i64 ret64 = (v2i64)topHi * botLo;
+  //   vmlal.u32 ret64, topLo, botHi @ ret64 += (v2i64)topLo * botHi;
+  //   vshl.i64  ret64, ret64, #32   @ ret64 <<= 32;
+  //   vmlal.u32 ret64, topLo, botLo @ ret64 += (v2i64)topLo * botLo;
+  // However, a second method, "twomul", is four bytes shorter with the same
+  // timing (11 cycles):
+  //   vmovn.i64  topLo, top         @ v2i32 topLo = top & 0xFFFFFFFF;
+  //   vmovn.i64  botLo, bot         @ v2i32 botLo = bot & 0xFFFFFFFF;
+  //                                 @ v4i32 bot32 = (v4i32)bot;
+  //   vrev64.32  botRe, bot         @ v4i32 botRe = (v4i32) {
+  //                                 @   bot32[1], bot32[0],
+  //                                 @   bot32[3], bot32[2]
+  //                                 @ };
+  //   vmul.i32   botRe, botRe, top  @ botRe *= (v4i32)top;
+  //   vpaddl.u32 botRe, botRe       @ botRe = (v2i64) {
+  //                                 @   (u64)botRe[0] + botRe[1],
+  //                                 @   (u64)botRe[2] + botRe[3]
+  //                                 @ };
+  //   vshl.i64   top, botRe, #32    @ top = botRe << 32;
+  //   vmlal.u32  top, topLo, botLo  @ top += (v2i64)topLo * botLo;
+  // However, ssemul can often be simplified, in which case it becomes the
+  // better option. This can happen in one or more of the following ways:
+  //  - The high or low bits of an operand are known to be zero.
+  //  - One or more operands can be interleaved into high and low bits
+  //    beforehand, i.e. changing a vld1q.64 to vld2.32 or pre-swapping a
+  //    constant. This avoids wasting two cycles on vshrn/vmovn. While pointer
+  //    loading has yet to be implemented, constant swapping is mostly
+  //    complete.
+  // In the interleaved case, the sequence looks like this:
+  //   vld2.32   {topLo, topHi}, [topPtr]
+  //   vld2.32   {botLo, botHi}, [botPtr]
+  //   vmull.u32 ret64, topHi, botLo
+  //   vmlal.u32 ret64, topLo, botHi
+  //   vshl.i64  ret64, ret64, #32
+  //   vmlal.u32 ret64, topLo, botLo
+  // This optimization only works if the pointer hasn't already been loaded
+  // and is only used for the multiply, as swapping back removes the benefit.
+  SDLoc DL(Op);
+  SDValue Top = Op.getOperand(0);
+  SDValue Bot = Op.getOperand(1);
+
+  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
+  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
+
+  bool TopLoIsZero, BotLoIsZero, TopHiIsZero, BotHiIsZero;
+  bool TopIsInterleaved = false, BotIsInterleaved = false;
+  std::pair<SDValue, SDValue> TopPair, BotPair;
+  // If we have a constant operand, and we only use that operand once,
+  // we can interleave the bits at compile time to avoid vshrn and vmovn.
+  // The goal is a 0, 2, 1, 3 shuffle of the operand reinterpreted as a v4i32.
+  // TODO: Make this use one load instead of two.
+  // Currently, this saves one cycle, but it can save two if we can
+  // combine the loads together.
+  // TODO: Detect if we use a constant multiple times, but only for
+  // multiplication.
+  if (ISD::isBuildVectorOfConstantSDNodes(Top.getOperand(0).getNode()) &&
+      Top.getOperand(0).hasOneUse()) {
+    // Bitcast to v4i32.
+    SDValue TopRaw = DAG.getBitcast(MVT::v4i32, Top.getOperand(0));
+    SmallVector<uint32_t, 4> RawValues;
+    // Gather up all of the values.
+    for (int i = 0; i < 4; i++) {
+      RawValues.push_back(uint32_t(TopRaw.getConstantOperandVal(i)));
+    }
+    // Might as well do the zero calculations while we have easy integers.
+    TopLoIsZero = (RawValues[0] == 0 && RawValues[2] == 0);
+    TopHiIsZero = (RawValues[1] == 0 && RawValues[3] == 0);
+
+    SmallVector<SDValue, 4> Reordered;
+    // 0, 2, 1, 3 puts the low bits first and the high bits last.
+    Reordered.push_back(DAG.getConstant(RawValues[0], DL, MVT::i32));
+    Reordered.push_back(DAG.getConstant(RawValues[2], DL, MVT::i32));
+    Reordered.push_back(DAG.getConstant(RawValues[1], DL, MVT::i32));
+    Reordered.push_back(DAG.getConstant(RawValues[3], DL, MVT::i32));
+    // Create the vector.
+    SDValue TopInterleaved = DAG.getBuildVector(MVT::v4i32, DL, Reordered);
+    // Split it into two halves, topLo and topHi.
+    TopPair = DAG.SplitVector(TopInterleaved, DL, MVT::v2i32, MVT::v2i32);
+    // Mark that we don't need to separate these later.
+    TopIsInterleaved = true;
+  } else {
+    KnownBits TopKnown = DAG.computeKnownBits(Top);
+    TopLoIsZero = LowerBitsMask.isSubsetOf(TopKnown.Zero);
+    TopHiIsZero = UpperBitsMask.isSubsetOf(TopKnown.Zero);
+  }
+
+  if (ISD::isBuildVectorOfConstantSDNodes(Bot.getOperand(0).getNode()) &&
+      Bot.getOperand(0).hasOneUse()) {
+    SDValue BotRaw = DAG.getBitcast(MVT::v4i32, Bot.getOperand(0));
+    SmallVector<uint32_t, 4> RawValues;
+    for (int i = 0; i < 4; i++) {
+      RawValues.push_back(uint32_t(BotRaw.getConstantOperandVal(i)));
+    }
+    BotLoIsZero = (RawValues[0] == 0 && RawValues[2] == 0);
+    BotHiIsZero = (RawValues[1] == 0 && RawValues[3] == 0);
+    SmallVector<SDValue, 4> Reordered;
+    // 0, 2, 1, 3 puts the low bits first and the high bits last.
+    Reordered.push_back(DAG.getConstant(RawValues[0], DL, MVT::i32));
+    Reordered.push_back(DAG.getConstant(RawValues[2], DL, MVT::i32));
+    Reordered.push_back(DAG.getConstant(RawValues[1], DL, MVT::i32));
+    Reordered.push_back(DAG.getConstant(RawValues[3], DL, MVT::i32));
+    SDValue BotInterleaved = DAG.getBuildVector(MVT::v4i32, DL, Reordered);
+    BotPair = DAG.SplitVector(BotInterleaved, DL, MVT::v2i32, MVT::v2i32);
+    BotIsInterleaved = true;
+  } else {
+    KnownBits BotKnown = DAG.computeKnownBits(Bot);
+    BotLoIsZero = LowerBitsMask.isSubsetOf(BotKnown.Zero);
+    BotHiIsZero = UpperBitsMask.isSubsetOf(BotKnown.Zero);
+  }
+
+  SDValue Zero = DAG.getConstant(0, DL, VT);
+  SDValue C32 = DAG.getConstant(32, DL, MVT::i32);
+
+  // Either way, we need the low bits and TopLo * BotLo.
+  // vmovn.i64
+  SDValue TopLo =
+      TopIsInterleaved ? TopPair.first
+                       : DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32, Top);
+  // vmovn.i64
+  SDValue BotLo =
+      BotIsInterleaved ? BotPair.first
+                       : DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32, Bot);
+
+  SDValue TopLoBotLo = Zero;
+  if (!TopLoIsZero && !BotLoIsZero)
+    // vmull.u32
+    TopLoBotLo = DAG.getNode(ARMISD::VMULLu, DL, VT, TopLo, BotLo);
+
+  // Don't go any further if we are only multiplying low bits.
+  // This is needed until further optimizations can be made.
+  if (TopHiIsZero && BotHiIsZero)
+    return TopLoBotLo;
+
+  // The following block is the twomul routine. We only use it when we
+  // can't tell if any of the words are zero, and we are not interleaving.
+  if (!TopIsInterleaved && !BotIsInterleaved &&
+      !TopLoIsZero && !BotLoIsZero && !TopHiIsZero && !BotHiIsZero) {
+    Bot = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, Bot);
+    // vrev64.32
+    SDValue BotRev = DAG.getNode(ARMISD::VREV64, DL, MVT::v4i32, Bot);
+    Top = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, Top);
+    // vmul.i32
+    SDValue CrossMul = DAG.getNode(ISD::MUL, DL, MVT::v4i32, BotRev, Top);
+    // vpaddl.u32
+    SDValue Merged = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, MVT::v2i64,
+        DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL, MVT::i32), CrossMul);
+    // vshl.i64
+    SDValue Shifted = DAG.getNode(ARMISD::VSHL, DL, VT, Merged, C32);
+    return DAG.getNode(ISD::ADD, DL, VT, Shifted, TopLoBotLo);
+  }
+
+  // This is the ssemul routine.
+  SDValue TopLoBotHi = Zero;
+  if (!TopLoIsZero && !BotHiIsZero) {
+    // vshrn.i64
+    SDValue BotHi =
+        BotIsInterleaved
+            ? BotPair.second
+            : DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32,
+                          DAG.getNode(ARMISD::VSHRu, DL, VT, Bot, C32));
+    // vmlal.u32 (merges with add below)
+    TopLoBotHi = DAG.getNode(ARMISD::VMULLu, DL, VT, TopLo, BotHi);
+  }
+  SDValue TopHiBotLo = Zero;
+  if (!TopHiIsZero && !BotLoIsZero) {
+    // vshrn.i64
+    SDValue TopHi =
+        TopIsInterleaved
+            ? TopPair.second
+            : DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32,
+                          DAG.getNode(ARMISD::VSHRu, DL, VT, Top, C32));
+    // vmlal.u32 (merges with add below)
+    TopHiBotLo = DAG.getNode(ARMISD::VMULLu, DL, VT, TopHi, BotLo);
+  }
+
+  // (optimized to vmlal.u32)
+  SDValue High = DAG.getNode(ISD::ADD, DL, VT, TopLoBotHi, TopHiBotLo);
+  // vshl.i64
+  SDValue Shifted = DAG.getNode(ARMISD::VSHL, DL, VT, High, C32);
+  // (optimized to vmlal.u32)
+  return DAG.getNode(ISD::ADD, DL, VT, TopLoBotLo, Shifted);
+}
+
 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
   // Multiplications are only custom-lowered for 128-bit vectors so that
-  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
+  // VMULL can be detected.
   EVT VT = Op.getValueType();
   assert(VT.is128BitVector() && VT.isInteger() &&
          "unexpected type for custom-lowering ISD::MUL");
@@ -7420,12 +7617,11 @@
   }
 
   if (!NewOpc) {
-    if (VT == MVT::v2i64)
-      // Fall through to expand this. It is not legal.
-      return SDValue();
-    else
-      // Other vector multiplications are legal.
-      return Op;
+    if (VT == MVT::v2i64) {
+      return LowerNEONv2i64MUL(Op, DAG, VT);
+    }
+    // Other vector multiplications are legal.
+    return Op;
   }
 }
 
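Note (not part of the patch): for reference, the two sequences described in the LowerNEONv2i64MUL comment correspond roughly to the following NEON intrinsics. This is only a sketch to make the algorithms easier to check; the function names are illustrative, and the lowering builds the equivalent DAG nodes directly rather than calling intrinsics.

#include <arm_neon.h>

/* "twomul": used when nothing is known about either operand. */
static uint64x2_t twomul(uint64x2_t top, uint64x2_t bot) {
  uint32x2_t top_lo = vmovn_u64(top);                           /* vmovn.i64  */
  uint32x2_t bot_lo = vmovn_u64(bot);                           /* vmovn.i64  */
  uint32x4_t bot_re = vrev64q_u32(vreinterpretq_u32_u64(bot));  /* vrev64.32  */
  uint32x4_t cross = vmulq_u32(bot_re, vreinterpretq_u32_u64(top)); /* vmul.i32 */
  uint64x2_t ret = vpaddlq_u32(cross);                          /* vpaddl.u32 */
  ret = vshlq_n_u64(ret, 32);                                   /* vshl.i64   */
  return vmlal_u32(ret, top_lo, bot_lo);                        /* vmlal.u32  */
}

/* "ssemul": the SSE-style split into 32-bit halves. */
static uint64x2_t ssemul(uint64x2_t top, uint64x2_t bot) {
  uint32x2_t top_hi = vshrn_n_u64(top, 32);                     /* vshrn.i64  */
  uint32x2_t top_lo = vmovn_u64(top);                           /* vmovn.i64  */
  uint32x2_t bot_hi = vshrn_n_u64(bot, 32);                     /* vshrn.i64  */
  uint32x2_t bot_lo = vmovn_u64(bot);                           /* vmovn.i64  */
  uint64x2_t ret = vmull_u32(top_hi, bot_lo);                   /* vmull.u32  */
  ret = vmlal_u32(ret, top_lo, bot_hi);                         /* vmlal.u32  */
  ret = vshlq_n_u64(ret, 32);                                   /* vshl.i64   */
  return vmlal_u32(ret, top_lo, bot_lo);                        /* vmlal.u32  */
}

Both forms compute each 64-bit lane modulo 2^64, since the cross products only contribute to the shifted-in high half.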
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -487,7 +487,7 @@
   static const CostTblEntry CostTbl[] = {
     // Division.
     // These costs are somewhat random. Choose a cost of 20 to indicate that
-    // vectorizing devision (added function call) is going to be very expensive.
+    // vectorizing division (added function call) is going to be very expensive.
     // Double registers types.
     { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
     { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
@@ -522,12 +522,19 @@
     { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
     { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
     { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
-    // Multiplication.
+    // Multiplication. Not sure what exact value to use here, but the main
+    // thing is that it is cheaper to multiply if we are already in a vector.
+    { ISD::MUL,  MVT::v2i64, 8 },
   };
 
-  if (ST->hasNEON())
+  if (ST->hasNEON()) {
+    // Multiply by constant is a little cheaper.
+    if (ISDOpcode == ISD::MUL && LT.second == MVT::v2i64 &&
+        Op2Info == TargetTransformInfo::OK_UniformConstantValue)
+      return LT.first * 7;
     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
       return LT.first * Entry->Cost;
+  }
 
   int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                            Opd1PropInfo, Opd2PropInfo);
Index: test/Analysis/CostModel/ARM/mult.ll
===================================================================
--- test/Analysis/CostModel/ARM/mult.ll
+++ test/Analysis/CostModel/ARM/mult.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=cortex-a9 | FileCheck %s
+
+define <2 x i8> @mul_v2_i8(<2 x i8> %a, <2 x i8> %b) {
+; CHECK-LABEL: 'mul_v2_i8'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul <2 x i8> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i8> %1
+;
+
+  %1 = mul <2 x i8> %a, %b
+  ret <2 x i8> %1
+}
+define <2 x i16> @mul_v2_i16(<2 x i16> %a, <2 x i16> %b) {
+; CHECK-LABEL: 'mul_v2_i16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul <2 x i16> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i16> %1
+;
+
+  %1 = mul <2 x i16> %a, %b
+  ret <2 x i16> %1
+}
+define <2 x i32> @mul_v2_i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: 'mul_v2_i32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul <2 x i32> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
+;
+
+  %1 = mul <2 x i32> %a, %b
+  ret <2 x i32> %1
+}
+define <2 x i64> @mul_v2_i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: 'mul_v2_i64'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %1 = mul <2 x i64> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %1
+;
+
+  %1 = mul <2 x i64> %a, %b
+  ret <2 x i64> %1
+}
+define <4 x i8> @mul_v4_i8(<4 x i8> %a, <4 x i8> %b) {
+; CHECK-LABEL: 'mul_v4_i8'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul <4 x i8> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %1
+;
+
+  %1 = mul <4 x i8> %a, %b
+  ret <4 x i8> %1
+}
+define <4 x i16> @mul_v4_i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: 'mul_v4_i16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul <4 x i16> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %1
+;
+
+  %1 = mul <4 x i16> %a, %b
+  ret <4 x i16> %1
+}
+define <4 x i32> @mul_v4_i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: 'mul_v4_i32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = mul <4 x i32> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %1
+;
+
+  %1 = mul <4 x i32> %a, %b
+  ret <4 x i32> %1
+}
+define <4 x i64> @mul_v4_i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: 'mul_v4_i64'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %1 = mul <4 x i64> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %1
+;
+
+  %1 = mul <4 x i64> %a, %b
+  ret <4 x i64> %1
+}
+define <8 x i8> @mul_v8_i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: 'mul_v8_i8'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul <8 x i8> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %1
+;
+
+  %1 = mul <8 x i8> %a, %b
+  ret <8 x i8> %1
+}
+define <8 x i16> @mul_v8_i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: 'mul_v8_i16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = mul <8 x i16> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %1
+;
+
+  %1 = mul <8 x i16> %a, %b
+  ret <8 x i16> %1
+}
+define <8 x i32> @mul_v8_i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: 'mul_v8_i32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %1 = mul <8 x i32> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %1
+;
+
+  %1 = mul <8 x i32> %a, %b
+  ret <8 x i32> %1
+}
+define <8 x i64> @mul_v8_i64(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: 'mul_v8_i64'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %1 = mul <8 x i64> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %1
+;
+
+  %1 = mul <8 x i64> %a, %b
+  ret <8 x i64> %1
+}
+define <16 x i8> @mul_v16_i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: 'mul_v16_i8'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul <16 x i8> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %1
+;
+
+  %1 = mul <16 x i8> %a, %b
+  ret <16 x i8> %1
+}
+define <16 x i16> @mul_v16_i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: 'mul_v16_i16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %1 = mul <16 x i16> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %1
+;
+
+  %1 = mul <16 x i16> %a, %b
+  ret <16 x i16> %1
+}
+define <16 x i32> @mul_v16_i32(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: 'mul_v16_i32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %1 = mul <16 x i32> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %1
+;
+
+  %1 = mul <16 x i32> %a, %b
+  ret <16 x i32> %1
+}
+define <16 x i64> @mul_v16_i64(<16 x i64> %a, <16 x i64> %b) {
+; CHECK-LABEL: 'mul_v16_i64'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %1 = mul <16 x i64> %a, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i64> %1
+;
+
+  %1 = mul <16 x i64> %a, %b
+  ret <16 x i64> %1
+}
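Note (not part of the patch): the TargetTransformInfo change above charges a uniform-constant v2i64 multiply slightly less (7 vs. 8) because a constant operand can be pre-interleaved at compile time. As a rough, non-authoritative sketch of that pre-interleaved ssemul form in NEON intrinsics; the constant below is purely illustrative and is not taken from any of the tests:

#include <arm_neon.h>

/* Multiply each 64-bit lane by a hypothetical constant C = (C_HI << 32) | C_LO.
   The constant halves are split at compile time, so no vshrn/vmovn is needed
   for that operand. */
#define C_LO 0x00000005u
#define C_HI 0x00000003u

static uint64x2_t mul_by_const(uint64x2_t top) {
  uint32x2_t c_lo = vdup_n_u32(C_LO);
  uint32x2_t c_hi = vdup_n_u32(C_HI);
  uint32x2_t top_lo = vmovn_u64(top);           /* vmovn.i64 */
  uint32x2_t top_hi = vshrn_n_u64(top, 32);     /* vshrn.i64 */
  uint64x2_t ret = vmull_u32(top_lo, c_hi);     /* vmull.u32 */
  ret = vmlal_u32(ret, top_hi, c_lo);           /* vmlal.u32 */
  ret = vshlq_n_u64(ret, 32);                   /* vshl.i64  */
  return vmlal_u32(ret, top_lo, c_lo);          /* vmlal.u32 */
}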
Index: test/CodeGen/ARM/vmul.ll
===================================================================
--- test/CodeGen/ARM/vmul.ll
+++ test/CodeGen/ARM/vmul.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
 
 define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
@@ -36,6 +37,156 @@
   ret <2 x float> %tmp3
 }
 
+
+define <2 x i64> @vmuli64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+; CHECK-LABEL: vmuli64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
+; CHECK-NEXT:    vrev64.32 q10, q8
+; CHECK-NEXT:    vmovn.i64 d16, q8
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vmul.i32 q10, q10, q9
+; CHECK-NEXT:    vmovn.i64 d17, q9
+; CHECK-NEXT:    vpaddl.u32 q10, q10
+; CHECK-NEXT:    vshl.i64 q9, q10, #32
+; CHECK-NEXT:    vmlal.u32 q9, d17, d16
+; CHECK-NEXT:    vmov r0, r1, d18
+; CHECK-NEXT:    vmov r2, r3, d19
+; CHECK-NEXT:    bx lr
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %tmp3 = mul <2 x i64> %tmp1, %tmp2
+  ret <2 x i64> %tmp3
+}
+
+
+define <2 x i64> @vmuli64_lo_lo(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+; The equivalent of pmuludq for ARM. Codegen needs improvement.
+; TODO: The mask is not required.
+; The important thing for now is a single multiply.
+; CHECK-LABEL: vmuli64_lo_lo:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i64 q8, #0xffffffff
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
+; CHECK-NEXT:    vand q9, q9, q8
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
+; CHECK-NEXT:    vand q8, q10, q8
+; CHECK-NEXT:    vmovn.i64 d18, q9
+; CHECK-NEXT:    vmovn.i64 d16, q8
+; CHECK-NEXT:    vmull.u32 q8, d16, d18
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %tmp1mask = and <2 x i64> %tmp1, <i64 4294967295, i64 4294967295>
+  %tmp2mask = and <2 x i64> %tmp2, <i64 4294967295, i64 4294967295>
+  %ret = mul nuw <2 x i64> %tmp1mask, %tmp2mask
+  ret <2 x i64> %ret
+}
+
+
+define <2 x i64> @vmuli64_hi_all(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+; FIXME: The mask is not required.
+; The important thing for now is a single multiply.
+; CHECK-LABEL: vmuli64_hi_all:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i64 q8, #0xffffffff00000000
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vand q8, q9, q8
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r1]
+; CHECK-NEXT:    vmovn.i64 d18, q10
+; CHECK-NEXT:    vshrn.i64 d16, q8, #32
+; CHECK-NEXT:    vmull.u32 q8, d16, d18
+; CHECK-NEXT:    vshl.i64 q8, q8, #32
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %tmp1mask = and <2 x i64> %tmp1, <i64 -4294967296, i64 -4294967296>
+  %ret = mul nuw <2 x i64> %tmp1mask, %tmp2
+  ret <2 x i64> %ret
+}
+
+
+define <2 x i64> @vmuli64_lo_all(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+; FIXME: The mask is not required.
+; The important thing for now is two multiplies.
+; CHECK-LABEL: vmuli64_lo_all:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i64 q9, #0xffffffff
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
+; CHECK-NEXT:    vand q9, q10, q9
+; CHECK-NEXT:    vshrn.i64 d20, q8, #32
+; CHECK-NEXT:    vmovn.i64 d18, q9
+; CHECK-NEXT:    vmovn.i64 d16, q8
+; CHECK-NEXT:    vmull.u32 q10, d18, d20
+; CHECK-NEXT:    vshl.i64 q10, q10, #32
+; CHECK-NEXT:    vmlal.u32 q10, d18, d16
+; CHECK-NEXT:    vmov r0, r1, d20
+; CHECK-NEXT:    vmov r2, r3, d21
+; CHECK-NEXT:    bx lr
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %tmp1mask = and <2 x i64> %tmp1, <i64 4294967295, i64 4294967295>
+  %ret = mul nuw <2 x i64> %tmp1mask, %tmp2
+  ret <2 x i64> %ret
+}
+
+
+define <2 x i64> @vmuli64_constant(<2 x i64> %A) nounwind {
+; CHECK-LABEL: vmuli64_constant:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vldr d18, .LCPI8_0
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vmovn.i64 d19, q8
+; CHECK-NEXT:    vmull.u32 q10, d19, d18
+; CHECK-NEXT:    vldr d18, .LCPI8_1
+; CHECK-NEXT:    vshrn.i64 d16, q8, #32
+; CHECK-NEXT:    vmlal.u32 q10, d16, d18
+; CHECK-NEXT:    vshl.i64 q8, q10, #32
+; CHECK-NEXT:    vmlal.u32 q8, d19, d18
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 3
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI8_0:
+; CHECK-NEXT:    .long 287 @ 0x11f
+; CHECK-NEXT:    .long 287 @ 0x11f
+; CHECK-NEXT:  .LCPI8_1:
+; CHECK-NEXT:    .long 1912275952 @ 0x71fb03f0
+; CHECK-NEXT:    .long 1912275952 @ 0x71fb03f0
+  %ret = mul <2 x i64> %A, <i64 1234567889904, i64 1234567889904>
+  ret <2 x i64> %ret
+}
+
+
+define <2 x i64> @vmuli64_lo_constant(<2 x i64> %A) nounwind {
+; CHECK-LABEL: vmuli64_lo_constant:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vldr d18, .LCPI9_0
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vshrn.i64 d19, q8, #32
+; CHECK-NEXT:    vmovn.i64 d16, q8
+; CHECK-NEXT:    vmull.u32 q10, d19, d18
+; CHECK-NEXT:    vshl.i64 q10, q10, #32
+; CHECK-NEXT:    vmlal.u32 q10, d16, d18
+; CHECK-NEXT:    vmov r0, r1, d20
+; CHECK-NEXT:    vmov r2, r3, d21
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 3
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI9_0:
+; CHECK-NEXT:    .long 1234 @ 0x4d2
+; CHECK-NEXT:    .long 1234 @ 0x4d2
+  %ret = mul <2 x i64> %A, <i64 1234, i64 1234>
+  ret <2 x i64> %ret
+}
+
+
 define <8 x i8> @vmulp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: vmulp8:
 ;CHECK: vmul.p8
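Note (not part of the patch): the LowerNEONv2i64MUL comment also mentions an interleaved-load form using vld2.32 that is not implemented yet. A rough intrinsics sketch of what that path could look like, assuming the two i64 values are stored little-endian and the pointer is only used by the multiply; the function and variable names are illustrative:

#include <arm_neon.h>

/* Hypothetical future form: vld2.32 delivers the low and high 32-bit words of
   each operand already de-interleaved, saving the vshrn/vmovn pair. */
static uint64x2_t mul_interleaved(const uint32_t *top_ptr,
                                  const uint32_t *bot_ptr) {
  uint32x2x2_t t = vld2_u32(top_ptr);  /* t.val[0] = low words, t.val[1] = high */
  uint32x2x2_t b = vld2_u32(bot_ptr);  /* vld2.32 {botLo, botHi}, [botPtr]      */
  uint64x2_t ret = vmull_u32(t.val[1], b.val[0]);  /* topHi * botLo    */
  ret = vmlal_u32(ret, t.val[0], b.val[1]);        /* += topLo * botHi */
  ret = vshlq_n_u64(ret, 32);                      /* vshl.i64         */
  return vmlal_u32(ret, t.val[0], b.val[0]);       /* += topLo * botLo */
}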