Index: Target/ARM/ARMISelLowering.cpp
===================================================================
--- Target/ARM/ARMISelLowering.cpp
+++ Target/ARM/ARMISelLowering.cpp
@@ -7386,7 +7386,7 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
   // Multiplications are only custom-lowered for 128-bit vectors so that
-  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
+  // VMULL can be detected.
   EVT VT = Op.getValueType();
   assert(VT.is128BitVector() && VT.isInteger() &&
          "unexpected type for custom-lowering ISD::MUL");
@@ -7420,10 +7420,92 @@
   }
 
   if (!NewOpc) {
-    if (VT == MVT::v2i64)
-      // Fall through to expand this. It is not legal.
-      return SDValue();
-    else
+    if (VT == MVT::v2i64) {
+      // One optimal way of doing an i64x2 multiply is the same way as for SSE:
+      // vshrn.i64 topHi, top, #32     @ v2i32 topHi = top >> 32;
+      // vmovn.i64 topLo, top          @ v2i32 topLo = top & 0xFFFFFFFF;
+      // vshrn.i64 botHi, bot, #32     @ v2i32 botHi = bot >> 32;
+      // vmovn.i64 botLo, bot          @ v2i32 botLo = bot & 0xFFFFFFFF;
+      // vmull.u32 ret64, topHi, botLo @ v2i64 ret64 = (v2i64)topHi * (v2i64)botLo;
+      // vmlal.u32 ret64, topLo, botHi @ ret64 += (v2i64)topLo * (v2i64)botHi;
+      // vshl.i64 ret64, ret64, #32    @ ret64 <<= 32;
+      // vmlal.u32 ret64, topLo, botLo @ ret64 += (v2i64)topLo * (v2i64)botLo;
+      // TODO: Implement the faster but less modular way, which is this:
+      // vmovn.i64 topLo, top       @ v2i32 topLo = top & 0xFFFFFFFF;
+      // vmovn.i64 botLo, bot       @ v2i32 botLo = bot & 0xFFFFFFFF;
+      //                            @ v4i32 bot32 = (v4i32)bot;
+      // vrev64.32 botRe, bot       @ v4i32 botRe = (v4i32) {
+      //                            @     bot32[1], bot32[0],
+      //                            @     bot32[3], bot32[2]
+      //                            @ };
+      // vmul.i32 botRe, botRe, top @ botRe *= (v4i32)top;
+      // vpaddl.u32 botRe, botRe    @ botRe = (v2i64) {
+      //                            @     (u64)botRe[0] + (u64)botRe[1],
+      //                            @     (u64)botRe[2] + (u64)botRe[3]
+      //                            @ }
+      // vshl.i64 top, botRe, #32   @ top = botRe << 32;
+      // vmlal.u32 top, topLo, botLo @ top += (v2i64)topLo * (v2i64)botLo;
+      // Also, ideally, make it so optimizations can interleave one or more
+      // loads as a uint32x2x2_t for the first variant to avoid vshrn/vmovn
+      // like so:
+      // vld2.32 {topLo,topHi} [topPtr]
+      // vld2.32 {botLo,botHi} [botPtr]
+      // vmull.u32 ret64, topHi, botLo
+      // vmlal.u32 ret64, topLo, botHi
+      // vshl.i64 ret64, ret64, #32
+      // vmlal.u32 ret64, topLo, botLo
+      // This optimization only works if the pointer hasn't been loaded and is
+      // only used for the multiply, as swapping back removes the benefit.
+      // Also, (top & 0xFFFFFFFF) * (bot & 0xFFFFFFFF) should not generate a
+      // mask.
+      SDLoc DL(Op);
+      SDValue top = Op.getOperand(0);
+      SDValue bot = Op.getOperand(1);
+      KnownBits topKnown = DAG.computeKnownBits(top);
+      KnownBits botKnown = DAG.computeKnownBits(bot);
+
+      APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
+      bool topLoIsZero = LowerBitsMask.isSubsetOf(topKnown.Zero);
+      bool botLoIsZero = LowerBitsMask.isSubsetOf(botKnown.Zero);
+
+      APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
+      bool topHiIsZero = UpperBitsMask.isSubsetOf(topKnown.Zero);
+      bool botHiIsZero = UpperBitsMask.isSubsetOf(botKnown.Zero);
+      SDValue topLo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32, top);
+      SDValue botLo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32, bot);
+
+      SDValue c32 = DAG.getConstant(32, DL, MVT::i32);
+      SDValue Zero = DAG.getConstant(0, DL, VT);
+
+      SDValue topLoBotLo = Zero;
+      if (!topLoIsZero && !botLoIsZero)
+        topLoBotLo = DAG.getNode(ARMISD::VMULLu, DL, VT, topLo, botLo);
+
+      // Don't go any further if we are only multiplying low bits.
+      if (topHiIsZero && botHiIsZero)
+        return topLoBotLo;
+
+      SDValue topLoBotHi = Zero;
+      if (!topLoIsZero && !botHiIsZero) {
+        SDValue botHi =
+            DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32,
+                        DAG.getNode(ARMISD::VSHRu, DL, VT, bot, c32));
+        topLoBotHi = DAG.getNode(ARMISD::VMULLu, DL, VT, topLo, botHi);
+      }
+      SDValue topHiBotLo = Zero;
+      if (!topHiIsZero && !botLoIsZero) {
+        SDValue topHi =
+            DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32,
+                        DAG.getNode(ARMISD::VSHRu, DL, VT, top, c32));
+        topHiBotLo = DAG.getNode(ARMISD::VMULLu, DL, VT, topHi, botLo);
+      }
+
+      // (optimized to vmlal.u32)
+      SDValue Hi = DAG.getNode(ISD::ADD, DL, VT, topLoBotHi, topHiBotLo);
+      Hi = DAG.getNode(ARMISD::VSHL, DL, VT, Hi, c32);
+      // (optimized to vmlal.u32)
+      return DAG.getNode(ISD::ADD, DL, VT, topLoBotLo, Hi);
+    } else
       // Other vector multiplications are legal.
       return Op;
   }
Index: Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- Target/ARM/ARMTargetTransformInfo.cpp
+++ Target/ARM/ARMTargetTransformInfo.cpp
@@ -523,6 +523,9 @@
     { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
     { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
     // Multiplication.
+    // v2i64 multiplies are custom-lowered (see LowerMUL) to a
+    // VMULL/VMLAL/VSHL sequence of roughly 11 instructions.
+    { ISD::MUL,  MVT::v2i64, 11},
   };
 
   if (ST->hasNEON())