Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -7386,7 +7386,7 @@
 
 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
   // Multiplications are only custom-lowered for 128-bit vectors so that
-  // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
+  // VMULL can be detected.
   EVT VT = Op.getValueType();
   assert(VT.is128BitVector() && VT.isInteger() &&
          "unexpected type for custom-lowering ISD::MUL");
@@ -7420,10 +7420,92 @@
   }
 
   if (!NewOpc) {
-    if (VT == MVT::v2i64)
-      // Fall through to expand this.  It is not legal.
-      return SDValue();
-    else
+    if (VT == MVT::v2i64) {
+      // One optimal way of lowering a v2i64 multiply is the same way as for
+      // SSE:
+      //   vshrn.i64 topHi, top, #32     @ v2i32 topHi = top >> 32;
+      //   vmovn.i64 topLo, top          @ v2i32 topLo = top & 0xFFFFFFFF;
+      //   vshrn.i64 botHi, bot, #32     @ v2i32 botHi = bot >> 32;
+      //   vmovn.i64 botLo, bot          @ v2i32 botLo = bot & 0xFFFFFFFF;
+      //   vmull.u32 ret64, topHi, botLo @ v2i64 ret64 = (v2i64)topHi * (v2i64)botLo;
+      //   vmlal.u32 ret64, topLo, botHi @ ret64 += (v2i64)topLo * (v2i64)botHi;
+      //   vshl.i64  ret64, ret64, #32   @ ret64 <<= 32;
+      //   vmlal.u32 ret64, topLo, botLo @ ret64 += (v2i64)topLo * (v2i64)botLo;
+      // TODO: Implement the faster but less modular way, which is this:
+      //   vmovn.i64 topLo, top        @ v2i32 topLo = top & 0xFFFFFFFF;
+      //   vmovn.i64 botLo, bot        @ v2i32 botLo = bot & 0xFFFFFFFF;
+      //                               @ v4i32 bot32 = (v4i32)bot;
+      //   vrev64.32 botRe, bot        @ v4i32 botRe = (v4i32) {
+      //                               @     bot32[1], bot32[0],
+      //                               @     bot32[3], bot32[2]
+      //                               @ };
+      //   vmul.i32  botRe, botRe, top @ botRe *= (v4i32)top;
+      //   vpaddl.u32 botRe, botRe     @ v2i64 botRe = (v2i64) {
+      //                               @     (u64)botRe[0] + (u64)botRe[1],
+      //                               @     (u64)botRe[2] + (u64)botRe[3]
+      //                               @ };
+      //   vshl.i64  top, botRe, #32   @ top = botRe << 32;
+      //   vmlal.u32 top, topLo, botLo @ top += (v2i64)topLo * (v2i64)botLo;
+      // Also, ideally, make it so optimizations can interleave one or more of
+      // the loads as a uint32x2x2_t (vld2.32) for the first variant, avoiding
+      // the vshrn/vmovn, like so:
+      //   vld2.32   {topLo, topHi}, [topPtr]
+      //   vld2.32   {botLo, botHi}, [botPtr]
+      //   vmull.u32 ret64, topHi, botLo
+      //   vmlal.u32 ret64, topLo, botHi
+      //   vshl.i64  ret64, ret64, #32
+      //   vmlal.u32 ret64, topLo, botLo
+      // That only works when the value has not already been loaded and the
+      // load feeds nothing but the multiply, as deinterleaving it back
+      // afterwards removes the benefit.
+      // Also, (top & 0xFFFFFFFF) * (bot & 0xFFFFFFFF) should not generate a
+      // mask.
+      SDLoc DL(Op);
+      SDValue top = Op.getOperand(0);
+      SDValue bot = Op.getOperand(1);
+      KnownBits topKnown = DAG.computeKnownBits(top);
+      KnownBits botKnown = DAG.computeKnownBits(bot);
+
+      APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
+      bool topLoIsZero = LowerBitsMask.isSubsetOf(topKnown.Zero);
+      bool botLoIsZero = LowerBitsMask.isSubsetOf(botKnown.Zero);
+
+      APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
+      bool topHiIsZero = UpperBitsMask.isSubsetOf(topKnown.Zero);
+      bool botHiIsZero = UpperBitsMask.isSubsetOf(botKnown.Zero);
+      SDValue topLo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32, top);
+      SDValue botLo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32, bot);
+
+      SDValue c32 = DAG.getConstant(32, DL, MVT::i32);
+      SDValue Zero = DAG.getConstant(0, DL, VT);
+
+      SDValue topLoBotLo = Zero;
+      if (!topLoIsZero && !botLoIsZero)
+        topLoBotLo = DAG.getNode(ARMISD::VMULLu, DL, VT, topLo, botLo);
+
+      // Don't go any further if we are only multiplying low bits.
+      if (topHiIsZero && botHiIsZero)
+        return topLoBotLo;
+
+      SDValue topLoBotHi = Zero;
+      if (!topLoIsZero && !botHiIsZero) {
+        SDValue botHi =
+            DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32,
+                        DAG.getNode(ARMISD::VSHRu, DL, VT, bot, c32));
+        topLoBotHi = DAG.getNode(ARMISD::VMULLu, DL, VT, topLo, botHi);
+      }
+      SDValue topHiBotLo = Zero;
+      if (!topHiIsZero && !botLoIsZero) {
+        SDValue topHi =
+            DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32,
+                        DAG.getNode(ARMISD::VSHRu, DL, VT, top, c32));
+        topHiBotLo = DAG.getNode(ARMISD::VMULLu, DL, VT, topHi, botLo);
+      }
+
+      // (optimized to vmlal.u32)
+      SDValue Hi = DAG.getNode(ISD::ADD, DL, VT, topLoBotHi, topHiBotLo);
+      Hi = DAG.getNode(ARMISD::VSHL, DL, VT, Hi, c32);
+      // (optimized to vmlal.u32)
+      return DAG.getNode(ISD::ADD, DL, VT, topLoBotLo, Hi);
+    } else
       // Other vector multiplications are legal.
       return Op;
   }
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -523,6 +523,7 @@
     { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
     { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
     // Multiplication.
+    { ISD::MUL, MVT::v2i64, 11},
   };
 
   if (ST->hasNEON())
Index: test/Analysis/CostModel/ARM/mult.ll
===================================================================
--- test/Analysis/CostModel/ARM/mult.ll
+++ test/Analysis/CostModel/ARM/mult.ll
@@ -0,0 +1,114 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=cortex-a9 | FileCheck %s
+
+define <2 x i8> @mul_v2_i8(<2 x i8> %a, <2 x i8> %b) {
+  ; CHECK: mul_v2_i8
+  ; CHECK: cost of 1 {{.*}} mul
+
+  %1 = mul <2 x i8> %a, %b
+  ret <2 x i8> %1
+}
+define <2 x i16> @mul_v2_i16(<2 x i16> %a, <2 x i16> %b) {
+  ; CHECK: mul_v2_i16
+  ; CHECK: cost of 1 {{.*}} mul
+
+  %1 = mul <2 x i16> %a, %b
+  ret <2 x i16> %1
+}
+define <2 x i32> @mul_v2_i32(<2 x i32> %a, <2 x i32> %b) {
+  ; CHECK: mul_v2_i32
+  ; CHECK: cost of 1 {{.*}} mul
+
+  %1 = mul <2 x i32> %a, %b
+  ret <2 x i32> %1
+}
+define <2 x i64> @mul_v2_i64(<2 x i64> %a, <2 x i64> %b) {
+  ; CHECK: mul_v2_i64
+  ; CHECK: cost of 11 {{.*}} mul
+
+  %1 = mul <2 x i64> %a, %b
+  ret <2 x i64> %1
+}
+define <4 x i8> @mul_v4_i8(<4 x i8> %a, <4 x i8> %b) {
+  ; CHECK: mul_v4_i8
+  ; CHECK: cost of 1 {{.*}} mul
+
+  %1 = mul <4 x i8> %a, %b
+  ret <4 x i8> %1
+}
+define <4 x i16> @mul_v4_i16(<4 x i16> %a, <4 x i16> %b) {
+  ; CHECK: mul_v4_i16
+  ; CHECK: cost of 1 {{.*}} mul
+
+  %1 = mul <4 x i16> %a, %b
+  ret <4 x i16> %1
+}
+define <4 x i32> @mul_v4_i32(<4 x i32> %a, <4 x i32> %b) {
+  ; CHECK: mul_v4_i32
+  ; CHECK: cost of 2 {{.*}} mul
+
+  %1 = mul <4 x i32> %a, %b
+  ret <4 x i32> %1
+}
+define <4 x i64> @mul_v4_i64(<4 x i64> %a, <4 x i64> %b) {
+  ; CHECK: mul_v4_i64
+  ; CHECK: cost of 22 {{.*}} mul
+
+  %1 = mul <4 x i64> %a, %b
+  ret <4 x i64> %1
+}
+define <8 x i8> @mul_v8_i8(<8 x i8> %a, <8 x i8> %b) {
+  ; CHECK: mul_v8_i8
+  ; CHECK: cost of 1 {{.*}} mul
+
+  %1 = mul <8 x i8> %a, %b
+  ret <8 x i8> %1
+}
+define <8 x i16> @mul_v8_i16(<8 x i16> %a, <8 x i16> %b) {
+  ; CHECK: mul_v8_i16
+  ; CHECK: cost of 2 {{.*}} mul
+
+  %1 = mul <8 x i16> %a, %b
+  ret <8 x i16> %1
+}
+define <8 x i32> @mul_v8_i32(<8 x i32> %a, <8 x i32> %b) {
+  ; CHECK: mul_v8_i32
+  ; CHECK: cost of 4 {{.*}} mul
+
+  %1 = mul <8 x i32> %a, %b
+  ret <8 x i32> %1
+}
+define <8 x i64> @mul_v8_i64(<8 x i64> %a, <8 x i64> %b) {
+  ; CHECK: mul_v8_i64
+  ; CHECK: cost of 44 {{.*}} mul
+
+  %1 = mul <8 x i64> %a, %b
+  ret <8 x i64> %1
+}
+define <16 x i8> @mul_v16_i8(<16 x i8> %a, <16 x i8> %b) {
+  ; CHECK: mul_v16_i8
+  ; CHECK: cost of 1 {{.*}} mul
+
+  %1 = mul <16 x i8> %a, %b
+  ret <16 x i8> %1
+}
+define <16 x i16> @mul_v16_i16(<16 x i16> %a, <16 x i16> %b) {
+  ; CHECK: mul_v16_i16
+  ; CHECK: cost of 4 {{.*}} mul
+
+  %1 = mul <16 x i16> %a, %b
+  ret <16 x i16> %1
+}
+define <16 x i32> @mul_v16_i32(<16 x i32> %a, <16 x i32> %b) {
+  ; CHECK: mul_v16_i32
+  ; CHECK: cost of 8 {{.*}} mul
+
+  %1 = mul <16 x i32> %a, %b
+  ret <16 x i32> %1
+}
+define <16 x i64> @mul_v16_i64(<16 x i64> %a, <16 x i64> %b) {
+  ; CHECK: mul_v16_i64
+  ; CHECK: cost of 88 {{.*}} mul
+
+  %1 = mul <16 x i64> %a, %b
+  ret <16 x i64> %1
+}
Index: test/CodeGen/ARM/vmul.ll
===================================================================
--- test/CodeGen/ARM/vmul.ll
+++ test/CodeGen/ARM/vmul.ll
@@ -36,6 +36,92 @@
   ret <2 x float> %tmp3
 }
 
+
+define <2 x i64> @vmuli64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+; Not the best code but the cleanest.
+;CHECK-LABEL: vmuli64:
+;CHECK: vld1.64 {d16, d17}, [r1]
+;CHECK: vld1.64 {d18, d19}, [r0]
+;CHECK: vshrn.i64 d20, q8, #32
+;CHECK: vmovn.i64 d16, q8
+;CHECK: vmovn.i64 d21, q9
+;CHECK: vmull.u32 q11, d21, d20
+;CHECK: vshrn.i64 d17, q9, #32
+;CHECK: vmlal.u32 q11, d17, d16
+;CHECK: vshl.i64 q9, q11, #32
+;CHECK: vmlal.u32 q9, d21, d16
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %tmp3 = mul <2 x i64> %tmp1, %tmp2
+  ret <2 x i64> %tmp3
+}
+
+
+define <2 x i64> @vmuli64_lo_lo(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+; The equivalent of pmuludq for ARM. Codegen needs improvement.
+; TODO: The mask is not required.
+; The important thing for now is a single multiply.
+;CHECK-LABEL: vmuli64_lo_lo:
+;CHECK: vmov.i64 q8, #0xffffffff
+;CHECK: vld1.64 {d18, d19}, [r1]
+;CHECK: vand q9, q9, q8
+;CHECK: vld1.64 {d20, d21}, [r0]
+;CHECK: vand q8, q10, q8
+;CHECK: vmovn.i64 d18, q9
+;CHECK: vmovn.i64 d16, q8
+;CHECK: vmull.u32 q8, d16, d18
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %tmp1mask = and <2 x i64> %tmp1, <i64 4294967295, i64 4294967295>
+  %tmp2mask = and <2 x i64> %tmp2, <i64 4294967295, i64 4294967295>
+  %ret = mul nuw <2 x i64> %tmp1mask, %tmp2mask
+  ret <2 x i64> %ret
+}
+
+
+define <2 x i64> @vmuli64_hi_all(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+; TODO: The mask is not required.
+; The important thing is a single multiply.
+;CHECK-LABEL: vmuli64_hi_all:
+;CHECK: vmov.i64 q8, #0xffffffff00000000
+;CHECK: vld1.64 {d18, d19}, [r0]
+;CHECK: vand q8, q9, q8
+;CHECK: vld1.64 {d20, d21}, [r1]
+;CHECK: vmovn.i64 d18, q10
+;CHECK: vshrn.i64 d16, q8, #32
+;CHECK: vmull.u32 q8, d16, d18
+;CHECK: vshl.i64 q8, q8, #32
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %tmp1mask = and <2 x i64> %tmp1, <i64 18446744069414584320, i64 18446744069414584320>
+  %ret = mul nuw <2 x i64> %tmp1mask, %tmp2
+  ret <2 x i64> %ret
+}
+
+
+define <2 x i64> @vmuli64_lo_all(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+; TODO: The mask is not required.
+; The important thing for now is two multiplies.
+;CHECK-LABEL: vmuli64_lo_all:
+;CHECK: vmov.i64 q9, #0xffffffff
+;CHECK: vld1.64 {d20, d21}, [r0]
+;CHECK: vld1.64 {d16, d17}, [r1]
+;CHECK: vand q9, q10, q9
+;CHECK: vshrn.i64 d20, q8, #32
+;CHECK: vmovn.i64 d18, q9
+;CHECK: vmovn.i64 d16, q8
+;CHECK: vmull.u32 q10, d18, d20
+;CHECK: vshl.i64 q10, q10, #32
+;CHECK: vmlal.u32 q10, d18, d16
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %tmp1mask = and <2 x i64> %tmp1, <i64 4294967295, i64 4294967295>
+  %ret = mul nuw <2 x i64> %tmp1mask, %tmp2
+  ret <2 x i64> %ret
+}
+
+
 define <8 x i8> @vmulp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: vmulp8:
 ;CHECK: vmul.p8
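
Note (not part of the patch): the instruction sequence the new lowering targets can also be written directly with NEON intrinsics, which may help when reviewing the comment block in LowerMUL. The sketch below is illustrative only; the helper name mul_v2i64 is made up for this example, and a compiler is free to schedule or register-allocate the sequence differently.

    #include <arm_neon.h>

    // Hypothetical helper mirroring the vshrn/vmovn/vmull/vmlal/vshl/vmlal
    // lowering: a full 64x64->64 multiply per lane, i.e.
    //   ret = topLo*botLo + ((topHi*botLo + topLo*botHi) << 32)  (mod 2^64)
    static inline uint64x2_t mul_v2i64(uint64x2_t top, uint64x2_t bot) {
      uint32x2_t topHi = vshrn_n_u64(top, 32);   // top >> 32
      uint32x2_t topLo = vmovn_u64(top);         // top & 0xFFFFFFFF
      uint32x2_t botHi = vshrn_n_u64(bot, 32);   // bot >> 32
      uint32x2_t botLo = vmovn_u64(bot);         // bot & 0xFFFFFFFF

      uint64x2_t ret = vmull_u32(topHi, botLo);  // topHi * botLo
      ret = vmlal_u32(ret, topLo, botHi);        // += topLo * botHi
      ret = vshlq_n_u64(ret, 32);                // <<= 32 (cross terms to high half)
      return vmlal_u32(ret, topLo, botLo);       // += topLo * botLo
    }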