Index: lib/Target/ARM/ARMISelDAGToDAG.cpp
===================================================================
--- lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -252,6 +252,8 @@
 
   SDNode *SelectConcatVector(SDNode *N);
 
+  SDNode *SelectSMLAWSMULW(SDNode *N);
+
   /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
   /// inline asm expressions.
   bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
@@ -2467,6 +2469,192 @@
   return nullptr;
 }
 
+/// Match \p SignExt as the 16-bit operand of an SMULW[B|T] or SMLAW[B|T].
+///
+/// Two forms are recognised:
+///  - a value sign extended from at most 16 bits, which selects the 'B'
+///    (bottom half) variant, and
+///  - (sra X, 16), which selects the 'T' (top half) variant on X.
+///
+/// \param SignExt the SMUL_LOHI operand to inspect.
+/// \param Opc receives the ARM opcode to select.
+/// \param Src1 receives the value to use as the 16-bit operand.
+/// \param Accumulate choose the SMLAW* opcodes rather than the SMULW* ones.
+/// \returns true on a match; \p Opc and \p Src1 are only written on success.
+static bool SearchSignedMulShort(SDValue SignExt, unsigned *Opc, SDValue &Src1,
+                                 bool Accumulate) {
+  const unsigned BottomOpc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
+  const unsigned TopOpc = Accumulate ? ARM::SMLAWT : ARM::SMULWT;
+
+  if (SignExt.getValueType() == MVT::i32) {
+    switch (SignExt.getOpcode()) {
+    default:
+      break;
+    case ISD::SIGN_EXTEND:
+      *Opc = BottomOpc;
+      Src1 = SignExt.getOperand(0);
+      return true;
+    case ISD::SIGN_EXTEND_INREG:
+    case ISD::AssertSext: {
+      // Operand 1 names the type the value is extended from. The 'B' forms
+      // sign extend bits [15:0] themselves, so they only compute the same
+      // value when that type is 16 bits wide or narrower.
+      EVT FromVT = cast<VTSDNode>(SignExt.getOperand(1))->getVT();
+      if (!FromVT.bitsLE(MVT::i16))
+        break;
+
+      // Only look through the extension when that leaves bits [15:0] intact:
+      // AssertSext does not change its operand and an extension from i16 only
+      // rewrites the top half. A narrower SIGN_EXTEND_INREG rewrites bits
+      // below 16, so its own result has to be used instead.
+      const bool LookThrough =
+          SignExt.getOpcode() == ISD::AssertSext || FromVT == MVT::i16;
+      *Opc = BottomOpc;
+      Src1 = LookThrough ? SignExt.getOperand(0) : SignExt;
+      return true;
+    }
+    }
+  }
+
+  if (SignExt.getOpcode() != ISD::SRA)
+    return false;
+
+  ConstantSDNode *SRASrc1 = dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
+  if (!SRASrc1 || SRASrc1->getZExtValue() != 16)
+    return false;
+
+  SDValue Op0 = SignExt.getOperand(0);
+
+  // The sign extend operand for SM*WB could be generated by a shl and ashr.
+  if (Op0.getOpcode() == ISD::SHL) {
+    SDValue SHL = Op0;
+    ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+    if (!SHLSrc1 || SHLSrc1->getZExtValue() != 16)
+      return false;
+
+    *Opc = BottomOpc;
+    Src1 = SHL.getOperand(0);
+    return true;
+  }
+
+  // Otherwise (sra X, 16) is the top half of X, which SM*WT reads directly.
+  *Opc = TopOpc;
+  Src1 = Op0;
+  return true;
+}
+
+/// Match \p OR as bits [47:16] of a signed 32-bit by 16-bit multiply.
+///
+/// The pattern is (or (srl Lo, 16), (shl Hi, 16)), with the operands of the
+/// OR in either order, where Lo and Hi are results 0 and 1 of one SMUL_LOHI
+/// node and one SMUL_LOHI operand is accepted by SearchSignedMulShort.
+///
+/// \param OR the candidate node.
+/// \param Opc receives the ARM opcode to select.
+/// \param Src0 receives the 32-bit multiplicand.
+/// \param Src1 receives the value holding the 16-bit multiplicand.
+/// \param Accumulate choose the SMLAW* opcodes rather than the SMULW* ones.
+/// \returns true on a match.
+static bool SearchSignedMulLong(SDValue OR, unsigned *Opc, SDValue &Src0,
+                                SDValue &Src1, bool Accumulate) {
+  // First we look for:
+  // (or (srl ?, 16), (shl ?, 16))
+  if (OR.getOpcode() != ISD::OR)
+    return false;
+
+  SDValue SRL = OR.getOperand(0);
+  SDValue SHL = OR.getOperand(1);
+
+  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
+    SRL = OR.getOperand(1);
+    SHL = OR.getOperand(0);
+    if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL)
+      return false;
+  }
+
+  ConstantSDNode *SRLSrc1 = dyn_cast<ConstantSDNode>(SRL.getOperand(1));
+  ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+  if (!SRLSrc1 || !SHLSrc1 || SRLSrc1->getZExtValue() != 16 ||
+      SHLSrc1->getZExtValue() != 16)
+    return false;
+
+  // The first operands to the shifts need to be the two results from the
+  // same smul_lohi node.
+  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
+       SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
+    return false;
+
+  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
+  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
+      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
+    return false;
+
+  // Now we have:
+  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
+  // For SM*W[B|T] smul_lohi takes one 32-bit and one 16-bit argument.
+  // For SM*WB the 16-bit value will be sign extended somehow.
+  // For SM*WT only the SRA is required.
+
+  // Check both sides of SMUL_LOHI
+  if (SearchSignedMulShort(SMULLOHI->getOperand(0), Opc, Src1, Accumulate)) {
+    Src0 = SMULLOHI->getOperand(1);
+  } else if (SearchSignedMulShort(SMULLOHI->getOperand(1), Opc, Src1,
+                                  Accumulate)) {
+    Src0 = SMULLOHI->getOperand(0);
+  } else {
+    return false;
+  }
+  return true;
+}
+
+/// Try to select \p N, an ADD or OR node, as SMLAW[B|T] or SMULW[B|T].
+///
+/// An ADD with an operand matched by SearchSignedMulLong becomes the
+/// accumulating form, with the other ADD operand as the accumulator; an OR
+/// that matches on its own becomes the plain multiply.
+///
+/// NOTE(review): only the ARM-mode opcodes are emitted and no subtarget
+/// feature is tested here - confirm whether a feature guard and the Thumb2
+/// (t2*) opcodes are needed before this runs for Thumb functions.
+///
+/// \returns the selected node, or nullptr to fall back to normal selection.
+SDNode *ARMDAGToDAGISel::SelectSMLAWSMULW(SDNode *N) {
+  SDLoc dl(N);
+  SDValue Src0 = N->getOperand(0);
+  SDValue Src1 = N->getOperand(1);
+  SDValue A, B;
+  unsigned Opc = 0;
+
+  if (N->getOpcode() == ISD::ADD) {
+    if (Src0.getOpcode() != ISD::OR && Src1.getOpcode() != ISD::OR)
+      return nullptr;
+
+    SDValue Acc;
+    if (SearchSignedMulLong(Src0, &Opc, A, B, true)) {
+      Acc = Src1;
+    } else if (SearchSignedMulLong(Src1, &Opc, A, B, true)) {
+      Acc = Src0;
+    } else {
+      return nullptr;
+    }
+    if (Opc == 0)
+      return nullptr;
+
+    SDValue Ops[] = { A, B, Acc, getAL(CurDAG, dl),
+                      CurDAG->getRegister(0, MVT::i32) };
+    return CurDAG->SelectNodeTo(N, Opc, MVT::i32, MVT::Other, Ops);
+  } else if (N->getOpcode() == ISD::OR &&
+             SearchSignedMulLong(SDValue(N, 0), &Opc, A, B, false)) {
+    if (Opc == 0)
+      return nullptr;
+
+    SDValue Ops[] = { A, B, getAL(CurDAG, dl),
+                      CurDAG->getRegister(0, MVT::i32)};
+    return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+  }
+  return nullptr;
+}
+
 SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
   // The only time a CONCAT_VECTORS operation can have legal types is when
   // two 64-bit vectors are concatenated to a 128-bit vector.
@@ -2486,6 +2617,13 @@
 
   switch (N->getOpcode()) {
   default: break;
+  case ISD::ADD:
+  case ISD::OR: {
+    SDNode *ResNode = SelectSMLAWSMULW(N);
+    if (ResNode)
+      return ResNode;
+    break;
+  }
   case ISD::WRITE_REGISTER: {
     SDNode *ResNode = SelectWriteRegister(N);
     if (ResNode)
Index: test/CodeGen/ARM/smul.ll
===================================================================
--- test/CodeGen/ARM/smul.ll
+++ test/CodeGen/ARM/smul.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -mtriple=arm-eabi -mcpu=generic %s -o /dev/null
 ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb--none-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
 
 @x = weak global i16 0          ; <i16*> [#uses=1]
 @y = weak global i16 0          ; <i16*> [#uses=0]
@@ -34,3 +35,113 @@
         ret i32 %tmp5
 }
 
+; smla<x><y>: 16 x 16 multiply accumulate. (ashr X, 16) selects a top half
+; and a sext from i16 selects a bottom half.
+define i32 @f4(i32 %a, i32 %x, i32 %y) {
+; CHECK-LABEL: f4:
+; CHECK: smlatt
+  %tmp1 = ashr i32 %x, 16
+  %tmp3 = ashr i32 %y, 16
+  %tmp4 = mul i32 %tmp3, %tmp1
+  %tmp5 = add i32 %tmp4, %a
+  ret i32 %tmp5
+}
+
+define i32 @f5(i32 %a, i16 %x, i16 %y) {
+; CHECK-LABEL: f5:
+; CHECK: smlabb
+  %tmp1 = sext i16 %x to i32
+  %tmp3 = sext i16 %y to i32
+  %tmp4 = mul i32 %tmp3, %tmp1
+  %tmp5 = add i32 %tmp4, %a
+  ret i32 %tmp5
+}
+
+define i32 @f6(i32 %a, i16 %x, i32 %y) {
+; CHECK-LABEL: f6:
+; CHECK: smlabt
+  %tmp1 = sext i16 %x to i32
+  %tmp3 = ashr i32 %y, 16
+  %tmp4 = mul i32 %tmp3, %tmp1
+  %tmp5 = add i32 %tmp4, %a
+  ret i32 %tmp5
+}
+
+; smlaw<y>: the 64-bit product of a 32-bit value and a sign-extended halfword
+; is shifted right by 16 and truncated, i.e. bits [47:16] of the product are
+; added to %c.
+define i32 @f7(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: f7:
+; CHECK: smlawb
+  %shl = shl i32 %b, 16
+  %shr = ashr exact i32 %shl, 16
+  %conv = sext i32 %a to i64
+  %conv2 = sext i32 %shr to i64
+  %mul = mul nsw i64 %conv2, %conv
+  %shr49 = lshr i64 %mul, 16
+  %conv5 = trunc i64 %shr49 to i32
+  %add = add nsw i32 %conv5, %c
+  ret i32 %add
+}
+
+define i32 @f8(i32 %a, i16 signext %b, i32 %c) {
+; CHECK-LABEL: f8:
+; CHECK: smlawb
+  %conv = sext i32 %a to i64
+  %conv1 = sext i16 %b to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %shr5 = lshr i64 %mul, 16
+  %conv2 = trunc i64 %shr5 to i32
+  %add = add nsw i32 %conv2, %c
+  ret i32 %add
+}
+
+define i32 @f9(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: f9:
+; CHECK: smlawt
+  %conv = sext i32 %a to i64
+  %shr = ashr i32 %b, 16
+  %conv1 = sext i32 %shr to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %shr26 = lshr i64 %mul, 16
+  %conv3 = trunc i64 %shr26 to i32
+  %add = add nsw i32 %conv3, %c
+  ret i32 %add
+}
+
+; smulw<y>: as above, without the accumulate.
+define i32 @f10(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: f10:
+; CHECK: smulwb
+  %shl = shl i32 %b, 16
+  %shr = ashr exact i32 %shl, 16
+  %conv = sext i32 %a to i64
+  %conv2 = sext i32 %shr to i64
+  %mul = mul nsw i64 %conv2, %conv
+  %shr37 = lshr i64 %mul, 16
+  %conv4 = trunc i64 %shr37 to i32
+  ret i32 %conv4
+}
+
+define i32 @f11(i32 %a, i16 signext %b, i32 %c) {
+; CHECK-LABEL: f11:
+; CHECK: smulwb
+  %conv = sext i32 %a to i64
+  %conv1 = sext i16 %b to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %shr4 = lshr i64 %mul, 16
+  %conv2 = trunc i64 %shr4 to i32
+  ret i32 %conv2
+}
+
+define i32 @f12(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: f12:
+; CHECK: smulwt
+  %conv = sext i32 %a to i64
+  %shr = ashr i32 %b, 16
+  %conv1 = sext i32 %shr to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %shr25 = lshr i64 %mul, 16
+  %conv3 = trunc i64 %shr25 to i32
+  ret i32 %conv3
+}
Index: test/CodeGen/ARM/smulw.ll
===================================================================
--- test/CodeGen/ARM/smulw.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: llc -mtriple=arm--none-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
-; RUN: llc -mtriple=thumb--none-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
-
-; We cannot codegen the smulw[bt] or smlaw[bt] instructions for these functions,
-; as the top 16 bits of the result would differ
-
-define i32 @f1(i32 %a, i16 %b) {
-; CHECK-LABEL: f1:
-; CHECK: mul
-; CHECK: asr
-  %tmp1 = sext i16 %b to i32
-  %tmp2 = mul i32 %a, %tmp1
-  %tmp3 = ashr i32 %tmp2, 16
-  ret i32 %tmp3
-}
-
-define i32 @f2(i32 %a, i16 %b, i32 %c) {
-; CHECK-LABEL: f2:
-; CHECK: mul
-; CHECK: add{{.*}}, asr #16
-  %tmp1 = sext i16 %b to i32
-  %tmp2 = mul i32 %a, %tmp1
-  %tmp3 = ashr i32 %tmp2, 16
-  %tmp4 = add i32 %tmp3, %c
-  ret i32 %tmp4
-}