Index: lib/Target/ARM/ARMISelDAGToDAG.cpp =================================================================== --- lib/Target/ARM/ARMISelDAGToDAG.cpp +++ lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -247,8 +247,6 @@ void SelectConcatVector(SDNode *N); void SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI); - bool trySMLAWSMULW(SDNode *N); - void SelectCMP_SWAP(SDNode *N); /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for @@ -2559,141 +2557,6 @@ return false; } -static bool SearchSignedMulShort(SDValue SignExt, unsigned *Opc, SDValue &Src1, - bool Accumulate) { - // For SM*WB, we need to some form of sext. - // For SM*WT, we need to search for (sra X, 16) - // Src1 then gets set to X. - if ((SignExt.getOpcode() == ISD::SIGN_EXTEND || - SignExt.getOpcode() == ISD::SIGN_EXTEND_INREG || - SignExt.getOpcode() == ISD::AssertSext) && - SignExt.getValueType() == MVT::i32) { - - *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB; - Src1 = SignExt.getOperand(0); - return true; - } - - if (SignExt.getOpcode() != ISD::SRA) - return false; - - ConstantSDNode *SRASrc1 = dyn_cast<ConstantSDNode>(SignExt.getOperand(1)); - if (!SRASrc1 || SRASrc1->getZExtValue() != 16) - return false; - - SDValue Op0 = SignExt.getOperand(0); - - // The sign extend operand for SM*WB could be generated by a shl and ashr. - if (Op0.getOpcode() == ISD::SHL) { - SDValue SHL = Op0; - ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1)); - if (!SHLSrc1 || SHLSrc1->getZExtValue() != 16) - return false; - - *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB; - Src1 = Op0.getOperand(0); - return true; - } - *Opc = Accumulate ? 
ARM::SMLAWT : ARM::SMULWT; - Src1 = SignExt.getOperand(0); - return true; -} - -static bool SearchSignedMulLong(SDValue OR, unsigned *Opc, SDValue &Src0, - SDValue &Src1, bool Accumulate) { - // First we look for: - // (add (or (srl ?, 16), (shl ?, 16))) - if (OR.getOpcode() != ISD::OR) - return false; - - SDValue SRL = OR.getOperand(0); - SDValue SHL = OR.getOperand(1); - - if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) { - SRL = OR.getOperand(1); - SHL = OR.getOperand(0); - if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) - return false; - } - - ConstantSDNode *SRLSrc1 = dyn_cast<ConstantSDNode>(SRL.getOperand(1)); - ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1)); - if (!SRLSrc1 || !SHLSrc1 || SRLSrc1->getZExtValue() != 16 || - SHLSrc1->getZExtValue() != 16) - return false; - - // The first operands to the shifts need to be the two results from the - // same smul_lohi node. - if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) || - SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI) - return false; - - SDNode *SMULLOHI = SRL.getOperand(0).getNode(); - if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) || - SHL.getOperand(0) != SDValue(SMULLOHI, 1)) - return false; - - // Now we have: - // (add (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))) - // For SMLAW[B|T] smul_lohi will take a 32-bit and a 16-bit arguments. - // For SMLAWB the 16-bit value will signed extended somehow. - // For SMLAWT only the SRA is required. 
- - // Check both sides of SMUL_LOHI - if (SearchSignedMulShort(SMULLOHI->getOperand(0), Opc, Src1, Accumulate)) { - Src0 = SMULLOHI->getOperand(1); - } else if (SearchSignedMulShort(SMULLOHI->getOperand(1), Opc, Src1, - Accumulate)) { - Src0 = SMULLOHI->getOperand(0); - } else { - return false; - } - return true; -} - -bool ARMDAGToDAGISel::trySMLAWSMULW(SDNode *N) { - if (!Subtarget->hasV6Ops() || - (Subtarget->isThumb() && !Subtarget->hasThumb2())) - return false; - - SDLoc dl(N); - SDValue Src0 = N->getOperand(0); - SDValue Src1 = N->getOperand(1); - SDValue A, B; - unsigned Opc = 0; - - if (N->getOpcode() == ISD::ADD) { - if (Src0.getOpcode() != ISD::OR && Src1.getOpcode() != ISD::OR) - return false; - - SDValue Acc; - if (SearchSignedMulLong(Src0, &Opc, A, B, true)) { - Acc = Src1; - } else if (SearchSignedMulLong(Src1, &Opc, A, B, true)) { - Acc = Src0; - } else { - return false; - } - if (Opc == 0) - return false; - - SDValue Ops[] = { A, B, Acc, getAL(CurDAG, dl), - CurDAG->getRegister(0, MVT::i32) }; - CurDAG->SelectNodeTo(N, Opc, MVT::i32, MVT::Other, Ops); - return true; - } else if (N->getOpcode() == ISD::OR && - SearchSignedMulLong(SDValue(N, 0), &Opc, A, B, false)) { - if (Opc == 0) - return false; - - SDValue Ops[] = { A, B, getAL(CurDAG, dl), - CurDAG->getRegister(0, MVT::i32)}; - CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); - return true; - } - return false; -} - /// We've got special pseudo-instructions for these void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) { unsigned Opcode; @@ -2822,11 +2685,6 @@ switch (N->getOpcode()) { default: break; - case ISD::ADD: - case ISD::OR: - if (trySMLAWSMULW(N)) - return; - break; case ISD::WRITE_REGISTER: if (tryWriteRegister(N)) return; @@ -3147,20 +3005,20 @@ } case ARMISD::SMLAL:{ if (Subtarget->isThumb()) { - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), getAL(CurDAG, dl), - CurDAG->getRegister(0, MVT::i32)}; - ReplaceNode( - N, 
CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops)); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3), getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32) }; + ReplaceNode(N, CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, + MVT::i32, Ops)); return; - }else{ + } else { + unsigned Opcode = Subtarget->hasV6Ops() ? ARM::SMLAL : ARM::SMLALv5; SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; - ReplaceNode(N, CurDAG->getMachineNode( - Subtarget->hasV6Ops() ? ARM::SMLAL : ARM::SMLALv5, dl, - MVT::i32, MVT::i32, Ops)); + ReplaceNode(N, CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32, + Ops)); return; } } Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -175,9 +175,15 @@ VMULLs, // ...signed VMULLu, // ...unsigned + SMULWB, // Signed multiply word by half word, bottom + SMULWT, // Signed multiply word by half word, top UMLAL, // 64bit Unsigned Accumulate Multiply SMLAL, // 64bit Signed Accumulate Multiply UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply + SMLALBB, // 64-bit signed accumulate multiply bottom, bottom 16 + SMLALBT, // 64-bit signed accumulate multiply bottom, top 16 + SMLALTB, // 64-bit signed accumulate multiply top, bottom 16 + SMLALTT, // 64-bit signed accumulate multiply top, top 16 // Operands of the standard BUILD_VECTOR node are not legalized, which // is fine if BUILD_VECTORs are always lowered to shuffles or other Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -1344,6 +1344,12 @@ case ARMISD::UMAAL: return "ARMISD::UMAAL"; case ARMISD::UMLAL: return "ARMISD::UMLAL"; case 
ARMISD::SMLAL: return "ARMISD::SMLAL"; + case ARMISD::SMLALBB: return "ARMISD::SMLALBB"; + case ARMISD::SMLALBT: return "ARMISD::SMLALBT"; + case ARMISD::SMLALTB: return "ARMISD::SMLALTB"; + case ARMISD::SMLALTT: return "ARMISD::SMLALTT"; + case ARMISD::SMULWB: return "ARMISD::SMULWB"; + case ARMISD::SMULWT: return "ARMISD::SMULWT"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; @@ -1453,6 +1459,51 @@ // Lowering Code //===----------------------------------------------------------------------===// +static bool isSRL16(const SDValue &Op) { + if (Op.getOpcode() != ISD::SRL) + return false; + if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) + return Const->getZExtValue() == 16; + return false; +} + +static bool isSRA16(const SDValue &Op) { + if (Op.getOpcode() != ISD::SRA) + return false; + if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) + return Const->getZExtValue() == 16; + return false; +} + +static bool isSHL16(const SDValue &Op) { + if (Op.getOpcode() != ISD::SHL) + return false; + if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) + return Const->getZExtValue() == 16; + return false; +} + +static bool isS16(const SDValue &Op, SelectionDAG &DAG) { + if (isSRA16(Op)) + return isSHL16(Op.getOperand(0)); + + return DAG.ComputeNumSignBits(Op) == 17; +} + +static SDValue getS16Val(SDNode *N, unsigned OpIdx, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (N->getOperand(OpIdx).getOpcode() == ISD::AssertSext) + return N->getOperand(OpIdx)->getOperand(0); + + if (N->getOperand(OpIdx).getOpcode() == ISD::LOAD) + return N->getOperand(OpIdx); + + ARMTargetLowering::TargetLoweringOpt TLO(DAG, true, true); + if (TLO.SimplifyDemandedBits(N, OpIdx, APInt(32, 0xFFFF), DCI)) + return N->getOperand(OpIdx); + return SDValue(); +} + /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { switch (CC) { @@ 
-9436,6 +9487,89 @@ return SDValue(); } +static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + + if (Subtarget->isThumb()) { + if (!Subtarget->hasDSP()) + return SDValue(); + } else if (!Subtarget->hasV5TEOps()) + return SDValue(); + + // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and + // accumulates the product into a 64-bit value. The 16-bit values will + // be sign extended somehow or SRA'd into 32-bit values + // (addc (adde (mul 16bit, 16bit), lo), hi) + SDValue Mul = AddcNode->getOperand(0); + SDValue Hi = AddcNode->getOperand(1); + if (Mul.getOpcode() != ISD::MUL) { + Hi = AddcNode->getOperand(0); + Mul = AddcNode->getOperand(1); + if (Mul.getOpcode() != ISD::MUL) + return SDValue(); + } + + SDValue SRA = AddeNode->getOperand(0); + SDValue Lo = AddeNode->getOperand(1); + if (SRA.getOpcode() != ISD::SRA) { + SRA = AddeNode->getOperand(1); + Lo = AddeNode->getOperand(0); + if (SRA.getOpcode() != ISD::SRA) + return SDValue(); + } + if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) { + if (Const->getZExtValue() != 31) + return SDValue(); + } else + return SDValue(); + + if (SRA.getOperand(0) != Mul) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDNode *MulNode = Mul.getNode(); + unsigned Opcode = 0; + SDValue Op0; + SDValue Op1; + + if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) { + Opcode = ARMISD::SMLALBB; + Op0 = getS16Val(MulNode, 0, DAG, DCI); + Op1 = getS16Val(MulNode, 1, DAG, DCI); + } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) { + Opcode = ARMISD::SMLALBT; + Op0 = getS16Val(MulNode, 0, DAG, DCI); + Op1 = Mul.getOperand(1).getOperand(0); + } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) { + Opcode = ARMISD::SMLALTB; + Op0 = Mul.getOperand(0).getOperand(0); + Op1 = getS16Val(MulNode, 1, DAG, DCI); + } else if (isSRA16(Mul.getOperand(0)) && 
isSRA16(Mul.getOperand(1))) { + Opcode = ARMISD::SMLALTT; + Op0 = Mul->getOperand(0).getOperand(0); + Op1 = Mul->getOperand(1).getOperand(0); + } + + if (!Op0 || !Op1) + return SDValue(); + + SDLoc dl(AddcNode); + + SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), + Op0, Op1, Lo, Hi); + // Replace the ADDs' nodes uses by the MLA node's values. + SDValue HiMLALResult(SMLAL.getNode(), 1); + SDValue LoMLALResult(SMLAL.getNode(), 0); + + DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); + DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); + + // Return original node to notify the driver to stop replacing. + SDValue resNode(AddcNode, 0); + return resNode; +} + static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { @@ -9469,13 +9603,6 @@ if (AddcNode->getValueType(1) != MVT::Glue) return SDValue(); - // Check that the ADDC adds the low result of the S/UMUL_LOHI. - if (AddcOp0->getOpcode() != ISD::UMUL_LOHI && - AddcOp0->getOpcode() != ISD::SMUL_LOHI && - AddcOp1->getOpcode() != ISD::UMUL_LOHI && - AddcOp1->getOpcode() != ISD::SMUL_LOHI) - return SDValue(); - // Look for the glued ADDE. SDNode* AddeNode = AddcNode->getGluedUser(); if (!AddeNode) @@ -9489,6 +9616,14 @@ AddeNode->getOperand(2).getValueType() == MVT::Glue && "ADDE node has the wrong inputs"); + // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it + // maybe a SMLAL which multiplies two 16-bit values. + if (AddcOp0->getOpcode() != ISD::UMUL_LOHI && + AddcOp0->getOpcode() != ISD::SMUL_LOHI && + AddcOp1->getOpcode() != ISD::UMUL_LOHI && + AddcOp1->getOpcode() != ISD::SMUL_LOHI) + return AddCombineTo64BitSMLAL16(AddcNode, AddeNode, DCI, Subtarget); + // Check for the triangle shape. SDValue AddeOp0 = AddeNode->getOperand(0); SDValue AddeOp1 = AddeNode->getOperand(1); @@ -9645,9 +9780,74 @@ return SDValue(); } +// Try combining OR nodes to SMULWB, SMULWT. 
+static SDValue PerformORCombineToSMULWBT(SDNode *OR, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + if (!Subtarget->hasV6Ops() || + (Subtarget->isThumb() && + (!Subtarget->hasThumb2() || !Subtarget->hasDSP()))) + return SDValue(); + + SDValue SRL = OR->getOperand(0); + SDValue SHL = OR->getOperand(1); + + if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) { + SRL = OR->getOperand(1); + SHL = OR->getOperand(0); + } + if (!isSRL16(SRL) || !isSHL16(SHL)) + return SDValue(); + + // The first operands to the shifts need to be the two results from the + // same smul_lohi node. + if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) || + SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI) + return SDValue(); + + SDNode *SMULLOHI = SRL.getOperand(0).getNode(); + if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) || + SHL.getOperand(0) != SDValue(SMULLOHI, 1)) + return SDValue(); + + // Now we have: + // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))) + // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments. + // For SMUWB the 16-bit value will signed extended somehow. + // For SMULWT only the SRA is required. 
+ // Check both sides of SMUL_LOHI + unsigned OpIdx = 0; + SDValue OpS16 = SMULLOHI->getOperand(0); + SDValue OpS32 = SMULLOHI->getOperand(1); + + SelectionDAG &DAG = DCI.DAG; + if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) { + OpIdx = 1; + OpS16 = OpS32; + OpS32 = SMULLOHI->getOperand(0); + } + + SDLoc dl(OR); + + if (isS16(OpS16, DAG)) { + if (SDValue Bottom = getS16Val(SMULLOHI, OpIdx, DAG, DCI)) { + SDValue Res = DAG.getNode(ARMISD::SMULWB, dl, MVT::i32, OpS32, Bottom); + DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res); + return SDValue(OR, 0); + } + } else if (isSRA16(OpS16)) { + SDValue Res = DAG.getNode(ARMISD::SMULWT, dl, MVT::i32, OpS32, + OpS16.getOperand(0)); + DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res); + return SDValue(OR, 0); + } + return SDValue(); +} + /// PerformADDCCombine - Target-specific dag combine transform from /// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL or /// ISD::ADDC, ISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL +/// ISD::ADDC, ISD::ADDE and ISD::MUL to SMLAL[B|T] static SDValue PerformADDCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { @@ -9923,6 +10123,8 @@ // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) return Result; + if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget)) + return Result; } // The code below optimizes (or (and X, Y), Z). Index: lib/Target/ARM/ARMInstrInfo.td =================================================================== --- lib/Target/ARM/ARMInstrInfo.td +++ lib/Target/ARM/ARMInstrInfo.td @@ -92,6 +92,13 @@ SDTCisVT<1, i32>, SDTCisVT<4, i32>]>; +def SDT_LongMac : SDTypeProfile<2, 4, [SDTCisVT<0, i32>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisSameAs<0, 4>, + SDTCisSameAs<0, 5>]>; + // Node definitions. 
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>; @@ -183,6 +190,13 @@ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad]>; +def ARMsmulwb : SDNode<"ARMISD::SMULWB", SDTIntBinOp, []>; +def ARMsmulwt : SDNode<"ARMISD::SMULWT", SDTIntBinOp, []>; +def ARMsmlalbb : SDNode<"ARMISD::SMLALBB", SDT_LongMac, []>; +def ARMsmlalbt : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>; +def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>; +def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>; + //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. // @@ -4100,13 +4114,13 @@ def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm", - []>, + [(set GPR:$Rd, (ARMsmulwb GPR:$Rn, GPR:$Rm))]>, Requires<[IsARM, HasV5TE]>, Sched<[WriteMUL16, ReadMUL, ReadMUL]>; def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm", - []>, + [(set GPR:$Rd, (ARMsmulwt GPR:$Rn, GPR:$Rm))]>, Requires<[IsARM, HasV5TE]>, Sched<[WriteMUL16, ReadMUL, ReadMUL]>; } @@ -4151,16 +4165,18 @@ Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>; def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd), - (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra), IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra", - []>, + [(set GPRnopc:$Rd, + (add GPRnopc:$Ra, (ARMsmulwb GPRnopc:$Rn, GPRnopc:$Rm)))]>, Requires<[IsARM, HasV5TE, UseMulOps]>, Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>; def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd), - (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra), IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra", - []>, + [(set GPRnopc:$Rd, + (add GPRnopc:$Ra, (ARMsmulwt GPRnopc:$Rn, GPRnopc:$Rm)))]>, 
Requires<[IsARM, HasV5TE, UseMulOps]>, Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>; } @@ -4170,29 +4186,28 @@ defm SMLA : AI_smla<"smla">; // Halfword multiply accumulate long: SMLAL. -def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), - (ins GPRnopc:$Rn, GPRnopc:$Rm), - IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV5TE]>, - Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; - -def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), - (ins GPRnopc:$Rn, GPRnopc:$Rm), - IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV5TE]>, - Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; - -def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), - (ins GPRnopc:$Rn, GPRnopc:$Rm), - IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV5TE]>, - Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; - -def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), - (ins GPRnopc:$Rn, GPRnopc:$Rm), - IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV5TE]>, - Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; +class SMLAL<bits<2> opc1, string asm> + : AMulxyI64<0b0001010, opc1, + (outs GPRnopc:$RdLo, GPRnopc:$RdHi), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi), + IIC_iMAC64, asm, "\t$RdLo, $RdHi, $Rn, $Rm", []>, + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, + Requires<[IsARM, HasV5TE]>, + Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; + +def SMLALBB : SMLAL<0b00, "smlalbb">; +def SMLALBT : SMLAL<0b10, "smlalbt">; +def SMLALTB : SMLAL<0b01, "smlaltb">; +def SMLALTT : SMLAL<0b11, "smlaltt">; + +def : ARMV5TEPat<(ARMsmlalbb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (SMLALBB $Rn, $Rm, $RLo, $RHi)>; +def : ARMV5TEPat<(ARMsmlalbt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + 
(SMLALBT $Rn, $Rm, $RLo, $RHi)>; +def : ARMV5TEPat<(ARMsmlaltb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (SMLALTB $Rn, $Rm, $RLo, $RHi)>; +def : ARMV5TEPat<(ARMsmlaltt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (SMLALTT $Rn, $Rm, $RLo, $RHi)>; // Helper class for AI_smld. class AMulDualIbase; -def t2SMULWB : T2ThreeRegSMUL<0b011, 0b00, "smulwb", []>; -def t2SMULWT : T2ThreeRegSMUL<0b011, 0b01, "smulwt", []>; +def t2SMULWB : T2ThreeRegSMUL<0b011, 0b00, "smulwb", + [(set rGPR:$Rd, (ARMsmulwb rGPR:$Rn, rGPR:$Rm))]>; +def t2SMULWT : T2ThreeRegSMUL<0b011, 0b01, "smulwt", + [(set rGPR:$Rd, (ARMsmulwt rGPR:$Rn, rGPR:$Rm))]>; def : Thumb2DSPPat<(mul sext_16_node:$Rm, sext_16_node:$Rn), (t2SMULBB rGPR:$Rm, rGPR:$Rn)>; @@ -2663,8 +2665,10 @@ def t2SMLATT : T2FourRegSMLA<0b001, 0b11, "smlatt", [(set rGPR:$Rd, (add rGPR:$Ra, (mul (sra rGPR:$Rn, (i32 16)), (sra rGPR:$Rm, (i32 16)))))]>; -def t2SMLAWB : T2FourRegSMLA<0b011, 0b00, "smlawb", []>; -def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt", []>; +def t2SMLAWB : T2FourRegSMLA<0b011, 0b00, "smlawb", + [(set rGPR:$Rd, (add rGPR:$Ra, (ARMsmulwb rGPR:$Rn, rGPR:$Rm)))]>; +def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt", + [(set rGPR:$Rd, (add rGPR:$Ra, (ARMsmulwt rGPR:$Rn, rGPR:$Rm)))]>; def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul sext_16_node:$Rn, sext_16_node:$Rm)), (t2SMLABB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>; @@ -2675,18 +2679,24 @@ (mul (sra rGPR:$Rn, (i32 16)), sext_16_node:$Rm)), (t2SMLATB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>; -class T2SMLAL<bits<3> op22_20, bits<4> op7_4, string opc, list<dag> pattern> - : T2FourReg_mac<1, op22_20, op7_4, - (outs rGPR:$Ra, rGPR:$Rd), - (ins rGPR:$Rn, rGPR:$Rm), - IIC_iMAC64, opc, "\t$Ra, $Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasDSP]>; - // Halfword multiple accumulate long: SMLAL -def t2SMLALBB : T2SMLAL<0b100, 0b1000, "smlalbb", []>; -def t2SMLALBT : T2SMLAL<0b100, 0b1001, "smlalbt", []>; -def t2SMLALTB : T2SMLAL<0b100, 0b1010, "smlaltb", []>; -def t2SMLALTT : T2SMLAL<0b100, 0b1011, "smlaltt", []>; +def 
t2SMLALBB : T2MlaLong<0b100, 0b1000, "smlalbb">, + Requires<[IsThumb2, HasDSP]>; +def t2SMLALBT : T2MlaLong<0b100, 0b1001, "smlalbt">, + Requires<[IsThumb2, HasDSP]>; +def t2SMLALTB : T2MlaLong<0b100, 0b1010, "smlaltb">, + Requires<[IsThumb2, HasDSP]>; +def t2SMLALTT : T2MlaLong<0b100, 0b1011, "smlaltt">, + Requires<[IsThumb2, HasDSP]>; + +def : Thumb2DSPPat<(ARMsmlalbb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (t2SMLALBB $Rn, $Rm, $RLo, $RHi)>; +def : Thumb2DSPPat<(ARMsmlalbt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (t2SMLALBT $Rn, $Rm, $RLo, $RHi)>; +def : Thumb2DSPPat<(ARMsmlaltb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (t2SMLALTB $Rn, $Rm, $RLo, $RHi)>; +def : Thumb2DSPPat<(ARMsmlaltt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (t2SMLALTT $Rn, $Rm, $RLo, $RHi)>; class T2DualHalfMul<bits<3> op22_20, bits<4> op7_4, string opc> : T2ThreeReg_mac<0, op22_20, op7_4, Index: test/CodeGen/ARM/longMAC.ll =================================================================== --- test/CodeGen/ARM/longMAC.ll +++ test/CodeGen/ARM/longMAC.ll @@ -3,12 +3,13 @@ ; RUN: llc -mtriple=armeb-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE ; RUN: llc -mtriple=armebv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7-BE ; RUN: llc -mtriple=thumbv6-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V6-THUMB -; RUN: llc -mtriple=thumbv6t2-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V6-THUMB2 -; RUN: llc -mtriple=thumbv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7-THUMB +; RUN: llc -mtriple=thumbv6t2-eabi %s -o - | FileCheck %s -check-prefix=CHECK-T2-DSP +; RUN: llc -mtriple=thumbv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-T2-DSP ; RUN: llc -mtriple=thumbebv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7-THUMB-BE ; RUN: llc -mtriple=thumbv6m-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V6M-THUMB ; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7M-THUMB ; RUN: llc -mtriple=thumbv7em-eabi %s -o - | FileCheck %s 
-check-prefix=CHECK-V7EM-THUMB +; RUN: llc -mtriple=thumbv7em-eabi %s -o - | FileCheck %s -check-prefix=CHECK-T2-DSP +; RUN: llc -mtriple=armv5te-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V5TE ; Check generated signed and unsigned multiply accumulate long. define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) { @@ -20,12 +21,9 @@ ;CHECK-BE: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-BE: mov r0, [[RDHI]] ;CHECK-BE: mov r1, [[RDLO]] -;CHECK-V6-THUMB2: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V6-THUMB2: mov r0, [[RDLO]] -;CHECK-V6-THUMB2: mov r1, [[RDHI]] -;CHECK-V7-THUMB: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7-THUMB: mov r0, [[RDLO]] -;CHECK-V7-THUMB: mov r1, [[RDHI]] +;CHECK-T2-DSP: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] +;CHECK-T2-DSP-NEXT: mov r0, [[RDLO]] +;CHECK-T2-DSP-NEXT: mov r1, [[RDHI]] ;CHECK-V7-THUMB-BE: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]] ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]] @@ -44,12 +42,9 @@ ;CHECK-BE: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-BE: mov r0, [[RDHI]] ;CHECK-BE: mov r1, [[RDLO]] -;CHECK-V6-THUMB2: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V6-THUMB2: mov r0, [[RDLO]] -;CHECK-V6-THUMB2: mov r1, [[RDHI]] -;CHECK-V7-THUMB: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7-THUMB: mov r0, [[RDLO]] -;CHECK-V7-THUMB: mov r1, [[RDHI]] +;CHECK-T2-DSP: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] +;CHECK-T2-DSP-NEXT: mov r0, [[RDLO]] +;CHECK-T2-DSP-NEXT: mov r1, [[RDHI]] ;CHECK-V7-THUMB-BE: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]] ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]] @@ -78,8 +73,7 @@ ;CHECK-BE: 
umlal [[RDLO:r[0-9]+]], [[RDHI]], r1, r0 ;CHECK-BE: mov r0, [[RDHI]] ;CHECK-BE: mov r1, [[RDLO]] -;CHECK-V6-THUMB2: umlal -;CHECK-V7-THUMB: umlal +;CHECK-T2-DSP: umlal ;CHECK-V6-THUMB-NOT: umlal %conv = zext i32 %b to i64 %conv1 = zext i32 %a to i64 @@ -92,8 +86,7 @@ define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) { ;CHECK-LABEL: MACLongTest4: ;CHECK-V6-THUMB-NOT: smlal -;CHECK-V6-THUMB2: smlal -;CHECK-V7-THUMB: smlal +;CHECK-T2-DSP: smlal ;CHECK-LE: asr [[RDHI:r[0-9]+]], [[RDLO:r[0-9]+]], #31 ;CHECK-LE: smlal [[RDLO]], [[RDHI]], r1, r0 ;CHECK-LE: mov r0, [[RDLO]] @@ -118,10 +111,8 @@ ;CHECK: smlal r12, lr, r3, r2 ;CHECK-V7: smull [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0 ;CHECK-V7: smlal [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]] -;CHECK-V7-THUMB: smull [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0 -;CHECK-V7-THUMB: smlal [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]] -;CHECK-V6-THUMB2: smull [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0 -;CHECK-V6-THUMB2: smlal [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]] +;CHECK-T2-DSP: smull [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0 +;CHECK-T2-DSP: smlal [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]] %conv = sext i32 %a to i64 %conv1 = sext i32 %b to i64 %mul = mul nsw i64 %conv1, %conv @@ -172,18 +163,12 @@ ;CHECK-V7-BE: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-BE: mov r0, [[RDHI]] ;CHECK-V7-BE: mov r1, [[RDLO]] -;CHECK-V6-THUMB2: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V6-THUMB2: mov r0, [[RDLO]] -;CHECK-V6-THUMB2: mov r1, [[RDHI]] -;CHECK-V7-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7-THUMB: mov r0, [[RDLO]] -;CHECK-V7-THUMB: mov r1, [[RDHI]] +;CHECK-T2-DSP: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] +;CHECK-T2-DSP-NEXT: mov r0, [[RDLO]] +;CHECK-T2-DSP-NEXT: mov r1, [[RDHI]] ;CHECK-V7-THUMB-BE: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], 
[[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]] ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]] -;CHECK-V7EM-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7EM-THUMB: mov r0, [[RDLO]] -;CHECK-V7EM-THUMB: mov r1, [[RDHI]] ;CHECK-NOT:umaal ;CHECK-V6-THUMB-NOT: umaal ;CHECK-V6M-THUMB-NOT: umaal @@ -206,18 +191,12 @@ ;CHECK-V7-BE: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-BE: mov r0, [[RDHI]] ;CHECK-V7-BE: mov r1, [[RDLO]] -;CHECK-V6-THUMB2: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V6-THUMB2: mov r0, [[RDLO]] -;CHECK-V6-THUMB2: mov r1, [[RDHI]] -;CHECK-V7-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7-THUMB: mov r0, [[RDLO]] -;CHECK-V7-THUMB: mov r1, [[RDHI]] +;CHECK-T2-DSP: umaal r2, r3, r1, r0 +;CHECK-T2-DSP-NEXT: mov r0, r2 +;CHECK-T2-DSP-NEXT: mov r1, r3 ;CHECK-V7-THUMB-BE: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]] ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]] -;CHECK-V7EM-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7EM-THUMB: mov r0, [[RDLO]] -;CHECK-V7EM-THUMB: mov r1, [[RDHI]] ;CHECK-NOT:umaal ;CHECK-V6-THUMB-NOT:umaal ;CHECK-V6M-THUMB-NOT: umaal @@ -231,3 +210,137 @@ %add2 = add i64 %add, %mul ret i64 %add2 } + +define i64 @MACLongTest11(i16 %a, i16 %b, i64 %c) { +;CHECK-LABEL: MACLongTest11: +;CHECK-LE-NOT: smlalbb +;CHECK-BE-NOT: smlalbb +;CHECK-V6M-THUMB-NOT: smlalbb +;CHECK-V7M-THUMB-NOT: smlalbb +;CHECK-T2-DSP: smlalbb r3, r2, +;CHECK-T2-DSP-NEXT: mov r0, r3 +;CHECK-T2-DSP-NEXT: mov r1, r2 +;CHECK-V7-THUMB-BE: smlalbb r2, r3 +;CHECK-V7-THUMB-BE-NEXT: mov r0, r3 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r2 + %conv = sext i16 %a to i32 + %conv1 = sext i16 %b to i32 + %mul = mul nsw i32 %conv1, %conv + %conv2 = sext i32 %mul to i64 + %add = add nsw i64 %conv2, %c + ret i64 %add 
+} + +define i64 @MACLongTest12(i16 %b, i32 %t, i64 %c) { +;CHECK-LABEL: MACLongTest12: +;CHECK-LE-NOT: smlalbt +;CHECK-BE-NOT: smlalbt +;CHECK-V6M-THUMB-NOT: smlalbt +;CHECK-V7M-THUMB-NOT: smlalbt +;CHECK-T2-DSP: smlalbt r3, r2, r0, r1 +;CHECK-T2-DSP-NEXT: mov r0, r3 +;CHECK-T2-DSP-NEXT: mov r1, r2 +;CHECK-V5TE: smlalbt r3, r2, r0, r1 +;CHECK-V5TE-NEXT: mov r0, r3 +;CHECK-V5TE-NEXT: mov r1, r2 +;CHECK-V7-THUMB-BE: smlalbt r2, r3, +;CHECK-V7-THUMB-BE-NEXT: mov r0, r3 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r2 + %conv0 = sext i16 %b to i32 + %conv1 = ashr i32 %t, 16 + %mul = mul nsw i32 %conv0, %conv1 + %conv2 = sext i32 %mul to i64 + %add = add nsw i64 %conv2, %c + ret i64 %add +} + +define i64 @MACLongTest13(i32 %t, i16 %b, i64 %c) { +;CHECK-LABEL: MACLongTest13: +;CHECK-LE-NOT: smlaltb +;CHECK-BE-NOT: smlaltb +;CHECK-V6M-THUMB-NOT: smlaltb +;CHECK-V7M-THUMB-NOT: smlaltb +;CHECK-T2-DSP: smlaltb r3, r2, r0, r1 +;CHECK-T2-DSP-NEXT: mov r0, r3 +;CHECK-T2-DSP-NEXT: mov r1, r2 +;CHECK-V5TE: smlaltb r3, r2, r0, r1 +;CHECK-V5TE-NEXT: mov r0, r3 +;CHECK-V5TE-NEXT: mov r1, r2 +;CHECK-V7-THUMB-BE: smlaltb r2, r3, r0, r1 +;CHECK-V7-THUMB-BE-NEXT: mov r0, r3 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r2 + %conv0 = ashr i32 %t, 16 + %conv1= sext i16 %b to i32 + %mul = mul nsw i32 %conv0, %conv1 + %conv2 = sext i32 %mul to i64 + %add = add nsw i64 %conv2, %c + ret i64 %add +} + +define i64 @MACLongTest14(i32 %a, i32 %b, i64 %c) { +;CHECK-LABEL: MACLongTest14: +;CHECK-LE-NOT: smlaltt +;CHECK-BE-NOT: smlaltt +;CHECK-V6M-THUMB-NOT: smlaltt +;CHECK-V7M-THUMB-NOT: smlaltt +;CHECK-T2-DSP: smlaltt r3, r2, +;CHECK-T2-DSP-NEXT: mov r0, r3 +;CHECK-T2-DSP-NEXT: mov r1, r2 +;CHECK-V5TE: smlaltt r3, r2, +;CHECK-V5TE-NEXT: mov r0, r3 +;CHECK-V5TE-NEXT: mov r1, r2 +;CHECK-V7-THUMB-BE: smlaltt r2, r3, +;CHECK-V7-THUMB-BE-NEXT: mov r0, r3 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r2 + %conv0 = ashr i32 %a, 16 + %conv1 = ashr i32 %b, 16 + %mul = mul nsw i32 %conv1, %conv0 + %conv2 = sext i32 %mul to i64 + %add = add 
nsw i64 %conv2, %c + ret i64 %add +} + +@global_b = external global i16, align 2 +;CHECK-LABEL: MACLongTest15 +;CHECK-LE-NOT: smlaltb +;CHECK-BE-NOT: smlaltb +;CHECK-V6M-THUMB-NOT: smlaltb +;CHECK-V7M-THUMB-NOT: smlaltb +;CHECK-T2-DSP: smlaltb r3, r2, r0, r1 +;CHECK-T2-DSP-NEXT: mov r0, r3 +;CHECK-T2-DSP-NEXT: mov r1, r2 +;CHECK-V5TE: smlaltb r3, r2, r0, r1 +;CHECK-V5TE-NEXT: mov r0, r3 +;CHECK-V5TE-NEXT: mov r1, r2 +define i64 @MACLongTest15(i32 %t, i64 %acc) { +entry: + %0 = load i16, i16* @global_b, align 2 + %conv = sext i16 %0 to i32 + %shr = ashr i32 %t, 16 + %mul = mul nsw i32 %shr, %conv + %conv1 = sext i32 %mul to i64 + %add = add nsw i64 %conv1, %acc + ret i64 %add +} + +;CHECK-LABEL: MACLongTest16 +;CHECK-LE-NOT: smlalbt +;CHECK-BE-NOT: smlalbt +;CHECK-V6M-THUMB-NOT: smlalbt +;CHECK-V7M-THUMB-NOT: smlalbt +;CHECK-T2-DSP: smlalbt r3, r2, r1, r0 +;CHECK-T2-DSP-NEXT: mov r0, r3 +;CHECK-T2-DSP-NEXT: mov r1, r2 +;CHECK-V5TE: smlalbt r3, r2, r1, r0 +;CHECK-V5TE-NEXT: mov r0, r3 +;CHECK-V5TE-NEXT: mov r1, r2 +define i64 @MACLongTest16(i32 %t, i64 %acc) { +entry: + %0 = load i16, i16* @global_b, align 2 + %conv = sext i16 %0 to i32 + %shr = ashr i32 %t, 16 + %mul = mul nsw i32 %conv, %shr + %conv1 = sext i32 %mul to i64 + %add = add nsw i64 %conv1, %acc + ret i64 %add +}