Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.h
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.h
@@ -203,6 +203,8 @@
     SMLALDX,      // Signed multiply accumulate long dual exchange
     SMLSLD,       // Signed multiply subtract long dual
     SMLSLDX,      // Signed multiply subtract long dual exchange
+    SMMLAR,       // Signed multiply long, round and add
+    SMMLSR,       // Signed multiply long, subtract and round
 
     // Operands of the standard BUILD_VECTOR node are not legalized, which
     // is fine if BUILD_VECTORs are always lowered to shuffles or other
Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
@@ -1337,6 +1337,8 @@
   case ARMISD::SMLALDX:       return "ARMISD::SMLALDX";
   case ARMISD::SMLSLD:        return "ARMISD::SMLSLD";
   case ARMISD::SMLSLDX:       return "ARMISD::SMLSLDX";
+  case ARMISD::SMMLAR:        return "ARMISD::SMMLAR";
+  case ARMISD::SMMLSR:        return "ARMISD::SMMLSR";
   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
   case ARMISD::BFI:           return "ARMISD::BFI";
   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
@@ -9860,7 +9862,7 @@
   return resNode;
 }
 
-static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
+static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
   // Look for multiply add opportunities.
@@ -9877,49 +9879,61 @@
   //   V     V
   // ADDE <- hiAdd
   //
-  assert(AddeNode->getOpcode() == ARMISD::ADDE && "Expect an ADDE");
+  // In the special case where only the higher part of a signed result is used
+  // and the add to the low part of the result of ISD::SMUL_LOHI adds or
+  // subtracts a constant with the exact value of 0x80000000, we recognize we
+  // are dealing with a "rounded multiply and add" (or subtract) and transform
+  // it into either an ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
+
+  assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
+          AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
+         "Expect an ADDE or SUBE");
 
-  assert(AddeNode->getNumOperands() == 3 &&
-         AddeNode->getOperand(2).getValueType() == MVT::i32 &&
+  assert(AddeSubeNode->getNumOperands() == 3 &&
+         AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
          "ADDE node has the wrong inputs");
 
-  // Check that we are chained to the right ADDC node.
-  SDNode* AddcNode = AddeNode->getOperand(2).getNode();
-  if (AddcNode->getOpcode() != ARMISD::ADDC)
+  // Check that we are chained to the right ADDC or SUBC node.
+  SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
+  if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
+       AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
+      (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
+       AddcSubcNode->getOpcode() != ARMISD::SUBC))
     return SDValue();
 
-  SDValue AddcOp0 = AddcNode->getOperand(0);
-  SDValue AddcOp1 = AddcNode->getOperand(1);
+  SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
+  SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
 
   // Check if the two operands are from the same mul_lohi node.
-  if (AddcOp0.getNode() == AddcOp1.getNode())
+  if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
     return SDValue();
 
-  assert(AddcNode->getNumValues() == 2 &&
-         AddcNode->getValueType(0) == MVT::i32 &&
+  assert(AddcSubcNode->getNumValues() == 2 &&
+         AddcSubcNode->getValueType(0) == MVT::i32 &&
          "Expect ADDC with two result values. First: i32");
 
   // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
   // may be a SMLAL which multiplies two 16-bit values.
-  if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
-      AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
-      AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
-      AddcOp1->getOpcode() != ISD::SMUL_LOHI)
-    return AddCombineTo64BitSMLAL16(AddcNode, AddeNode, DCI, Subtarget);
+  if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
+      AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
+      AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
+      AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
+      AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
+    return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
 
   // Check for the triangle shape.
-  SDValue AddeOp0 = AddeNode->getOperand(0);
-  SDValue AddeOp1 = AddeNode->getOperand(1);
+  SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
+  SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
 
-  // Make sure that the ADDE operands are not coming from the same node.
-  if (AddeOp0.getNode() == AddeOp1.getNode())
+  // Make sure that the ADDE/SUBE operands are not coming from the same node.
+  if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
    return SDValue();
 
-  // Find the MUL_LOHI node walking up ADDE's operands.
+  // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
   bool IsLeftOperandMUL = false;
-  SDValue MULOp = findMUL_LOHI(AddeOp0);
+  SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
   if (MULOp == SDValue())
-    MULOp = findMUL_LOHI(AddeOp1);
+    MULOp = findMUL_LOHI(AddeSubeOp1);
   else
     IsLeftOperandMUL = true;
   if (MULOp == SDValue())
@@ -9930,63 +9944,88 @@
   unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
 
   // Figure out the high and low input values to the MLAL node.
-  SDValue* HiAdd = nullptr;
-  SDValue* LoMul = nullptr;
-  SDValue* LowAdd = nullptr;
+  SDValue *HiAddSub = nullptr;
+  SDValue *LoMul = nullptr;
+  SDValue *LowAddSub = nullptr;
 
-  // Ensure that ADDE is from high result of ISD::xMUL_LOHI.
-  if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1)))
+  // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
+  if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
     return SDValue();
 
   if (IsLeftOperandMUL)
-    HiAdd = &AddeOp1;
+    HiAddSub = &AddeSubeOp1;
   else
-    HiAdd = &AddeOp0;
+    HiAddSub = &AddeSubeOp0;
 
+  // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
+  // whose low result is fed to the ADDC/SUBC we are checking.
 
-  // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node
-  // whose low result is fed to the ADDC we are checking.
-
-  if (AddcOp0 == MULOp.getValue(0)) {
-    LoMul = &AddcOp0;
-    LowAdd = &AddcOp1;
-  }
-  if (AddcOp1 == MULOp.getValue(0)) {
-    LoMul = &AddcOp1;
-    LowAdd = &AddcOp0;
+  if (AddcSubcOp0 == MULOp.getValue(0)) {
+    LoMul = &AddcSubcOp0;
+    LowAddSub = &AddcSubcOp1;
+  }
+  if (AddcSubcOp1 == MULOp.getValue(0)) {
+    LoMul = &AddcSubcOp1;
+    LowAddSub = &AddcSubcOp0;
   }
 
   if (!LoMul)
     return SDValue();
 
-  // If HiAdd is the same node as ADDC or is a predecessor of ADDC the
-  // replacement below will create a cycle.
-  if (AddcNode == HiAdd->getNode() ||
-      AddcNode->isPredecessorOf(HiAdd->getNode()))
+  // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of
+  // ADDC/SUBC the replacement below will create a cycle.
+  if (AddcSubcNode == HiAddSub->getNode() ||
+      AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
     return SDValue();
 
   // Create the merged node.
   SelectionDAG &DAG = DCI.DAG;
 
-  // Build operand list.
+  // Start building operand list.
   SmallVector<SDValue, 8> Ops;
   Ops.push_back(LoMul->getOperand(0));
   Ops.push_back(LoMul->getOperand(1));
-  Ops.push_back(*LowAdd);
-  Ops.push_back(*HiAdd);
-
-  SDValue MLALNode =  DAG.getNode(FinalOpc, SDLoc(AddcNode),
+
+  // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
+  // the case, we must be doing signed multiplication and only use the higher
+  // part of the result of the MLAL, furthermore the LowAddSub must be a
+  // constant addition or subtraction with the exact value of 0x80000000.
+  if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
+      FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
+      LowAddSub->getNode()->getOpcode() == ISD::Constant &&
+      static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
+          0x80000000) {
+    Ops.push_back(*HiAddSub);
+    if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
+      FinalOpc = ARMISD::SMMLSR;
+    } else {
+      FinalOpc = ARMISD::SMMLAR;
+    }
+    SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
+
+    return SDValue(AddeSubeNode, 0);
+  } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
+    // SMMLS is generated during instruction selection and the rest of this
+    // function cannot handle the case where AddcSubcNode is a SUBC.
+    return SDValue();
+
+  // Finish building the operand list for {U/S}MLAL.
+  Ops.push_back(*LowAddSub);
+  Ops.push_back(*HiAddSub);
+
+  SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
 
   // Replace the ADDs' nodes uses by the MLA node's values.
   SDValue HiMLALResult(MLALNode.getNode(), 1);
-  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
 
   SDValue LoMLALResult(MLALNode.getNode(), 0);
-  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
 
   // Return original node to notify the driver to stop replacing.
-  return SDValue(AddeNode, 0);
+  return SDValue(AddeSubeNode, 0);
 }
 
 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
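For context, the DAG shape this combine now recognizes typically originates from 64-bit source arithmetic like the sketch below (an editorial illustration, not code from the patch; the name mulhs_round is made up). After type legalization the i64 add is split into an ARMISD::ADDC on the low words and an ARMISD::ADDE on the high words, which is why the 0x80000000 rounding constant turns up as the LowAddSub operand of the ADDC:

    #include <cstdint>

    // Rounded multiply-high of two signed 32-bit values: the pattern that
    // can now be rewritten to ARMISD::SMMLAR (or SMMULR when the
    // accumulator is zero).
    int32_t mulhs_round(int32_t a, int32_t b) {
      int64_t product = (int64_t)a * b;         // ISD::SMUL_LOHI after legalization
      int64_t rounded = product + 0x80000000LL; // low half -> ADDC, high half -> ADDE
      return (int32_t)(rounded >> 32);          // only the high result is used
    }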
@@ -10098,9 +10137,11 @@
   return SDValue();
 }
 
-static SDValue PerformAddeSubeCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue PerformAddeSubeCombine(SDNode *N,
+                                      TargetLowering::DAGCombinerInfo &DCI,
                                       const ARMSubtarget *Subtarget) {
   if (Subtarget->isThumb1Only()) {
+    SelectionDAG &DAG = DCI.DAG;
     SDValue RHS = N->getOperand(1);
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
       int64_t imm = C->getSExtValue();
@@ -10118,6 +10159,8 @@
                          N->getOperand(0), RHS, N->getOperand(2));
       }
     }
+  } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
+    return AddCombineTo64bitMLAL(N, DCI, Subtarget);
   }
   return SDValue();
 }
@@ -10130,7 +10173,7 @@
                                  const ARMSubtarget *Subtarget) {
   // Only ARM and Thumb2 support UMLAL/SMLAL.
   if (Subtarget->isThumb1Only())
-    return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
+    return PerformAddeSubeCombine(N, DCI, Subtarget);
 
   // Only perform the checks after legalize when the pattern is available.
   if (DCI.isBeforeLegalize())
     return SDValue();
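The new else-if above is the hook that routes non-Thumb1 ADDE/SUBE nodes whose carry input is produced by a multiply into AddCombineTo64bitMLAL. The SUBC/SUBE form corresponds to a rounded multiply-subtract; roughly the following (a hedged sketch mirroring the SMMLSR test case added below, with an illustrative name):

    #include <cstdint>

    // Rounded multiply-subtract on the high word, as in the SMMLSR test:
    // ((acc << 32) | 0x80000000) - a * b, keeping bits [63:32] of the result.
    int32_t mulhs_sub_round(int32_t acc, int32_t a, int32_t b) {
      uint64_t wide = ((uint64_t)(uint32_t)acc << 32) | 0x80000000u; // SUBC/SUBE pair
      return (int32_t)((wide - (uint64_t)((int64_t)a * b)) >> 32);
    }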
@@ -12338,7 +12381,7 @@
   case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
   case ARMISD::ADDC:
   case ARMISD::SUBC:    return PerformAddcSubcCombine(N, DCI, Subtarget);
-  case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
+  case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI, Subtarget);
   case ARMISD::BFI:     return PerformBFICombine(N, DCI);
   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
Index: llvm/trunk/lib/Target/ARM/ARMInstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrInfo.td
+++ llvm/trunk/lib/Target/ARM/ARMInstrInfo.td
@@ -105,6 +105,14 @@
 def ARMSmlsld        : SDNode<"ARMISD::SMLSLD", SDT_LongMac>;
 def ARMSmlsldx       : SDNode<"ARMISD::SMLSLDX", SDT_LongMac>;
 
+def SDT_MulHSR       : SDTypeProfile<1, 3, [SDTCisVT<0,i32>,
+                                            SDTCisSameAs<0, 1>,
+                                            SDTCisSameAs<0, 2>,
+                                            SDTCisSameAs<0, 3>]>;
+
+def ARMsmmlar        : SDNode<"ARMISD::SMMLAR", SDT_MulHSR>;
+def ARMsmmlsr        : SDNode<"ARMISD::SMMLSR", SDT_MulHSR>;
+
 // Node definitions.
 def ARMWrapper       : SDNode<"ARMISD::Wrapper",     SDTIntUnaryOp>;
 def ARMWrapperPIC    : SDNode<"ARMISD::WrapperPIC",  SDTIntUnaryOp>;
@@ -4143,7 +4151,8 @@
 }
 
 def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
-               IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>,
+               IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, (ARMsmmlar GPR:$Rn, GPR:$Rm, (i32 0)))]>,
             Requires<[IsARM, HasV6]>,
             Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
   let Inst{15-12} = 0b1111;
@@ -4158,7 +4167,8 @@
 
 def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
-               IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
+               IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra",
+               [(set GPR:$Rd, (ARMsmmlar GPR:$Rn, GPR:$Rm, GPR:$Ra))]>,
             Requires<[IsARM, HasV6]>,
             Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
 
@@ -4170,7 +4180,8 @@
 
 def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
-               IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
+               IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra",
+               [(set GPR:$Rd, (ARMsmmlsr GPR:$Rn, GPR:$Rm, GPR:$Ra))]>,
             Requires<[IsARM, HasV6]>,
             Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
Index: llvm/trunk/lib/Target/ARM/ARMInstrThumb2.td
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrThumb2.td
+++ llvm/trunk/lib/Target/ARM/ARMInstrThumb2.td
@@ -2661,7 +2661,9 @@
 }
 
 def t2SMMUL : T2SMMUL<0b0000, "smmul", [(set rGPR:$Rd, (mulhs rGPR:$Rn, rGPR:$Rm))]>;
-def t2SMMULR : T2SMMUL<0b0001, "smmulr", []>;
+def t2SMMULR :
+  T2SMMUL<0b0001, "smmulr",
+          [(set rGPR:$Rd, (ARMsmmlar rGPR:$Rn, rGPR:$Rm, (i32 0)))]>;
 
 class T2FourRegSMMLA<bits<3> op22_20, bits<4> op7_4, string opc,
                      list<dag> pattern>
@@ -2677,9 +2679,11 @@
 
 def t2SMMLA : T2FourRegSMMLA<0b101, 0b0000, "smmla",
                 [(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>;
-def t2SMMLAR: T2FourRegSMMLA<0b101, 0b0001, "smmlar", []>;
+def t2SMMLAR: T2FourRegSMMLA<0b101, 0b0001, "smmlar",
+                [(set rGPR:$Rd, (ARMsmmlar rGPR:$Rn, rGPR:$Rm, rGPR:$Ra))]>;
 def t2SMMLS: T2FourRegSMMLA<0b110, 0b0000, "smmls", []>;
-def t2SMMLSR: T2FourRegSMMLA<0b110, 0b0001, "smmlsr", []>;
+def t2SMMLSR: T2FourRegSMMLA<0b110, 0b0001, "smmlsr",
+                [(set rGPR:$Rd, (ARMsmmlsr rGPR:$Rn, rGPR:$Rm, rGPR:$Ra))]>;
 
 class T2ThreeRegSMUL<bits<3> op22_20, bits<2> op5_4, string opc,
                      list<dag> pattern>
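Taken together, the new SDT_MulHSR patterns give the three instructions the semantics modeled below. This reference model is an editorial sketch derived from the patterns and the combine's 0x80000000 rounding constant, not code from the patch; note how SMMULR is expressed as ARMsmmlar with a zero accumulator, matching the (i32 0) operand:

    #include <cstdint>

    // High word of a 64-bit value (bits [63:32]). Unsigned 64-bit arithmetic
    // is used throughout so the wrapping behaviour is well defined in C++.
    static inline int32_t high_word(uint64_t x) { return (int32_t)(x >> 32); }

    // SMMLAR Rd, Rn, Rm, Ra: Rd = ((Ra << 32) + Rn * Rm + 0x80000000)[63:32].
    int32_t smmlar(int32_t rn, int32_t rm, int32_t ra) {
      uint64_t prod = (uint64_t)((int64_t)rn * rm); // signed 32x32 -> 64 multiply
      return high_word(((uint64_t)ra << 32) + prod + 0x80000000u);
    }

    // SMMLSR Rd, Rn, Rm, Ra: Rd = ((Ra << 32) - Rn * Rm + 0x80000000)[63:32].
    int32_t smmlsr(int32_t rn, int32_t rm, int32_t ra) {
      uint64_t prod = (uint64_t)((int64_t)rn * rm);
      return high_word(((uint64_t)ra << 32) - prod + 0x80000000u);
    }

    // SMMULR Rd, Rn, Rm: the zero-accumulator case, as in the (i32 0) pattern.
    int32_t smmulr(int32_t rn, int32_t rm) { return smmlar(rn, rm, 0); }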
Index: llvm/trunk/test/CodeGen/ARM/dsp-mlal.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/dsp-mlal.ll
+++ llvm/trunk/test/CodeGen/ARM/dsp-mlal.ll
@@ -0,0 +1,171 @@
+; RUN: llc -mtriple=thumbv7m -mattr=+dsp %s -o - | FileCheck %s
+; RUN: llc -mtriple=armv7a %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7m -mattr=-dsp %s -o - | FileCheck --check-prefix=NODSP %s
+
+define hidden i32 @SMMULR_SMMLAR(i32 %a, i32 %b0, i32 %b1, i32 %Xn, i32 %Xn1) local_unnamed_addr {
+entry:
+; CHECK-LABEL: SMMULR_SMMLAR:
+; CHECK: ldr r0, [sp]
+; CHECK-NEXT: smmulr r0, {{(r0, r2|r2, r0)}}
+; CHECK-NEXT: smmlar r0, {{(r1, r3|r3, r1)}}, r0
+; NODSP-LABEL: SMMULR_SMMLAR:
+; NODSP-NOT: smmulr
+; NODSP-NOT: smmlar
+  %conv = sext i32 %b1 to i64
+  %conv1 = sext i32 %Xn1 to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %add = add nsw i64 %mul, 2147483648
+  %0 = and i64 %add, -4294967296
+  %conv4 = sext i32 %b0 to i64
+  %conv5 = sext i32 %Xn to i64
+  %mul6 = mul nsw i64 %conv5, %conv4
+  %add7 = add i64 %mul6, 2147483648
+  %add8 = add i64 %add7, %0
+  %1 = lshr i64 %add8, 32
+  %conv10 = trunc i64 %1 to i32
+  ret i32 %conv10
+}
+
+define hidden i32 @SMMULR(i32 %a, i32 %b) local_unnamed_addr {
+entry:
+; CHECK-LABEL: SMMULR:
+; CHECK: smmulr r0, {{(r0, r1|r1, r0)}}
+; NODSP-LABEL: SMMULR:
+; NODSP-NOT: smmulr
+  %conv = sext i32 %a to i64
+  %conv1 = sext i32 %b to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %add = add nsw i64 %mul, 2147483648
+  %0 = lshr i64 %add, 32
+  %conv2 = trunc i64 %0 to i32
+  ret i32 %conv2
+}
+
+define hidden i32 @SMMUL(i32 %a, i32 %b) local_unnamed_addr {
+entry:
+; CHECK-LABEL: SMMUL:
+; CHECK: smmul r0, {{(r0, r1|r1, r0)}}
+; NODSP-LABEL: SMMUL:
+; NODSP-NOT: smmul
+  %conv = sext i32 %a to i64
+  %conv1 = sext i32 %b to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %0 = lshr i64 %mul, 32
+  %conv2 = trunc i64 %0 to i32
+  ret i32 %conv2
+}
+
+define hidden i32 @SMMLSR(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
+entry:
+; CHECK-LABEL: SMMLSR:
+; CHECK: smmlsr r0, {{(r1, r2|r2, r1)}}, r0
+; NODSP-LABEL: SMMLSR:
+; NODSP-NOT: smmlsr
+  %conv6 = zext i32 %a to i64
+  %shl = shl nuw i64 %conv6, 32
+  %conv1 = sext i32 %b to i64
+  %conv2 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv2, %conv1
+  %sub = or i64 %shl, 2147483648
+  %add = sub i64 %sub, %mul
+  %0 = lshr i64 %add, 32
+  %conv3 = trunc i64 %0 to i32
+  ret i32 %conv3
+}
+
+define hidden i32 @NOT_SMMLSR(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
+entry:
+; CHECK-LABEL: NOT_SMMLSR:
+; CHECK-NOT: smmlsr
+; NODSP-LABEL: NOT_SMMLSR:
+; NODSP-NOT: smmlsr
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %add = add nsw i64 %mul, 2147483648
+  %0 = lshr i64 %add, 32
+  %conv2 = trunc i64 %0 to i32
+  %sub = sub nsw i32 %a, %conv2
+  ret i32 %sub
+}
+
+define hidden i32 @SMMLS(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
+entry:
+; CHECK-LABEL: SMMLS:
+; CHECK: smmls r0, {{(r1, r2|r2, r1)}}, r0
+; NODSP-LABEL: SMMLS:
+; NODSP-NOT: smmls
+  %conv5 = zext i32 %a to i64
+  %shl = shl nuw i64 %conv5, 32
+  %conv1 = sext i32 %b to i64
+  %conv2 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv2, %conv1
+  %sub = sub nsw i64 %shl, %mul
+  %0 = lshr i64 %sub, 32
+  %conv3 = trunc i64 %0 to i32
+  ret i32 %conv3
+}
+
+define hidden i32 @NOT_SMMLS(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
+entry:
+; CHECK-LABEL: NOT_SMMLS:
+; CHECK-NOT: smmls
+; NODSP-LABEL: NOT_SMMLS:
+; NODSP-NOT: smmls
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %0 = lshr i64 %mul, 32
+  %conv2 = trunc i64 %0 to i32
+  %sub = sub nsw i32 %a, %conv2
+  ret i32 %sub
+}
+
+define hidden i32 @SMMLA(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
+entry:
+; CHECK-LABEL: SMMLA:
+; CHECK: smmla r0, {{(r1, r2|r2, r1)}}, r0
+; NODSP-LABEL: SMMLA:
+; NODSP-NOT: smmla
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %0 = lshr i64 %mul, 32
+  %conv2 = trunc i64 %0 to i32
+  %add = add nsw i32 %conv2, %a
+  ret i32 %add
+}
+
+define hidden i32 @SMMLAR(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
+entry:
+; CHECK-LABEL: SMMLAR:
+; CHECK: smmlar r0, {{(r1, r2|r2, r1)}}, r0
+; NODSP-LABEL: SMMLAR:
+; NODSP-NOT: smmlar
+  %conv7 = zext i32 %a to i64
+  %shl = shl nuw i64 %conv7, 32
+  %conv1 = sext i32 %b to i64
+  %conv2 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv2, %conv1
+  %add = or i64 %shl, 2147483648
+  %add3 = add i64 %add, %mul
+  %0 = lshr i64 %add3, 32
+  %conv4 = trunc i64 %0 to i32
+  ret i32 %conv4
+}
+
+define hidden i32 @NOT_SMMLA(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
+entry:
+; CHECK-LABEL: NOT_SMMLA:
+; CHECK-NOT: smmla
+; NODSP-LABEL: NOT_SMMLA:
+; NODSP-NOT: smmla
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %0 = lshr i64 %mul, 32
+  %conv2 = trunc i64 %0 to i32
+  %add = xor i32 %conv2, -2147483648
+  %add3 = add i32 %add, %a
+  ret i32 %add3
+}
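As a quick sanity check of the rounding behaviour these tests rely on, consider this small self-contained sketch (smmulr_ref is a hypothetical name repeating the reference model above):

    #include <cassert>
    #include <cstdint>

    static int32_t smmulr_ref(int32_t rn, int32_t rm) {
      return (int32_t)(((int64_t)rn * rm + 0x80000000LL) >> 32);
    }

    int main() {
      // 3 * 2^30 = 0xC0000000: plain SMMUL-style truncation of the high word
      // gives 0, while the +0x80000000 rounding constant carries into bit 32,
      // so the SMMULR result is 1.
      assert((((int64_t)3 * (1 << 30)) >> 32) == 0);
      assert(smmulr_ref(3, 1 << 30) == 1);
      return 0;
    }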