Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.h
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.h
@@ -180,6 +180,10 @@
       UMLAL,        // 64bit Unsigned Accumulate Multiply
       SMLAL,        // 64bit Signed Accumulate Multiply
       UMAAL,        // 64-bit Unsigned Accumulate Accumulate Multiply
+      SMLALBB,      // 64-bit signed accumulate multiply bottom, bottom 16
+      SMLALBT,      // 64-bit signed accumulate multiply bottom, top 16
+      SMLALTB,      // 64-bit signed accumulate multiply top, bottom 16
+      SMLALTT,      // 64-bit signed accumulate multiply top, top 16
 
       // Operands of the standard BUILD_VECTOR node are not legalized, which
       // is fine if BUILD_VECTORs are always lowered to shuffles or other
Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
@@ -1337,6 +1337,10 @@
   case ARMISD::UMAAL:         return "ARMISD::UMAAL";
   case ARMISD::UMLAL:         return "ARMISD::UMLAL";
   case ARMISD::SMLAL:         return "ARMISD::SMLAL";
+  case ARMISD::SMLALBB:       return "ARMISD::SMLALBB";
+  case ARMISD::SMLALBT:       return "ARMISD::SMLALBT";
+  case ARMISD::SMLALTB:       return "ARMISD::SMLALTB";
+  case ARMISD::SMLALTT:       return "ARMISD::SMLALTT";
   case ARMISD::SMULWB:        return "ARMISD::SMULWB";
   case ARMISD::SMULWT:        return "ARMISD::SMULWT";
   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
@@ -9497,8 +9501,90 @@
   return SDValue();
 }
 
+/// Try to fold a 64-bit accumulate of a sign-extended 32-bit product of two
+/// 16-bit values into a single ARMISD::SMLAL{BB,BT,TB,TT} node. Matches
+///   AddcNode = (addc (mul a, b), lo)   AddeNode = (adde (sra mul, 31), hi)
+/// and builds SMLALxy(a, b, lo, hi). On success both adds are replaced and
+/// SDValue(AddcNode, 0) is returned; an empty SDValue means no match.
+static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const ARMSubtarget *Subtarget) {
+  // Thumb requires the DSP extension, ARM mode requires v5TE.
+  if (Subtarget->isThumb()) {
+    if (!Subtarget->hasDSP())
+      return SDValue();
+  } else if (!Subtarget->hasV5TEOps())
+    return SDValue();
+
+  // The ADDC yields the low word, so the operand it adds to the product is
+  // the low half of the accumulator. The MUL may sit on either side.
+  SDValue Mul = AddcNode->getOperand(0);
+  SDValue Lo = AddcNode->getOperand(1);
+  if (Mul.getOpcode() != ISD::MUL) {
+    Lo = AddcNode->getOperand(0);
+    Mul = AddcNode->getOperand(1);
+    if (Mul.getOpcode() != ISD::MUL)
+      return SDValue();
+  }
+
+  // The ADDE yields the high word: it adds the product's sign extension,
+  // (sra mul, 31), to the high half of the accumulator.
+  SDValue SRA = AddeNode->getOperand(0);
+  SDValue Hi = AddeNode->getOperand(1);
+  if (SRA.getOpcode() != ISD::SRA) {
+    SRA = AddeNode->getOperand(1);
+    Hi = AddeNode->getOperand(0);
+    if (SRA.getOpcode() != ISD::SRA)
+      return SDValue();
+  }
+  auto *ShAmt = dyn_cast<ConstantSDNode>(SRA.getOperand(1));
+  if (!ShAmt || ShAmt->getZExtValue() != 31)
+    return SDValue();
+
+  if (SRA.getOperand(0) != Mul)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(AddcNode);
+  unsigned Opcode = 0;
+  SDValue Op0, Op1;
+
+  if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
+    Opcode = ARMISD::SMLALBB;
+    Op0 = Mul.getOperand(0);
+    Op1 = Mul.getOperand(1);
+  } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
+    Opcode = ARMISD::SMLALBT;
+    Op0 = Mul.getOperand(0);
+    Op1 = Mul.getOperand(1).getOperand(0);
+  } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
+    Opcode = ARMISD::SMLALTB;
+    Op0 = Mul.getOperand(0).getOperand(0);
+    Op1 = Mul.getOperand(1);
+  } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
+    Opcode = ARMISD::SMLALTT;
+    Op0 = Mul.getOperand(0).getOperand(0);
+    Op1 = Mul.getOperand(1).getOperand(0);
+  }
+
+  if (!Op0 || !Op1)
+    return SDValue();
+
+  SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                              Op0, Op1, Lo, Hi);
+  // Result 0 is the low word (replaces the ADDC), result 1 the high word.
+  SDValue LoMLALResult(SMLAL.getNode(), 0);
+  SDValue HiMLALResult(SMLAL.getNode(), 1);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+
+  // Return original node to notify the driver to stop replacing.
+  return SDValue(AddcNode, 0);
+}
+
 static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
-                                     TargetLowering::DAGCombinerInfo &DCI) {
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const ARMSubtarget *Subtarget) {
   // Look for multiply add opportunities.
   // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
   // each add nodes consumes a value from ISD::UMUL_LOHI and there is
@@ -9535,12 +9621,13 @@
          AddcNode->getValueType(0) == MVT::i32 &&
          "Expect ADDC with two result values. First: i32");
 
-  // Check that the ADDC adds the low result of the S/UMUL_LOHI.
+  // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
+  // may be an SMLAL which multiplies two 16-bit values.
   if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
       AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
       AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
       AddcOp1->getOpcode() != ISD::SMUL_LOHI)
-    return SDValue();
+    return AddCombineTo64BitSMLAL16(AddcNode, AddeNode, DCI, Subtarget);
 
   // Check for the triangle shape.
   SDValue AddeOp0 = AddeNode->getOperand(0);
@@ -9628,7 +9715,7 @@
   // as the addend, and it's handled in PerformUMLALCombine.
 
   if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
-    return AddCombineTo64bitMLAL(AddeNode, DCI);
+    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
 
   // Check that we have a glued ADDC node.
SDNode* AddcNode = AddeNode->getOperand(2).getNode(); @@ -9645,7 +9732,7 @@ UmlalNode = AddcNode->getOperand(1).getNode(); AddHi = AddcNode->getOperand(0); } else { - return AddCombineTo64bitMLAL(AddeNode, DCI); + return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); } // The ADDC should be glued to an ADDE node, which uses the same UMLAL as @@ -11894,6 +11981,42 @@ return SDValue(); break; } + case ARMISD::SMLALBB: { + unsigned BitWidth = N->getValueType(0).getSizeInBits(); + APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); + if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) + return SDValue(); + break; + } + case ARMISD::SMLALBT: { + unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits(); + APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); + unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits(); + APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); + if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI))) + return SDValue(); + break; + } + case ARMISD::SMLALTB: { + unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits(); + APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); + unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits(); + APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); + if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI))) + return SDValue(); + break; + } + case ARMISD::SMLALTT: { + unsigned BitWidth = N->getValueType(0).getSizeInBits(); + APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); + if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) + return SDValue(); + break; + } case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { Index: 
llvm/trunk/lib/Target/ARM/ARMInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARMInstrInfo.td +++ llvm/trunk/lib/Target/ARM/ARMInstrInfo.td @@ -92,6 +92,13 @@ SDTCisVT<1, i32>, SDTCisVT<4, i32>]>; +def SDT_LongMac : SDTypeProfile<2, 4, [SDTCisVT<0, i32>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisSameAs<0, 4>, + SDTCisSameAs<0, 5>]>; + // Node definitions. def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>; @@ -185,6 +192,10 @@ def ARMsmulwb : SDNode<"ARMISD::SMULWB", SDTIntBinOp, []>; def ARMsmulwt : SDNode<"ARMISD::SMULWT", SDTIntBinOp, []>; +def ARMsmlalbb : SDNode<"ARMISD::SMLALBB", SDT_LongMac, []>; +def ARMsmlalbt : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>; +def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>; +def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>; //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. @@ -4183,29 +4194,28 @@ defm SMLA : AI_smla<"smla">; // Halfword multiply accumulate long: SMLAL. 
-def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), - (ins GPRnopc:$Rn, GPRnopc:$Rm), - IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV5TE]>, - Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; - -def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), - (ins GPRnopc:$Rn, GPRnopc:$Rm), - IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV5TE]>, - Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; - -def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), - (ins GPRnopc:$Rn, GPRnopc:$Rm), - IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV5TE]>, - Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; - -def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), - (ins GPRnopc:$Rn, GPRnopc:$Rm), - IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV5TE]>, - Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; +class SMLAL opc1, string asm> + : AMulxyI64<0b0001010, opc1, + (outs GPRnopc:$RdLo, GPRnopc:$RdHi), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi), + IIC_iMAC64, asm, "\t$RdLo, $RdHi, $Rn, $Rm", []>, + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, + Requires<[IsARM, HasV5TE]>, + Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; + +def SMLALBB : SMLAL<0b00, "smlalbb">; +def SMLALBT : SMLAL<0b10, "smlalbt">; +def SMLALTB : SMLAL<0b01, "smlaltb">; +def SMLALTT : SMLAL<0b11, "smlaltt">; + +def : ARMV5TEPat<(ARMsmlalbb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (SMLALBB $Rn, $Rm, $RLo, $RHi)>; +def : ARMV5TEPat<(ARMsmlalbt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (SMLALBT $Rn, $Rm, $RLo, $RHi)>; +def : ARMV5TEPat<(ARMsmlaltb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (SMLALTB $Rn, $Rm, $RLo, $RHi)>; +def : ARMV5TEPat<(ARMsmlaltt GPR:$Rn, GPR:$Rm, 
GPR:$RLo, GPR:$RHi), + (SMLALTT $Rn, $Rm, $RLo, $RHi)>; // Helper class for AI_smld. class AMulDualIbase; -class T2SMLAL op22_20, bits<4> op7_4, string opc, list pattern> - : T2FourReg_mac<1, op22_20, op7_4, - (outs rGPR:$Ra, rGPR:$Rd), - (ins rGPR:$Rn, rGPR:$Rm), - IIC_iMAC64, opc, "\t$Ra, $Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasDSP]>, - Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>; - // Halfword multiple accumulate long: SMLAL -def t2SMLALBB : T2SMLAL<0b100, 0b1000, "smlalbb", []>; -def t2SMLALBT : T2SMLAL<0b100, 0b1001, "smlalbt", []>; -def t2SMLALTB : T2SMLAL<0b100, 0b1010, "smlaltb", []>; -def t2SMLALTT : T2SMLAL<0b100, 0b1011, "smlaltt", []>; +def t2SMLALBB : T2MlaLong<0b100, 0b1000, "smlalbb">, + Requires<[IsThumb2, HasDSP]>; +def t2SMLALBT : T2MlaLong<0b100, 0b1001, "smlalbt">, + Requires<[IsThumb2, HasDSP]>; +def t2SMLALTB : T2MlaLong<0b100, 0b1010, "smlaltb">, + Requires<[IsThumb2, HasDSP]>; +def t2SMLALTT : T2MlaLong<0b100, 0b1011, "smlaltt">, + Requires<[IsThumb2, HasDSP]>; + +def : Thumb2DSPPat<(ARMsmlalbb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (t2SMLALBB $Rn, $Rm, $RLo, $RHi)>; +def : Thumb2DSPPat<(ARMsmlalbt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (t2SMLALBT $Rn, $Rm, $RLo, $RHi)>; +def : Thumb2DSPPat<(ARMsmlaltb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (t2SMLALTB $Rn, $Rm, $RLo, $RHi)>; +def : Thumb2DSPPat<(ARMsmlaltt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + (t2SMLALTT $Rn, $Rm, $RLo, $RHi)>; class T2DualHalfMul op22_20, bits<4> op7_4, string opc> : T2ThreeReg_mac<0, op22_20, op7_4, Index: llvm/trunk/test/CodeGen/ARM/longMAC.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/longMAC.ll +++ llvm/trunk/test/CodeGen/ARM/longMAC.ll @@ -3,12 +3,13 @@ ; RUN: llc -mtriple=armeb-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE ; RUN: llc -mtriple=armebv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7-BE ; RUN: llc -mtriple=thumbv6-eabi %s -o - 
| FileCheck %s -check-prefix=CHECK-V6-THUMB -; RUN: llc -mtriple=thumbv6t2-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V6-THUMB2 -; RUN: llc -mtriple=thumbv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7-THUMB +; RUN: llc -mtriple=thumbv6t2-eabi %s -o - | FileCheck %s -check-prefix=CHECK-T2-DSP +; RUN: llc -mtriple=thumbv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-T2-DSP ; RUN: llc -mtriple=thumbebv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7-THUMB-BE ; RUN: llc -mtriple=thumbv6m-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V6M-THUMB ; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7M-THUMB -; RUN: llc -mtriple=thumbv7em-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7EM-THUMB +; RUN: llc -mtriple=thumbv7em-eabi %s -o - | FileCheck %s -check-prefix=CHECK-T2-DSP +; RUN: llc -mtriple=armv5te-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V5TE ; Check generated signed and unsigned multiply accumulate long. define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) { @@ -20,12 +21,9 @@ ;CHECK-BE: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-BE: mov r0, [[RDHI]] ;CHECK-BE: mov r1, [[RDLO]] -;CHECK-V6-THUMB2: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V6-THUMB2: mov r0, [[RDLO]] -;CHECK-V6-THUMB2: mov r1, [[RDHI]] -;CHECK-V7-THUMB: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7-THUMB: mov r0, [[RDLO]] -;CHECK-V7-THUMB: mov r1, [[RDHI]] +;CHECK-T2-DSP: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] +;CHECK-T2-DSP-NEXT: mov r0, [[RDLO]] +;CHECK-T2-DSP-NEXT: mov r1, [[RDHI]] ;CHECK-V7-THUMB-BE: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]] ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]] @@ -44,12 +42,9 @@ ;CHECK-BE: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-BE: mov r0, [[RDHI]] ;CHECK-BE: mov 
r1, [[RDLO]] -;CHECK-V6-THUMB2: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V6-THUMB2: mov r0, [[RDLO]] -;CHECK-V6-THUMB2: mov r1, [[RDHI]] -;CHECK-V7-THUMB: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7-THUMB: mov r0, [[RDLO]] -;CHECK-V7-THUMB: mov r1, [[RDHI]] +;CHECK-T2-DSP: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] +;CHECK-T2-DSP-NEXT: mov r0, [[RDLO]] +;CHECK-T2-DSP-NEXT: mov r1, [[RDHI]] ;CHECK-V7-THUMB-BE: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]] ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]] @@ -78,8 +73,7 @@ ;CHECK-BE: umlal [[RDLO:r[0-9]+]], [[RDHI]], r1, r0 ;CHECK-BE: mov r0, [[RDHI]] ;CHECK-BE: mov r1, [[RDLO]] -;CHECK-V6-THUMB2: umlal -;CHECK-V7-THUMB: umlal +;CHECK-T2-DSP: umlal ;CHECK-V6-THUMB-NOT: umlal %conv = zext i32 %b to i64 %conv1 = zext i32 %a to i64 @@ -92,8 +86,7 @@ define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) { ;CHECK-LABEL: MACLongTest4: ;CHECK-V6-THUMB-NOT: smlal -;CHECK-V6-THUMB2: smlal -;CHECK-V7-THUMB: smlal +;CHECK-T2-DSP: smlal ;CHECK-LE: asr [[RDHI:r[0-9]+]], [[RDLO:r[0-9]+]], #31 ;CHECK-LE: smlal [[RDLO]], [[RDHI]], r1, r0 ;CHECK-LE: mov r0, [[RDLO]] @@ -118,10 +111,8 @@ ;CHECK: smlal r12, lr, r3, r2 ;CHECK-V7: smull [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0 ;CHECK-V7: smlal [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]] -;CHECK-V7-THUMB: smull [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0 -;CHECK-V7-THUMB: smlal [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]] -;CHECK-V6-THUMB2: smull [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0 -;CHECK-V6-THUMB2: smlal [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]] +;CHECK-T2-DSP: smull [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0 +;CHECK-T2-DSP: smlal [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]] %conv = sext i32 %a to i64 %conv1 = sext i32 %b to i64 %mul = mul nsw i64 %conv1, %conv @@ -172,18 +163,12 @@ 
;CHECK-V7-BE: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-BE: mov r0, [[RDHI]] ;CHECK-V7-BE: mov r1, [[RDLO]] -;CHECK-V6-THUMB2: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V6-THUMB2: mov r0, [[RDLO]] -;CHECK-V6-THUMB2: mov r1, [[RDHI]] -;CHECK-V7-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7-THUMB: mov r0, [[RDLO]] -;CHECK-V7-THUMB: mov r1, [[RDHI]] +;CHECK-T2-DSP: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] +;CHECK-T2-DSP-NEXT: mov r0, [[RDLO]] +;CHECK-T2-DSP-NEXT: mov r1, [[RDHI]] ;CHECK-V7-THUMB-BE: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]] ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]] -;CHECK-V7EM-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7EM-THUMB: mov r0, [[RDLO]] -;CHECK-V7EM-THUMB: mov r1, [[RDHI]] ;CHECK-NOT:umaal ;CHECK-V6-THUMB-NOT: umaal ;CHECK-V6M-THUMB-NOT: umaal @@ -206,18 +191,12 @@ ;CHECK-V7-BE: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-BE: mov r0, [[RDHI]] ;CHECK-V7-BE: mov r1, [[RDLO]] -;CHECK-V6-THUMB2: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V6-THUMB2: mov r0, [[RDLO]] -;CHECK-V6-THUMB2: mov r1, [[RDHI]] -;CHECK-V7-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7-THUMB: mov r0, [[RDLO]] -;CHECK-V7-THUMB: mov r1, [[RDHI]] +;CHECK-T2-DSP: umaal r2, r3, r1, r0 +;CHECK-T2-DSP-NEXT: mov r0, r2 +;CHECK-T2-DSP-NEXT: mov r1, r3 ;CHECK-V7-THUMB-BE: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]] ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]] -;CHECK-V7EM-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7EM-THUMB: mov r0, [[RDLO]] -;CHECK-V7EM-THUMB: mov r1, [[RDHI]] 
;CHECK-NOT:umaal ;CHECK-V6-THUMB-NOT:umaal ;CHECK-V6M-THUMB-NOT: umaal @@ -231,3 +210,188 @@ %add2 = add i64 %add, %mul ret i64 %add2 } + +define i64 @MACLongTest11(i16 %a, i16 %b, i64 %c) { +;CHECK-LABEL: MACLongTest11: +;CHECK-T2-DSP-NOT: sxth +;CHECK-T2-DSP: smlalbb r3, r2 +;CHECK-T2-DSP-NEXT: mov r0, r3 +;CHECK-T2-DSP-NEXT: mov r1, r2 +;CHECK-V5TE-NOT: sxth +;CHECK-V5TE: smlalbb r3, r2 +;CHECK-V5TE-NEXT: mov r0, r3 +;CHECK-V5TE-NEXT: mov r1, r2 +;CHECK-V7-LE-NOT: sxth +;CHECK-V7-LE: smlalbb r3, r2 +;CHECK-V7-LE-NEXT: mov r0, r3 +;CHECK-V7-LE-NEXT: mov r1, r2 +;CHECK-V7-THUMB-BE: smlalbb r2, r3 +;CHECK-V7-THUMB-BE-NEXT: mov r0, r3 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r2 +;CHECK-LE-NOT: smlalbb +;CHECK-BE-NOT: smlalbb +;CHECK-V6M-THUMB-NOT: smlalbb +;CHECK-V7M-THUMB-NOT: smlalbb + %conv = sext i16 %a to i32 + %conv1 = sext i16 %b to i32 + %mul = mul nsw i32 %conv1, %conv + %conv2 = sext i32 %mul to i64 + %add = add nsw i64 %conv2, %c + ret i64 %add +} + +define i64 @MACLongTest12(i16 %b, i32 %t, i64 %c) { +;CHECK-LABEL: MACLongTest12: +;CHECK-T2-DSP-NOT: sxth +;CHECK-T2-DSP-NOT: {{asr|lsr}} +;CHECK-T2-DSP: smlalbt r3, r2, r0, r1 +;CHECK-T2-DSP-NEXT: mov r0, r3 +;CHECK-T2-DSP-NEXT: mov r1, r2 +;CHECK-T2-DSP-NOT: sxth +;CHECK-V5TE-NOT: sxth +;CHECK-V5TE-NOT: {{asr|lsr}} +;CHECK-V5TE: smlalbt r3, r2, r0, r1 +;CHECK-V5TE-NEXT: mov r0, r3 +;CHECK-V5TE-NEXT: mov r1, r2 +;CHECK-V7-LE-NOT: sxth +;CHECK-V7-LE-NOT: {{asr|lsr}} +;CHECK-V7-LE: smlalbt r3, r2, r0, r1 +;CHECK-V7-LE-NEXT: mov r0, r3 +;CHECK-V7-LE-NEXT: mov r1, r2 +;CHECK-V7-THUMB-BE: smlalbt r2, r3, +;CHECK-V7-THUMB-BE-NEXT: mov r0, r3 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r2 +;CHECK-LE-NOT: smlalbt +;CHECK-BE-NOT: smlalbt +;CHECK-V6M-THUMB-NOT: smlalbt +;CHECK-V7M-THUMB-NOT: smlalbt + %conv0 = sext i16 %b to i32 + %conv1 = ashr i32 %t, 16 + %mul = mul nsw i32 %conv0, %conv1 + %conv2 = sext i32 %mul to i64 + %add = add nsw i64 %conv2, %c + ret i64 %add +} + +define i64 @MACLongTest13(i32 %t, i16 %b, i64 %c) { 
+;CHECK-LABEL: MACLongTest13: +;CHECK-T2-DSP-NOT: sxth +;CHECK-T2-DSP-NOT: {{asr|lsr}} +;CHECK-T2-DSP: smlaltb r3, r2, r0, r1 +;CHECK-T2-DSP-NEXT: mov r0, r3 +;CHECK-T2-DSP-NEXT: mov r1, r2 +;CHECK-V5TE-NOT: sxth +;CHECK-V5TE-NOT: {{asr|lsr}} +;CHECK-V5TE: smlaltb r3, r2, r0, r1 +;CHECK-V5TE-NEXT: mov r0, r3 +;CHECK-V5TE-NEXT: mov r1, r2 +;CHECK-V7-LE-NOT: sxth +;CHECK-V7-LE-NOT: {{asr|lsr}} +;CHECK-V7-LE: smlaltb r3, r2, r0, r1 +;CHECK-V7-LE-NEXT: mov r0, r3 +;CHECK-V7-LE-NEXT: mov r1, r2 +;CHECK-V7-THUMB-BE: smlaltb r2, r3, r0, r1 +;CHECK-V7-THUMB-BE-NEXT: mov r0, r3 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r2 +;CHECK-LE-NOT: smlaltb +;CHECK-BE-NOT: smlaltb +;CHECK-V6M-THUMB-NOT: smlaltb +;CHECK-V7M-THUMB-NOT: smlaltb + %conv0 = ashr i32 %t, 16 + %conv1= sext i16 %b to i32 + %mul = mul nsw i32 %conv0, %conv1 + %conv2 = sext i32 %mul to i64 + %add = add nsw i64 %conv2, %c + ret i64 %add +} + +define i64 @MACLongTest14(i32 %a, i32 %b, i64 %c) { +;CHECK-LABEL: MACLongTest14: +;CHECK-T2-DSP-NOT: {{asr|lsr}} +;CHECK-T2-DSP: smlaltt r3, r2, +;CHECK-T2-DSP-NEXT: mov r0, r3 +;CHECK-T2-DSP-NEXT: mov r1, r2 +;CHECK-V5TE-NOT: {{asr|lsr}} +;CHECK-V5TE: smlaltt r3, r2, +;CHECK-V5TE-NEXT: mov r0, r3 +;CHECK-V5TE-NEXT: mov r1, r2 +;CHECK-V7-LE-NOT: {{asr|lsr}} +;CHECK-V7-LE: smlaltt r3, r2, +;CHECK-V7-LE-NEXT: mov r0, r3 +;CHECK-V7-LE-NEXT: mov r1, r2 +;CHECK-V7-THUMB-BE: smlaltt r2, r3, +;CHECK-V7-THUMB-BE-NEXT: mov r0, r3 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r2 +;CHECK-LE-NOT: smlaltt +;CHECK-BE-NOT: smlaltt +;CHECK-V6M-THUMB-NOT: smlaltt +;CHECK-V7M-THUMB-NOT: smlaltt + %conv0 = ashr i32 %a, 16 + %conv1 = ashr i32 %b, 16 + %mul = mul nsw i32 %conv1, %conv0 + %conv2 = sext i32 %mul to i64 + %add = add nsw i64 %conv2, %c + ret i64 %add +} + +@global_b = external global i16, align 2 +;CHECK-LABEL: MACLongTest15 +;CHECK-T2-DSP-NOT: {{asr|lsr}} +;CHECK-T2-DSP: smlaltb r3, r2, r0, r1 +;CHECK-T2-DSP-NEXT: mov r0, r3 +;CHECK-T2-DSP-NEXT: mov r1, r2 +;CHECK-V5TE-NOT: {{asr|lsr}} +;CHECK-V5TE: 
smlaltb r3, r2, r0, r1 +;CHECK-V5TE-NEXT: mov r0, r3 +;CHECK-V5TE-NEXT: mov r1, r2 +;CHECK-V7-LE-NOT: {{asr|lsr}} +;CHECK-V7-LE: smlaltb r3, r2, r0, r1 +;CHECK-V7-LE-NEXT: mov r0, r3 +;CHECK-V7-LE-NEXT: mov r1, r2 +;CHECK-V7-THUMB-BE: smlaltb r2, r3, r0, r1 +;CHECK-V7-THUMB-BE-NEXT: mov r0, r3 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r2 +;CHECK-LE-NOT: smlaltb +;CHECK-BE-NOT: smlaltb +;CHECK-V6M-THUMB-NOT: smlaltb +;CHECK-V7M-THUMB-NOT: smlaltb +define i64 @MACLongTest15(i32 %t, i64 %acc) { +entry: + %0 = load i16, i16* @global_b, align 2 + %conv = sext i16 %0 to i32 + %shr = ashr i32 %t, 16 + %mul = mul nsw i32 %shr, %conv + %conv1 = sext i32 %mul to i64 + %add = add nsw i64 %conv1, %acc + ret i64 %add +} + +;CHECK-LABEL: MACLongTest16 +;CHECK-T2-DSP-NOT: {{asr|lsr}} +;CHECK-T2-DSP: smlalbt r3, r2, r1, r0 +;CHECK-T2-DSP-NEXT: mov r0, r3 +;CHECK-T2-DSP-NEXT: mov r1, r2 +;CHECK-V5TE-NOT: {{asr|lsr}} +;CHECK-V5TE: smlalbt r3, r2, r1, r0 +;CHECK-V5TE-NEXT: mov r0, r3 +;CHECK-V5TE-NEXT: mov r1, r2 +;CHECK-V7-LE: smlalbt r3, r2, r1, r0 +;CHECK-V7-LE-NEXT: mov r0, r3 +;CHECK-V7-LE-NEXT: mov r1, r2 +;CHECK-V7-THUMB-BE: smlalbt r2, r3, r1, r0 +;CHECK-V7-THUMB-BE-NEXT: mov r0, r3 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r2 +;CHECK-LE-NOT: smlalbt +;CHECK-BE-NOT: smlalbt +;CHECK-V6M-THUMB-NOT: smlalbt +;CHECK-V7M-THUMB-NOT: smlalbt +define i64 @MACLongTest16(i32 %t, i64 %acc) { +entry: + %0 = load i16, i16* @global_b, align 2 + %conv = sext i16 %0 to i32 + %shr = ashr i32 %t, 16 + %mul = mul nsw i32 %conv, %shr + %conv1 = sext i32 %mul to i64 + %add = add nsw i64 %conv1, %acc + ret i64 %add +}