Index: lib/Target/ARM/ARMISelDAGToDAG.cpp
===================================================================
--- lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -286,6 +286,8 @@
   /// Replace N with M in CurDAG, in a way that also ensures that M gets
   /// selected when N would have been selected.
   void replaceDAGValue(const SDValue &N, SDValue M);
+
+  bool canUseVMLxForwarding(const SDNode &N) const;
 };
 }
@@ -418,9 +420,54 @@
   }
 }
+/// Checks whether VMLS/VMLA, when used instead of N, can be issued back to
+/// back with the previous instruction. E.g., Cortex-A9 NEON MPE uses a special
+/// multiplier accumulator forwarding if a multiply-accumulate follows a
+/// multiply or another multiply-accumulate, and depends on the result of that
+/// first instruction, and the dependency between both instructions is of the
+/// same type and size.
+bool ARMDAGToDAGISel::canUseVMLxForwarding(const SDNode& N) const {
+  if (!Subtarget->hasVMLxForwarding())
+    return false;
+
+  switch (N.getOpcode()) {
+  case ISD::FMA:
+    break;
+
+  case ISD::FSUB:
+  case ISD::FADD: {
+    if (N.getOperand(1).getOpcode() != ISD::FMUL)
+      return false;
+    break;
+  }
+
+  default:
+    return false;
+  }
+
+  const SDValue &Acc = N.getOperand(0);
+  if (Acc.getOpcode() != ISD::FMUL && Acc.getOpcode() != ISD::FMA)
+    return false;
+
+  if (N.getNumValues() != 1)
+    return false;
+
+  if (Acc.getValueType() != N.getValueType(0))
+    return false;
+
+  if (!Acc.getValueType().isSimple())
+    return false;
+
+  if (!Acc.getValueType().isFloatingPoint()
+      || Acc.getValueType().isVector())
+    return false;
+
+  return true;
+}
+
 /// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS
 /// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at
-/// least on current ARM implementations) which should be avoidded.
+/// least on current ARM implementations) which should be avoided.
 bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
   if (OptLevel == CodeGenOpt::None)
     return true;
@@ -428,6 +475,9 @@
   if (!Subtarget->hasVMLxHazards())
     return true;
+  if (canUseVMLxForwarding(*N))
+    return true;
+
   if (!N->hasOneUse())
     return false;
Index: lib/Target/ARM/MLxExpansionPass.cpp
===================================================================
--- lib/Target/ARM/MLxExpansionPass.cpp
+++ lib/Target/ARM/MLxExpansionPass.cpp
@@ -57,13 +57,16 @@
     unsigned MIIdx;
     MachineInstr* LastMIs[4];
     SmallPtrSet<MachineInstr*, 4> IgnoreStall;
+    SmallPtrSet<MachineInstr*, 4> AccForwarding;
     void clearStack();
     void pushStack(MachineInstr *MI);
     MachineInstr *getAccDefMI(MachineInstr *MI) const;
     unsigned getDefReg(MachineInstr *MI) const;
     bool hasLoopHazard(MachineInstr *MI) const;
-    bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
+    bool hasRAWHazard(MachineInstr *MI, MachineInstr *NextMI) const;
+    bool canUseVMLxForwarding(MachineInstr *MI,
+                              MachineInstr *AccDef) const;
     bool FindMLxHazard(MachineInstr *MI);
     void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
                                 unsigned MulOpc, unsigned AddSubOpc,
@@ -182,17 +185,21 @@
   return DefMI == MI;
 }
-bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
+bool MLxExpansion::hasRAWHazard(MachineInstr *MI, MachineInstr *NextMI) const {
+  unsigned Reg = getDefReg(MI);
   // FIXME: Detect integer instructions properly.
-  const MCInstrDesc &MCID = MI->getDesc();
+  const MCInstrDesc &MCID = NextMI->getDesc();
   unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
-  if (MI->mayStore())
+  if (NextMI->mayStore())
     return false;
   unsigned Opcode = MCID.getOpcode();
   if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
     return false;
-  if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON))
-    return MI->readsRegister(Reg, TRI);
+  if (Domain & ARMII::DomainNEON)
+    return NextMI->readsRegister(Reg, TRI);
+  else if (Domain & ARMII::DomainVFP)
+    return NextMI->readsRegister(Reg, TRI) && !AccForwarding.count(MI);
+
   return false;
 }
@@ -210,6 +217,49 @@
   }
 }
+/// Checks whether VMLS/VMLA can be issued back to back with the previous
+/// instruction. E.g., Cortex-A9 NEON MPE uses a special multiplier
+/// accumulator forwarding if a multiply-accumulate follows a multiply or
+/// another multiply-accumulate, and depends on the result of that first
+/// instruction, and the dependency between both instructions is of the
+/// same type and size.
+bool MLxExpansion::canUseVMLxForwarding(MachineInstr* MI,
+                                        MachineInstr *AccDef) const {
+  assert(MI);
+  assert(AccDef);
+  assert(TII->isFpMLxInstruction(MI->getOpcode()));
+  assert(MI->getParent());
+  assert(MI->getParent()->getParent());
+
+  if (!MI->getParent()->getParent()
+         ->getSubtarget<ARMSubtarget>().hasVMLxForwarding())
+    return false;
+
+  const unsigned AccDefOpcode = AccDef->getOpcode();
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+
+  case ARM::VMLAS:
+  case ARM::VMLSS:
+    if (AccDefOpcode != ARM::VMLAS && AccDefOpcode != ARM::VMLSS &&
+        AccDefOpcode != ARM::VMULS) {
+      return false;
+    }
+    break;
+
+  case ARM::VMLAD:
+  case ARM::VMLSD:
+    if (AccDefOpcode != ARM::VMLAD && AccDefOpcode != ARM::VMLSD &&
+        AccDefOpcode != ARM::VMULD) {
+      return false;
+    }
+    break;
+  }
+
+  return true;
+}
+
 bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
   if (NumExpand >= ExpandLimit)
     return false;
@@ -218,6 +268,12 @@
     return true;
   MachineInstr *DefMI = getAccDefMI(MI);
+
+  if (canUseVMLxForwarding(MI, DefMI)) {
+    AccForwarding.insert(DefMI);
+    return false;
+  }
+
   if (TII->isFpMLxInstruction(DefMI->getOpcode())) {
     // r0 = vmla
     // r3 = vmla r0, r1, r2
@@ -259,7 +315,7 @@
   }
   // Look for VMLx RAW hazard.
-    if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI))
+    if (i <= Limit2 && hasRAWHazard(MI, NextMI))
       return true;
   }
@@ -330,6 +386,7 @@
   clearStack();
   IgnoreStall.clear();
+  AccForwarding.clear();
   unsigned Skip = 0;
   MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend();
Index: test/CodeGen/ARM/fmacs.ll
===================================================================
--- test/CodeGen/ARM/fmacs.ll
+++ test/CodeGen/ARM/fmacs.ll
@@ -89,13 +89,11 @@
 ; A9-LABEL: t5:
 ; A9: vmla.f32
-; A9: vmul.f32
-; A9: vadd.f32
+; A9: vmla.f32
 
 ; HARD-LABEL: t5:
 ; HARD: vmla.f32 s4, s0, s1
-; HARD: vmul.f32 s0, s2, s3
-; HARD: vadd.f32 s0, s4, s0
+; HARD: vmla.f32 s4, s2, s3
 
   %0 = fmul float %a, %b
   %1 = fadd float %e, %0
   %2 = fmul float %c, %d
Index: test/CodeGen/ARM/fml.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/fml.ll
@@ -0,0 +1,64 @@
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=FPSCALAR
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=FPVECTOR
+
+define double @test1(double %a, double %b, double %c, double %d, double %e, double %f) {
+  %1 = fmul double %a, %c
+  %2 = fmul double %b, %d
+  %3 = fsub double %1, %2
+; FPSCALAR: vmls.f64 {{.*}}
+
+  %4 = fmul double %a, %d
+  %5 = fmul double %b, %c
+  %6 = fadd double %5, %4
+; FPSCALAR: vmla.f64 {{.*}}
+
+  %7 = fsub double %e, %3
+  %8 = fsub double %f, %6
+  %9 = fadd double %3, %8
+  %10 = fadd double %6, %7
+  %11 = fmul double %9, %10
+
+  ret double %11
+}
+
+define float @test2(float %a, float %b, float %c, float %d, float %e, float %f) {
+  %1 = fmul float %a, %c
+  %2 = fmul float %b, %d
+  %3 = fsub float %1, %2
+; FPSCALAR: vmls.f32 {{.*}}
+
+  %4 = fmul float %a, %d
+  %5 = fmul float %b, %c
+  %6 = fadd float %5, %4
+; FPSCALAR: vmla.f32 {{.*}}
+
+  %7 = fsub float %e, %3
+  %8 = fsub float %f, %6
+  %9 = fadd float %3, %8
+  %10 = fadd float %6, %7
+  %11 = fmul float %9, %10
+
+  ret float %11
+}
+
+define <2 x float> @test3(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, <2 x float> %e, <2 x float> %f) {
+; FPVECTOR-LABEL: test3
+  %1 = fmul <2 x float> %a, %c
+  %2 = fmul <2 x float> %b, %d
+  %3 = fsub <2 x float> %1, %2
+; FPVECTOR-NOT: vmls{{.*}}
+
+  %4 = fmul <2 x float> %a, %d
+  %5 = fmul <2 x float> %b, %c
+  %6 = fadd <2 x float> %5, %4
+; FPVECTOR-NOT: vmla{{.*}}
+
+  %7 = fsub <2 x float> %e, %3
+  %8 = fsub <2 x float> %f, %6
+  %9 = fadd <2 x float> %3, %8
+  %10 = fadd <2 x float> %6, %7
+  %11 = fmul <2 x float> %9, %10
+
+  ret <2 x float> %11
+}