This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
lib/Target/ARM/
-
Target/
-
ARM/
12
ARMISelDAGToDAG.cpp
4
MLxExpansionPass.cpp
-
test/CodeGen/ARM/
-
CodeGen/
-
ARM/
-
fmacs.ll
-
vmlx-fwd.ll

Differential D25020

[ARM] Fix 26% performance regression on Cortex-A9 caused by not using VMLA/VMLS
Needs ReviewPublic

Authored by eastig on Sep 28 2016, 7:31 AM.

Download Raw Diff

Details

Reviewers

rovka
rengolin
t.p.northover
jmolloy

Summary

We have 26% performance regression on Cortex-A9. We found it is caused by not using VMLA/VMLS.
There is ARMDAGToDAGISel::hasNoVMLxHazardUse which prevents generation of VMLx instructions for Cortex-A8 and Cortex-A9. Also there is a MLxExpansion pass which expands VMLx instructions in case of Cortex-A8 and Cortex-A9. The code is based on the note:

http://infocenter.arm.com/help/topic/com.arm.doc.ddi0344k/BCGDCECC.html#ftn.CEGHAGEA

The VMLA.F and VMLS.F type instructions have additional restrictions that determine when they can be issued:
If a VMLA.F is followed by a VMLA.F with no RAW hazard, the second VMLA.F issues with no stalls.
If a VMLA.F is followed by a VADD.F or VMUL.F with no RAW hazard, the VADD.F or VMUL.F stalls 4 cycles before issue. The 4 cycle stall preserves the in-order retirement of the instructions.
A VMLA.F followed by any NEON floating-point instruction with RAW hazard stalls for 8 cycles.

3.7.0/3.7.1 have a bug which causes Subtarget->isCortexA9() to return false even when the specified CPU is Cortex-A9. So the code did not work and VMLx instructions were generated.
In 3.8.0 it is fixed:

$ cat fml.ll 
define double @test(double %a, double %b, double %c, double %d, double %e, double %f) #0 {
  %1 = fmul double %a, %c
  %2 = fmul double %b, %d
  %3 = fsub double %1, %2

  %4 = fmul double %a, %d
  %5 = fmul double %b, %c
  %6 = fadd double %5, %4

  %7 = fsub double %e, %3
  %8 = fsub double %f, %6
  %9 = fadd double %3, %8
  %10 = fadd double %6, %7
  %11 = fmul double %9, %10

  ret double %11
}

attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+neon,+vfp3,-crypto,-d16,-fp-armv8,-fp-only-sp,-fp16,-vfp4" "unsafe-fp-math"="false" "use-soft-float"="false" }

$ 3.7.1/bin/llc -mtriple=arm-eabi -mcpu=cortex-a9 fml.ll -o -
        .text
        .syntax unified
        .eabi_attribute 67, "2.09"      @ Tag_conformance
        .cpu    cortex-a9
        .eabi_attribute 6, 10   @ Tag_CPU_arch
        .eabi_attribute 7, 65   @ Tag_CPU_arch_profile
        .eabi_attribute 8, 1    @ Tag_ARM_ISA_use
        .eabi_attribute 9, 2    @ Tag_THUMB_ISA_use
        .fpu    neon-fp16
        .eabi_attribute 17, 1   @ Tag_ABI_PCS_GOT_use
        .eabi_attribute 20, 1   @ Tag_ABI_FP_denormal
        .eabi_attribute 21, 1   @ Tag_ABI_FP_exceptions
        .eabi_attribute 23, 3   @ Tag_ABI_FP_number_model
        .eabi_attribute 34, 0   @ Tag_CPU_unaligned_access
        .eabi_attribute 24, 1   @ Tag_ABI_align_needed
        .eabi_attribute 25, 1   @ Tag_ABI_align_preserved
        .eabi_attribute 36, 1   @ Tag_FP_HP_extension
        .eabi_attribute 38, 1   @ Tag_ABI_FP_16bit_format
        .eabi_attribute 42, 1   @ Tag_MPextension_use
        .eabi_attribute 14, 0   @ Tag_ABI_PCS_R9_use
        .eabi_attribute 68, 1   @ Tag_Virtualization_use
        .file   "fml.ll"
        .globl  test
        .align  2
        .type   test,%function
test:                                   @ @test
        .fnstart
@ BB#0:
        vldr    d16, [sp]
        vldr    d18, [sp, #8]
        vmov    d17, r0, r1
        vmul.f64        d19, d17, d16
        vmul.f64        d17, d17, d18
        vmov    d20, r2, r3
        vmls.f64        d19, d20, d18
        vmla.f64        d17, d20, d16
        vldr    d16, [sp, #16]
        vldr    d18, [sp, #24]
        vsub.f64        d16, d16, d19
        vsub.f64        d18, d18, d17
        vadd.f64        d16, d17, d16
        vadd.f64        d17, d19, d18
        vmul.f64        d16, d17, d16
        vmov    r0, r1, d16
        bx      lr
.Lfunc_end0:
        .size   test, .Lfunc_end0-test
        .cantunwind
        .fnend


        .section        ".note.GNU-stack","",%progbits

$ 3.8.0/bin/llc -mtriple=arm-eabi -mcpu=cortex-a9 fml.ll -o -
        .text
        .syntax unified
        .eabi_attribute 67, "2.09"      @ Tag_conformance
        .cpu    cortex-a9
        .eabi_attribute 6, 10   @ Tag_CPU_arch
        .eabi_attribute 7, 65   @ Tag_CPU_arch_profile
        .eabi_attribute 8, 1    @ Tag_ARM_ISA_use
        .eabi_attribute 9, 2    @ Tag_THUMB_ISA_use
        .fpu    neon-fp16
        .eabi_attribute 17, 1   @ Tag_ABI_PCS_GOT_use
        .eabi_attribute 20, 1   @ Tag_ABI_FP_denormal
        .eabi_attribute 21, 1   @ Tag_ABI_FP_exceptions
        .eabi_attribute 23, 3   @ Tag_ABI_FP_number_model
        .eabi_attribute 34, 1   @ Tag_CPU_unaligned_access
        .eabi_attribute 24, 1   @ Tag_ABI_align_needed
        .eabi_attribute 25, 1   @ Tag_ABI_align_preserved
        .eabi_attribute 36, 1   @ Tag_FP_HP_extension
        .eabi_attribute 38, 1   @ Tag_ABI_FP_16bit_format
        .eabi_attribute 42, 1   @ Tag_MPextension_use
        .eabi_attribute 14, 0   @ Tag_ABI_PCS_R9_use
        .eabi_attribute 68, 1   @ Tag_Virtualization_use
        .file   "fml.ll"
        .globl  test
        .align  2
        .type   test,%function
test:                                   @ @test
        .fnstart
@ BB#0:
        vldmia  sp, {d16, d17}
        vmov    d18, r2, r3
        vmov    d19, r0, r1
        vmul.f64        d20, d18, d17
        vmul.f64        d21, d19, d16
        vmul.f64        d17, d19, d17
        vmul.f64        d16, d18, d16
        vsub.f64        d18, d21, d20
        vldr    d19, [sp, #24]
        vadd.f64        d16, d16, d17
        vldr    d17, [sp, #16]
        vsub.f64        d17, d17, d18
        vsub.f64        d19, d19, d16
        vadd.f64        d16, d16, d17
        vadd.f64        d17, d18, d19
        vmul.f64        d16, d17, d16
        vmov    r0, r1, d16
        bx      lr
.Lfunc_end0:
        .size   test, .Lfunc_end0-test
        .cantunwind
        .fnend


        .section        ".note.GNU-stack","",%progbits
        .eabi_attribute 30, 1   @ Tag_ABI_optimization_goals

VMLx instructions can be faster on Cortex-A9 because of accumulator forwarding:

http://infocenter.arm.com/help/topic/com.arm.doc.ddi0409i/BCGDCIBA.html#ftn.id3445094

If a multiply-accumulate follows a multiply or another multiply-accumulate, and depends on the result of that first instruction, then if the dependency between both instructions
are of the same type and size, the processor uses a special multiplier accumulator forwarding. This special forwarding means the multiply instructions can issue back-to-back
because the result of the first instruction in cycle 5 is forwarded to the accumulator of the second instruction in cycle 4. If the size and type of the instructions do not match, then
Dd or Qd is required in cycle 3. This applies to combinations of the multiply-accumulate instructions VMLA, VMLS, VQDMLA, and VQDMLS, and the multiply instructions VMUL
and VQDMUL.

This patch fixes this issue.

Diff Detail

Build Status

Buildable 362
Build 362: arc lint + arc unit

Event Timeline

eastig updated this revision to Diff 72817.Sep 28 2016, 7:31 AM

eastig retitled this revision from to [ARM] Fix 26% performance regression on Cortex-A9 caused by not using VMLA/VMLS.

eastig updated this object.

eastig added reviewers: jmolloy, rengolin, t.p.northover.

eastig added a subscriber: llvm-commits.

Herald added subscribers: samparker, rengolin, aemerson. · View Herald TranscriptSep 28 2016, 7:31 AM

eastig updated this object.Sep 28 2016, 7:32 AM

Ping

Changed to use Subtarget->hasVMLxForwarding() to check if VMLx forwarding is supported.
Updated function names to reflect that VMLx forwarding is checked.

Hi,

Adding Diana, as she was looking into the VMLA hazard a few weeks ago in an attempt to remove a few CPU-specific flags. This review will need a deeper look in relation to the isLikeA9 check.

cheers,
--renato

Hi Evgeny,

Thanks for working on this.

Your patch looks good in general, but I have a few comments:

You mention that there's a performance regression, but you don't mention where - is it in a well-known benchmark or in proprietary code? If it's in proprietary code, I think it's customary to also get the results on the test-suite/SPEC/something to show that it doesn't break anything the community cares about.
The commit message keeps mentioning Cortex-A8 and Cortex-A9 together, but the MLx expansion pass is only enabled for Cortex-A9. If you have some performance numbers that look good for Cortex-A8, it would be a good idea to enable the pass for it too (just add FeatureExpandMLx to it in ARM.td).
I think the commit message is a bit TL;DR, could you condense it a bit? (e.g. keep only the relevant assembly snippets instead of the whole output)
Thanks for switching to using the subtarget feature, that would've been my first comment otherwise :)

Regards,
Diana

lib/Target/ARM/ARMISelDAGToDAG.cpp
421	I don't understand the first part of the comment, it's either incomplete or could use some rephrasing.
450	This isn't related, and since it's a typo fix you can just push it without review.
460	Why return false for vectors? AFAICT the Cortex-A9 manual says accumulator forwarding should apply for VQDMLA/VQDMLS. I see this assumption everywhere in the patch - maybe there should be a comment somewhere explaining it?
lib/Target/ARM/MLxExpansionPass.cpp
70	Did you run clang-format on this?
235	This looks a bit awkward and it doesn't seem to be on a very cold path either - maybe add a private flag and set it in runOnMachineFunction?
test/CodeGen/ARM/fml.ll
2 ↗	(On Diff #73445)	Could you add some run lines for cores that don't have accumulator forwarding, so we can test the other behavior as well? Also, can you rename the file to something more representative?
4 ↗	(On Diff #73445)	You should add a CHECK-LABEL directive for each function to make sure you're matching the expected instructions and not others appearing further down (doesn't seem likely now, but people may append tests to this file in the future).
19 ↗	(On Diff #73445)	Shouldn't this snippet have an expectation too? (Ditto in the other functions)

Hi Diana,

Thank you for comments.

You mention that there's a performance regression, but you don't mention where - is it in a well-known benchmark or in proprietary code? If it's in proprietary code, I think it's customary to also get the results on the test-suite/SPEC/something to show that it doesn't break anything the community cares about.

What I can say is that it is not in proprietary code. I am sorry but I am not allowed to provide more details here. Maybe it can be shared via our Linaro communication channel.
I'll check with other testsuites.

The commit message keeps mentioning Cortex-A8 and Cortex-A9 together, but the MLx expansion pass is only enabled for Cortex-A9. If you have some performance numbers that look good for Cortex-A8, it would be a good idea to enable the pass for it too (just add FeatureExpandMLx to it in ARM.td).

Yes, you are right. The MLx expansion pass is only for Cortex-A9. This is a copy-paste typo. I investigated the issue in May. At that time Cortex-A8 and Cortex-A9 were used in ARMDAGToDAGISel::hasNoVMLxHazardUse. Now they are not used because of your change to use features instead. I didn't notice this change when I updated my workspace to the latest trunk. So I used Cortex-A9 and Cortex-A8.

I have no performance data for Cortex-A8. We don't track Cortex-A8 only Cortex-A9. I'll try to find a Cortex-A8 board to check performance impact. First I'll check that the same problem has existed for Cortex-A8: stopped using VMLx instructions.

I think the commit message is a bit TL;DR, could you condense it a bit? (e.g. keep only the relevant assembly snippets instead of the whole output)

I use arc to submit changes for review but I use svn to commit the changes. So the commit message will be shorter.

Thanks,
Evgeny

eastig added inline comments.Oct 5 2016, 6:40 AM

lib/Target/ARM/ARMISelDAGToDAG.cpp
421	The comment on the function is out of date and does not reflect the latest changes.
460	Yes, it should work according to the documentation. I'll add support of these cases and try to check if forwarding works.
lib/Target/ARM/MLxExpansionPass.cpp
70	No. Thank you for reminding about this. I always forget about clang-format.
235	I agree with you it does not look good. I usually add flags if values are needed more than once. I'll add a flag.
test/CodeGen/ARM/fml.ll
2 ↗	(On Diff #73445)	I'll do.
4 ↗	(On Diff #73445)	I'll do.
19 ↗	(On Diff #73445)	The purpose of the snippet is to use defined values. I'll make it simpler.

Updated according to the comments.

I ran the LNT testsuite on Cortex-A9.

The results are:

Performance Regressions - Execution Time

SingleSource/Benchmarks/Misc/himenobmtxpa	3.23%	VMLx. No obvious reason what caused this.
MultiSource/Benchmarks/McCat/09-vor/vor	3.08%	Don't see any VMLx related things
SingleSource/Benchmarks/Misc-C++/Large/ray	1.54%	VMLx
SingleSource/Benchmarks/Misc/mandel	1.53%	Possibly VMLx
MultiSource/Benchmarks/Olden/em3d/em3d	1.31%	Don't see any VMLx related things

Performance Improvements - Execution Time

SingleSource/Benchmarks/Misc/matmul_f64_4x4	-28.57%	VMLx
MultiSource/Benchmarks/ASC_Sequoia/IRSmk/IRSmk	-22.43%	VMLx
SingleSource/Benchmarks/Misc/ffbench	-16.02%	VMLx
MultiSource/Benchmarks/TSVC/LoopRerolling-flt/LoopRerolling-flt	-6.25%	VMLx
MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame	-6.15%	VMLx
SingleSource/Benchmarks/Misc/oourafft	-5.84%	VMLx
SingleSource/Benchmarks/Misc/pi	-5.39%	VMLx
MultiSource/Benchmarks/TSVC/LoopRerolling-dbl/LoopRerolling-dbl	-4.85%	VMLx
SingleSource/Benchmarks/Misc-C++/mandel-text	-3.41%	VMLx
MultiSource/Benchmarks/Bullet/bullet	-2.68%	VMLx
MultiSource/Benchmarks/FreeBench/pifft/pifft	-2.30%	VMLx
MultiSource/Benchmarks/TSVC/CrossingThresholds-dbl/CrossingThresholds-dbl	-2.07%	VMLx
SingleSource/Benchmarks/CoyoteBench/fftbench	-2.06%	VMLx
MultiSource/Benchmarks/FreeBench/distray/distray	-1.94%	VMLx
MultiSource/Benchmarks/SciMark2-C/scimark2	-1.90%	VMLx
SingleSource/Benchmarks/Misc-C++/oopack_v1p8	-1.70%	VMLx
MultiSource/Benchmarks/Olden/tsp/tsp	-1.41%	VMLx
MultiSource/Benchmarks/ASCI_Purple/SMG2000/smg2000	-1.34%	VMLx

Thanks for getting the performance results.
What about the vector support?

lib/Target/ARM/ARMISelDAGToDAG.cpp
443	This is now checking only that the node can be lowered to VMLx. What happened to the part checking if forwarding can be used (i.e. mac following multiply or mac) and all the other checks?

What about the vector support?

Currently vector VMLx instructions are expanded by MLxExpansionPass. I am running the LNT testsuite to check if there is any performance gain when they are not expanded. I don't know if accumulator forwarding is used for vector VMLx. There is no such note for vector VMLx as for VFP VMLx.

lib/Target/ARM/ARMISelDAGToDAG.cpp
443	I removed FMA checks because of the following reasons: FMA is lowered to either VFMA or a library call if a target does not support VFPv4. I have not found any information about accumulator forwarding for VFMA. I removed other checks because I could not write tests for them. Are there cases when they are false?

eastig added inline comments.Oct 12 2016, 5:51 AM

lib/Target/ARM/ARMISelDAGToDAG.cpp

443

Another point is that ARMDAGToDAGISel::hasNoVMLxHazardUse is only called when instructions are combined into VMLx:

ARMInstrInfo.td:

// An 'fadd' node which checks for single non-hazardous use.
def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
  return hasNoVMLxHazardUse(N);
}]>;

// An 'fsub' node which checks for single non-hazardous use.
def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
  return hasNoVMLxHazardUse(N);
}]>;

rovka added inline comments.Oct 12 2016, 6:31 AM

lib/Target/ARM/ARMISelDAGToDAG.cpp
443	Ok, the FMA thing sounds reasonable. Some of the other checks are ok to remove, but from what I understand now you're only checking that you have a FADD/FSUB with a FMUL as its operand - this will be lowered to a VMLx, but in order to care about accumulator forwarding you need another VMUL/VMLA. The way it is written now, it will return true from hasNoVMLxHazardUse without actually looking at the uses, which may be bad if the use is some other NEON fp instruction (unless I'm missing something).

eastig added inline comments.Oct 12 2016, 8:43 AM

lib/Target/ARM/ARMISelDAGToDAG.cpp
443	I check that both operands are FMUL. See example: a = ISD::FMUL b = ISD::FMUL c, d ...= ISD::FADD a, b is transformed into a = ARM::VMUL ...= ARM::VMLA a, c, d Accumulator forwarding is used for 'a'. You are right some uses can be bad. Mixing VFP and SIMD instructions is not recommended (http://infocenter.arm.com/help/topic/com.arm.doc.ddi0409i/CHDEDCDC.html). I don't see any performance regressions in the LNT testsuite when there is a mix of VFP and SIMD instructions without using VMLx (current behaviour) and when VMLx instructions are used (my patch). So maybe in case of VFP we don't need to check uses. If I understand correctly the note from the page: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0409i/BCGDCIBA.html SIMD VMLx can have stalls depending on uses. It is not clear from the note if VFP instructions are affected. The current implementation checks both VFP and SIMD VMLx instructions. I think the current LNT run will show performance regressions when there are forwarding of SIMD accumulator to a VMLx instruction and bad uses after it. If there are performance regressions in case of SIMD VMLx then bad uses should be more important than accumulator forwarding.

I've got results of the LNT run when SIMD VMLx are not expanded. I checked my changes with vmlx-fwd.ll from here. It's strange but there are no performance changes.

Hi Diana,

Based on the results of the LNT runs I think the code checking accumulator forwarding is not needed at all. If I am correct only SIMD VMLx instructions can have issues. So we should check only them.
My thoughts:

We can have features: HasSIMDVMLxHazards and HasVFPVMLxHazards. HasVMLxHazards can be built on them.
The current checks in MLxExpansion and in ARMDAGToDAGISel::canUseVMLxForwarding are used only for SIMD instructions.

What do you think?

Thanks,
Evgeny

In D25020#569200, @eastig wrote:

Hi Diana,

Based on the results of the LNT runs I think the code checking accumulator forwarding is not needed at all.

Which code? You're checking accumulator forwarding both in ISel and in MLxExpansion.

If I am correct only SIMD VMLx instructions can have issues. So we should check only them.
My thoughts:

We can have features: HasSIMDVMLxHazards and HasVFPVMLxHazards. HasVMLxHazards can be built on them.

The VMLxHazards feature is enabled for Cortex-A7, A8, A9 and Swift. I think we need a better picture of the differences in behaviour between them before we rush to create more features.

The current checks in MLxExpansion and in ARMDAGToDAGISel::canUseVMLxForwarding are used only for SIMD instructions.

Regarding the ISel changes: those will be enabled for Cortex-A7, A8 and A9 (the intersection between HasVMLxHazards and HasVMLxForwarding). I'm a bit wary of making those changes without more benchmarking on A7 and A8 at least.

Thanks,
Diana

lib/Target/ARM/ARMISelDAGToDAG.cpp
443	Oops, sorry, I read that as an || instead of &&. Anyway, you're missing the vmla - vmla case, where the fadd should have a fmul and a fadd (and this fadd should have a fmul operand itself).

In D25020#570269, @rovka wrote:

In D25020#569200, @eastig wrote:

Hi Diana,

Based on the results of the LNT runs I think the code checking accumulator forwarding is not needed at all.

Which code? You're checking accumulator forwarding both in ISel and in MLxExpansion.

I mean both checks in ISel and MLxExpansion.

I traced the history of the code and discussions related to it:
http://lists.llvm.org/pipermail/llvm-dev/2013-February/059201.html
http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html

What I've read changes the situation a little bit.
A benchmark suffered from VMLx was MILC from SPEC 2006. There were some other benchmarks but their names were not mentioned. MILC uses VFP instructions. So the problem is not SIMD specific. I'll run SPEC 2006 to check if it's still an issue.

If I am correct only SIMD VMLx instructions can have issues. So we should check only them.
My thoughts:

We can have features: HasSIMDVMLxHazards and HasVFPVMLxHazards. HasVMLxHazards can be built on them.

The VMLxHazards feature is enabled for Cortex-A7, A8, A9 and Swift. I think we need a better picture of the differences in behaviour between them before we rush to create more features.

Based on the new facts, I agree with you.

The current checks in MLxExpansion and in ARMDAGToDAGISel::canUseVMLxForwarding are used only for SIMD instructions.

Regarding the ISel changes: those will be enabled for Cortex-A7, A8 and A9 (the intersection between HasVMLxHazards and HasVMLxForwarding). I'm a bit wary of making those changes without more benchmarking on A7 and A8 at least.

The problem is to get Cortex-A7 and Cortex-A8 hardware. We have a bare-metal Cortex-A8 board but no Cortex-A7.

So a question is: what to do if accumulator forwarding and a data hazard are detected?
The patch gives a preference for the accumulator forwarding.
From the past discussions I see the performance problems were with VFP code. This explains why I haven't seen any changes when I added support of SIMD to the patch (not published yet). It might mean it's worth adding support of SIMD to the patch.

Thanks,
Evgeny

I have a Cortex-A8 and I think Renato might have a Cortex-A7. I'm going to be away for the next 2 weeks, so I won't be able to help much now, but maybe when I get back we can sort out what hardware we have between us and run a few tests.

eastig added inline comments.Oct 14 2016, 8:26 AM

lib/Target/ARM/ARMISelDAGToDAG.cpp
443	I was thinking of this but I decided not to implement it till it gets clear what to do when there are accumulator forwarding and a data hazard at the same time. I'll make a test run to see performance impact of this.

In D25020#570505, @rovka wrote:

I have a Cortex-A8 and I think Renato might have a Cortex-A7. I'm going to be away for the next 2 weeks, so I won't be able to help much now, but maybe when I get back we can sort out what hardware we have between us and run a few tests.

SGTM. Meanwhile I will try to use our C-A8 board.

Hi Evgeny,

I think the best thing to do right now is to check the documentation, prepare a plan, and then test on the different cores.

On the manuals [1], I could only find cycle instructions for A8[2] and A9[3,4], but not for the others. So we'll have to assume something and test on the cores.

I imagine that A8's model is followed by A7 (in-order cores), while A9's model is followed by A15 and Krait (OOO cores), but we'll have to make sure our assumptions are correct. Benchmarks may be too big, so we could try running fabricated snippets of VMUL/VADD/VMLA with and without dependency in tight loops, as they should yield big differences on different cores.

Another alternative would be to control MLxHazard and MLxForwarding via flags and run Benchmarks/Misc/matmul_f64_4x4, which seems to be the biggest difference of them all.

We have Cortex-A7 (RPi2), A8 (Beagle), A9 (Panda), A15 (Chromebooks) and Krait (Dragon). We'd only be missing Swift to make sure we have covered all relevant cores.

I think the best course of action now is to combine forces on coding and testing and come up with a concrete solution based on real data to apply those feature flags in the right cores. Right now, the situation is a big mess and I don't want to make it worse.

cheers,
--renato

[1] http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.set.cortexa/index.html
[2] http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0344k/BCGDCECC.html
[3] http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0409i/BCGJIBBD.html
[4] http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0409i/BCGDCIBA.html

Hi Renato,

I agree with you we have some kind of mess right now. I'll be back with some plan soon.
Should we continue the discussion here or move it to the llvm-dev list?

Thanks,
Evgeny

In D25020#570535, @eastig wrote:

I agree with you we have some kind of mess right now. I'll be back with some plan soon.
Should we continue the discussion here or move it to the llvm-dev list?

Better to sync on the list, as phab is not really great at email threads. :)

cheers,
--renato

Hi Renato,

I have not forgotten about this work.
I am very busy with some urgent tasks at the moment.

Thanks,
Evgeny

Revision Contents

Path

Size

lib/

Target/

ARM/

ARMISelDAGToDAG.cpp

34 lines

MLxExpansionPass.cpp

84 lines

test/

CodeGen/

ARM/

fmacs.ll

6 lines

vmlx-fwd.ll

57 lines

Diff 74250

lib/Target/ARM/ARMISelDAGToDAG.cpp

Show First 20 Lines • Show All 278 Lines • ▼ Show 20 Lines	private:
/// is, and assigns to PowerOfTwo the power of two that should be extracted		/// is, and assigns to PowerOfTwo the power of two that should be extracted
/// out and to NewMulConst the new constant to be multiplied by.		/// out and to NewMulConst the new constant to be multiplied by.
bool canExtractShiftFromMul(const SDValue &N, unsigned MaxShift,		bool canExtractShiftFromMul(const SDValue &N, unsigned MaxShift,
unsigned &PowerOfTwo, SDValue &NewMulConst) const;		unsigned &PowerOfTwo, SDValue &NewMulConst) const;

/// Replace N with M in CurDAG, in a way that also ensures that M gets		/// Replace N with M in CurDAG, in a way that also ensures that M gets
/// selected when N would have been selected.		/// selected when N would have been selected.
void replaceDAGValue(const SDValue &N, SDValue M);		void replaceDAGValue(const SDValue &N, SDValue M);

		bool canUseVMLxForwarding(const SDNode &N) const;
};		};
}		}

/// isInt32Immediate - This method tests to see if the node is a 32-bit constant		/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
/// operand. If so Imm will receive the 32-bit value.		/// operand. If so Imm will receive the 32-bit value.
static bool isInt32Immediate(SDNode *N, unsigned &Imm) {		static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {		if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
Imm = cast<ConstantSDNode>(N)->getZExtValue();		Imm = cast<ConstantSDNode>(N)->getZExtValue();
▲ Show 20 Lines • Show All 116 Lines • ▼ Show 20 Lines	N1 = CurDAG->getNode(ISD::AND, SDLoc(N1), MVT::i32,
Srl,		Srl,
CurDAG->getConstant(And_imm, SDLoc(Srl), MVT::i32));		CurDAG->getConstant(And_imm, SDLoc(Srl), MVT::i32));
N1 = CurDAG->getNode(ISD::SHL, SDLoc(N1), MVT::i32,		N1 = CurDAG->getNode(ISD::SHL, SDLoc(N1), MVT::i32,
N1, CurDAG->getConstant(TZ, SDLoc(Srl), MVT::i32));		N1, CurDAG->getConstant(TZ, SDLoc(Srl), MVT::i32));
CurDAG->UpdateNodeOperands(N, N0, N1);		CurDAG->UpdateNodeOperands(N, N0, N1);
}		}
}		}

		/// Check if VMLx accumulator forwarding can be used if the specified SDNode is
		rovkaUnsubmitted Not Done Reply Inline Actions I don't understand the first part of the comment, it's either incomplete or could use some rephrasing. rovka: I don't understand the first part of the comment, it's either incomplete or could use some…
		eastigAuthorUnsubmitted Not Done Reply Inline Actions The comment the function is out of date and does not reflect the latest changes. eastig: The comment the function is out of date and does not reflect the latest changes.
		/// lowered to a VMLx instruction.
		/// The specified SDNode can be lowered to a VMLx instruction if it is either
		/// ISD::FSUB or ISD::FADD using a result of ISD::FMUL.
		/// Special multiplier accumulator forwarding is used if a multiply-accumulate
		/// follows a multiply or another multiply-accumulate, and depends on the
		/// result of that first instruction.
		bool ARMDAGToDAGISel::canUseVMLxForwarding(const SDNode &N) const {
		if (!Subtarget->hasVMLxForwarding())
		return false;

		auto Op0Opcode = N.getOperand(0).getOpcode();
		auto Op1Opcode = N.getOperand(1).getOpcode();
		switch (N.getOpcode()) {
		default:
		return false;

		case ISD::FSUB:
		case ISD::FADD:
		if (Op0Opcode == ISD::FMUL && Op1Opcode == ISD::FMUL)
		return true;
		break;
		}
		rovkaUnsubmitted Not Done Reply Inline Actions This is now checking only that the node can be lowered to VMLx. What happened to the part checking if forwarding can be used (i.e. mac following multiply or mac) and all the other checks? rovka: This is now checking only that the node can be lowered to VMLx. What happened to the part…
		eastigAuthorUnsubmitted Not Done Reply Inline Actions I removed FMA checks because of the following reasons: FMA is lowered either VFMA or a library call if a target does not support VFPv4. I have not found any information about accumulator forwarding for VFMA. I removed other checks because I could not write tests for them. Are there cases when they are false? eastig: I removed FMA checks because of the following reasons: # FMA is lowered either VFMA or a…
		eastigAuthorUnsubmitted Not Done Reply Inline Actions Another point is that ARMDAGToDAGISel::hasNoVMLxHazardUse is only called instructions are combined into VMLx: ARMInstrInfo.td: // An 'fadd' node which checks for single non-hazardous use. def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{ return hasNoVMLxHazardUse(N); }]>; // An 'fsub' node which checks for single non-hazardous use. def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{ return hasNoVMLxHazardUse(N); }]>; eastig: Another point is that ARMDAGToDAGISel::hasNoVMLxHazardUse is only called instructions are…
		rovkaUnsubmitted Not Done Reply Inline Actions Ok, the FMA thing sounds reasonable. Some of the other checks are ok to remove, but from what I understand now you're only checking that you have a FADD/FSUB with a FMUL as its operand - this will be lowered to a VMLx, but in order to care about accumulator forwarding you need another VMUL/VMLA. The way it is written now, it will return true from hasNoVMLxHazardUse without actually looking at the uses, which may be bad if the use is some other NEON fp instruction (unless I'm missing something). rovka: Ok, the FMA thing sounds reasonable. Some of the other checks are ok to remove, but from what I…
		eastigAuthorUnsubmitted Not Done Reply Inline Actions I check that both operands are FMUL. See example: a = ISD::FMUL b = ISD::FMUL c, d ...= ISD::FADD a, b is transformed into a = ARM::VMUL ...= ARM::VMLA a, c, d Accumulator forwarding is used for 'a'. You are right some uses can be bad. Mixing VFP and SIMD instructions is not recommended (http://infocenter.arm.com/help/topic/com.arm.doc.ddi0409i/CHDEDCDC.html). I don't see any performance regressions in the LNT testsuite when there is a mix of VFP and SIMD instructions without using VMLx (current behaviour) and when VMLx instructions are used (my patch). So maybe in case of VFP we don't need to check uses. If I understand correctly the note from the page: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0409i/BCGDCIBA.html SIMD VMLx can have stalls depending on uses. It is not clear from the note if VFP instructions are affected. The current implementation checks both VFP and SIMD VMLx instructions. I think the current LNT run will show performance regressions when there are forwarding of SIMD accumulator to a VMLx instruction and bad uses after it. If there are performance regressions in case of SIMD VMLx then bad uses should be more important than accumulator forwarding. eastig: I check that both operands are FMUL. See example: ``` a = ISD::FMUL b = ISD::FMUL c, d ...=…
		rovkaUnsubmitted Not Done Reply Inline Actions Oops, sorry, I read that as an \|\| instead of &&. Anyway, you're missing the vmla - vmla case, where the fadd should have a fmul and a fadd (and this fadd should have a fmul operand itself). rovka: Oops, sorry, I read that as an \|\| instead of &&. Anyway, you're missing the vmla - vmla case…
		eastigAuthorUnsubmitted Not Done Reply Inline Actions I was thinking of this but I decided not to implement it till it gets clear what to do when there are accumulator forwarding and a data hazard at the same time. I'll make a test run to see performance impact of this. eastig: I was thinking of this but I decided not to implement it till it gets clear what to do when…

		return false;
		}

/// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS		/// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS
/// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at		/// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at
/// least on current ARM implementations) which should be avoidded.		/// least on current ARM implementations) which should be avoided.
		rovkaUnsubmitted Not Done Reply Inline Actions This isn't related, and since it's a typo fix you can just push it without review. rovka: This isn't related, and since it's a typo fix you can just push it without review.
bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {		bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
if (OptLevel == CodeGenOpt::None)		if (OptLevel == CodeGenOpt::None)
return true;		return true;

if (!Subtarget->hasVMLxHazards())		if (!Subtarget->hasVMLxHazards())
return true;		return true;

		if (canUseVMLxForwarding(*N))
		return true;

		rovkaUnsubmitted Not Done Reply Inline Actions Why return false for vectors? AFAICT the Cortex-A9 manual says accumulator forwarding should apply for VQDMLA/VQDMLS. I see this assumption everywhere in the patch - maybe there should be a comment somewhere explaining it? rovka: Why return false for vectors? AFAICT the Cortex-A9 manual says accumulator forwarding should…
		eastigAuthorUnsubmitted Not Done Reply Inline Actions Yes, it should work according to the documentation. I'll add support for these cases and try to check if forwarding works. eastig: Yes, it should work according to the documentation. I'll add support for these cases and try to…
if (!N->hasOneUse())		if (!N->hasOneUse())
return false;		return false;

SDNode Use = N->use_begin();		SDNode Use = N->use_begin();
if (Use->getOpcode() == ISD::CopyToReg)		if (Use->getOpcode() == ISD::CopyToReg)
return true;		return true;
if (Use->isMachineOpcode()) {		if (Use->isMachineOpcode()) {
const ARMBaseInstrInfo TII = static_cast<const ARMBaseInstrInfo >(		const ARMBaseInstrInfo TII = static_cast<const ARMBaseInstrInfo >(
▲ Show 20 Lines • Show All 4,070 Lines • Show Last 20 Lines

lib/Target/ARM/MLxExpansionPass.cpp

Show All 24 Lines
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"		#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"		#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;		using namespace llvm;

#define DEBUG_TYPE "mlx-expansion"		#define DEBUG_TYPE "mlx-expansion"

static cl::opt<bool>		static cl::opt<bool>
ForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden);		ForceExpand("expand-all-fp-mlx", cl::init(false), cl::Hidden);
static cl::opt<unsigned>		static cl::opt<unsigned>
ExpandLimit("expand-limit", cl::init(~0U), cl::Hidden);		ExpandLimit("expand-limit", cl::init(~0U), cl::Hidden);

STATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded");		STATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded");

namespace {		namespace {
struct MLxExpansion : public MachineFunctionPass {		struct MLxExpansion : public MachineFunctionPass {
static char ID;		static char ID;
MLxExpansion() : MachineFunctionPass(ID) {}		MLxExpansion() : MachineFunctionPass(ID) {}

bool runOnMachineFunction(MachineFunction &Fn) override;		bool runOnMachineFunction(MachineFunction &Fn) override;

StringRef getPassName() const override {		StringRef getPassName() const override {
return "ARM MLA / MLS expansion pass";		return "ARM MLA / MLS expansion pass";
}		}

private:		private:
const ARMBaseInstrInfo *TII;		const ARMBaseInstrInfo *TII;
const TargetRegisterInfo *TRI;		const TargetRegisterInfo *TRI;
		const ARMSubtarget *STI;
MachineRegisterInfo *MRI;		MachineRegisterInfo *MRI;

bool isLikeA9;		bool isLikeA9;
bool isSwift;		bool isSwift;
unsigned MIIdx;		unsigned MIIdx;
MachineInstr* LastMIs[4];		MachineInstr* LastMIs[4];
SmallPtrSet<MachineInstr*, 4> IgnoreStall;		SmallPtrSet<MachineInstr*, 4> IgnoreStall;
		SmallPtrSet<MachineInstr*, 4> AccForwarding;

void clearStack();		void clearStack();
void pushStack(MachineInstr *MI);		void pushStack(MachineInstr *MI);
MachineInstr getAccDefMI(MachineInstr MI) const;		MachineInstr getAccDefMI(MachineInstr MI) const;
unsigned getDefReg(MachineInstr *MI) const;		unsigned getDefReg(MachineInstr *MI) const;
bool hasLoopHazard(MachineInstr *MI) const;		bool hasLoopHazard(MachineInstr *MI) const;
bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;		bool hasRAWHazard(MachineInstr MI, MachineInstr NextMI) const;
		bool canUseVMLxForwarding(MachineInstr MI, MachineInstr AccDef) const;
bool FindMLxHazard(MachineInstr *MI);		bool FindMLxHazard(MachineInstr *MI);
		rovkaUnsubmitted Not Done Reply Inline Actions Did you run clang-format on this? rovka: Did you run clang-format on this?
		eastigAuthorUnsubmitted Not Done Reply Inline Actions No. Thank you for reminding me about this. I always forget about clang-format. eastig: No. Thank you for reminding me about this. I always forget about clang-format.
void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,		void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
unsigned MulOpc, unsigned AddSubOpc,		unsigned MulOpc, unsigned AddSubOpc,
bool NegAcc, bool HasLane);		bool NegAcc, bool HasLane);
bool ExpandFPMLxInstructions(MachineBasicBlock &MBB);		bool ExpandFPMLxInstructions(MachineBasicBlock &MBB);
};		};
char MLxExpansion::ID = 0;		char MLxExpansion::ID = 0;
}		}

▲ Show 20 Lines • Show All 101 Lines • ▼ Show 20 Lines	outer_continue:
}		}

break;		break;
}		}

return DefMI == MI;		return DefMI == MI;
}		}

bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {		bool MLxExpansion::hasRAWHazard(MachineInstr MI, MachineInstr NextMI) const {
		unsigned Reg = getDefReg(MI);
// FIXME: Detect integer instructions properly.		// FIXME: Detect integer instructions properly.
const MCInstrDesc &MCID = MI->getDesc();		const MCInstrDesc &MCID = NextMI->getDesc();
unsigned Domain = MCID.TSFlags & ARMII::DomainMask;		unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
if (MI->mayStore())		if (NextMI->mayStore())
return false;		return false;
unsigned Opcode = MCID.getOpcode();		unsigned Opcode = MCID.getOpcode();
if (Opcode == ARM::VMOVRS \|\| Opcode == ARM::VMOVRRD)		if (Opcode == ARM::VMOVRS \|\| Opcode == ARM::VMOVRRD)
return false;		return false;
if ((Domain & ARMII::DomainVFP) \|\| (Domain & ARMII::DomainNEON))		if (Domain & ARMII::DomainNEON)
return MI->readsRegister(Reg, TRI);		return NextMI->readsRegister(Reg, TRI);
		else if (Domain & ARMII::DomainVFP)
		return NextMI->readsRegister(Reg, TRI) && !AccForwarding.count(MI);

return false;		return false;
}		}

static bool isFpMulInstruction(unsigned Opcode) {		static bool isFpMulInstruction(unsigned Opcode) {
switch (Opcode) {		switch (Opcode) {
case ARM::VMULS:		case ARM::VMULS:
case ARM::VMULfd:		case ARM::VMULfd:
case ARM::VMULfq:		case ARM::VMULfq:
case ARM::VMULD:		case ARM::VMULD:
case ARM::VMULslfd:		case ARM::VMULslfd:
case ARM::VMULslfq:		case ARM::VMULslfq:
return true;		return true;
default:		default:
return false;		return false;
}		}
}		}

		/// Check if VMLx accumulator forwarding can be used from the instruction
		/// AccDef defining the accumulator to the VMLx instruction MI using it.
		/// Special multiplier accumulator forwarding is used if a multiply-accumulate
		/// follows a multiply or another multiply-accumulate, and depends on the
		/// result of that first instruction.
		bool MLxExpansion::canUseVMLxForwarding(MachineInstr *MI,
		MachineInstr *AccDef) const {
		assert(STI);
		assert(MI);
		assert(AccDef);
		assert(TII->isFpMLxInstruction(MI->getOpcode()));

		if (!STI->hasVMLxForwarding())
		return false;

		const auto AccDefOpcode = AccDef->getOpcode();
		rovkaUnsubmitted Not Done Reply Inline Actions This looks a bit awkward and it doesn't seem to be on a very cold path either - maybe add a private flag and set it in runOnMachineFunction? rovka: This looks a bit awkward and it doesn't seem to be on a very cold path either - maybe add a…
		eastigAuthorUnsubmitted Not Done Reply Inline Actions I agree with you it does not look good. I usually add flags if values are needed more than once. I'll add a flag. eastig: I agree with you it does not look good. I usually add flags if values are needed more than once.
		switch (MI->getOpcode()) {
		default:
		return false;

		case ARM::VMLAS:
		case ARM::VMLSS:
		switch (AccDefOpcode) {
		default:
		return false;

		case ARM::VMLAS:
		case ARM::VMLSS:
		case ARM::VMULS:
		return true;
		}
		break;

		case ARM::VMLAD:
		case ARM::VMLSD:
		switch (AccDefOpcode) {
		default:
		return false;

		case ARM::VMLAD:
		case ARM::VMLSD:
		case ARM::VMULD:
		return true;
		}
		break;
		}

		return false;
		}

bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {		bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
if (NumExpand >= ExpandLimit)		if (NumExpand >= ExpandLimit)
return false;		return false;

if (ForceExapnd)		if (ForceExpand)
return true;		return true;

MachineInstr *DefMI = getAccDefMI(MI);		MachineInstr *DefMI = getAccDefMI(MI);

		if (canUseVMLxForwarding(MI, DefMI)) {
		AccForwarding.insert(DefMI);
		return false;
		}

if (TII->isFpMLxInstruction(DefMI->getOpcode())) {		if (TII->isFpMLxInstruction(DefMI->getOpcode())) {
// r0 = vmla		// r0 = vmla
// r3 = vmla r0, r1, r2		// r3 = vmla r0, r1, r2
// takes 16 - 17 cycles		// takes 16 - 17 cycles
//		//
// r0 = vmla		// r0 = vmla
// r4 = vmul r1, r2		// r4 = vmul r1, r2
// r3 = vadd r0, r4		// r3 = vadd r0, r4
Show All 25 Lines	if (!NextMI)
continue;		continue;

if (TII->canCauseFpMLxStall(NextMI->getOpcode())) {		if (TII->canCauseFpMLxStall(NextMI->getOpcode())) {
if (i <= Limit1)		if (i <= Limit1)
return true;		return true;
}		}

// Look for VMLx RAW hazard.		// Look for VMLx RAW hazard.
if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI))		if (i <= Limit2 && hasRAWHazard(MI, NextMI))
return true;		return true;
}		}

return false;		return false;
}		}

/// ExpandFPMLxInstructions - Expand a MLA / MLS instruction into a pair		/// ExpandFPMLxInstructions - Expand a MLA / MLS instruction into a pair
/// of MUL + ADD / SUB instructions.		/// of MUL + ADD / SUB instructions.
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
++NumExpand;		++NumExpand;
}		}

bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) {		bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) {
bool Changed = false;		bool Changed = false;

clearStack();		clearStack();
IgnoreStall.clear();		IgnoreStall.clear();
		AccForwarding.clear();

unsigned Skip = 0;		unsigned Skip = 0;
MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend();		MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend();
while (MII != E) {		while (MII != E) {
MachineInstr MI = &MII++;		MachineInstr MI = &MII++;

if (MI->isPosition() \|\| MI->isImplicitDef() \|\| MI->isCopy())		if (MI->isPosition() \|\| MI->isImplicitDef() \|\| MI->isCopy())
continue;		continue;
Show All 31 Lines

bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {		bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
if (skipFunction(*Fn.getFunction()))		if (skipFunction(*Fn.getFunction()))
return false;		return false;

TII = static_cast<const ARMBaseInstrInfo *>(Fn.getSubtarget().getInstrInfo());		TII = static_cast<const ARMBaseInstrInfo *>(Fn.getSubtarget().getInstrInfo());
TRI = Fn.getSubtarget().getRegisterInfo();		TRI = Fn.getSubtarget().getRegisterInfo();
MRI = &Fn.getRegInfo();		MRI = &Fn.getRegInfo();
const ARMSubtarget *STI = &Fn.getSubtarget<ARMSubtarget>();		STI = &Fn.getSubtarget<ARMSubtarget>();
if (!STI->expandMLx())		if (!STI->expandMLx())
return false;		return false;
isLikeA9 = STI->isLikeA9() \|\| STI->isSwift();		isLikeA9 = STI->isLikeA9() \|\| STI->isSwift();
isSwift = STI->isSwift();		isSwift = STI->isSwift();

bool Modified = false;		bool Modified = false;
for (MachineBasicBlock &MBB : Fn)		for (MachineBasicBlock &MBB : Fn)
Modified \|= ExpandFPMLxInstructions(MBB);		Modified \|= ExpandFPMLxInstructions(MBB);

return Modified;		return Modified;
}		}

FunctionPass *llvm::createMLxExpansionPass() {		FunctionPass *llvm::createMLxExpansionPass() {
return new MLxExpansion();		return new MLxExpansion();
}		}

test/CodeGen/ARM/fmacs.ll

	Show First 20 Lines • Show All 83 Lines • ▼ Show 20 Lines
	; A8-LABEL: t5:			; A8-LABEL: t5:
	; A8: vmul.f32			; A8: vmul.f32
	; A8: vmul.f32			; A8: vmul.f32
	; A8: vadd.f32			; A8: vadd.f32
	; A8: vadd.f32			; A8: vadd.f32

	; A9-LABEL: t5:			; A9-LABEL: t5:
	; A9: vmla.f32			; A9: vmla.f32
	; A9: vmul.f32			; A9: vmla.f32
	; A9: vadd.f32

	; HARD-LABEL: t5:			; HARD-LABEL: t5:
	; HARD: vmla.f32 s4, s0, s1			; HARD: vmla.f32 s4, s0, s1
	; HARD: vmul.f32 s0, s2, s3			; HARD: vmla.f32 s4, s2, s3
	; HARD: vadd.f32 s0, s4, s0
	%0 = fmul float %a, %b			%0 = fmul float %a, %b
	%1 = fadd float %e, %0			%1 = fadd float %e, %0
	%2 = fmul float %c, %d			%2 = fmul float %c, %d
	%3 = fadd float %1, %2			%3 = fadd float %1, %2
	ret float %3			ret float %3
	}			}

test/CodeGen/ARM/vmlx-fwd.ll

This file was added.

				; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - \| FileCheck %s -check-prefix=SCALAR
				; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - \| FileCheck %s -check-prefix=VECTOR
				; RUN: llc -mtriple=arm-eabi -mcpu=swift %s -o - \| FileCheck %s -check-prefix=SWIFT

				; SWIFT-LABEL: test1:
				; SWIFT-NOT: vml{{.*}}

				; SCALAR-LABEL: test1:
				define double @test1(double %a, double %b, double %c, double %d) {
				%1 = fmul double %a, %c
				%2 = fmul double %b, %d
				%3 = fsub double %1, %2

				%4 = fmul double %a, %d
				%5 = fmul double %b, %c
				%6 = fadd double %5, %4
				; SCALAR: vml{{[as]}}.f64 {{.*}}
				; SCALAR: vml{{[as]}}.f64 {{.*}}

				%7 = fsub double %3, %6

				ret double %7
				}

				; SCALAR-LABEL: test2:
				define float @test2(float %a, float %b, float %c, float %d) {
				%1 = fmul float %a, %c
				%2 = fmul float %b, %d
				%3 = fsub float %1, %2

				%4 = fmul float %a, %d
				%5 = fmul float %b, %c
				%6 = fadd float %5, %4
				; SCALAR: vml{{[as]}}.f32 {{.*}}
				; SCALAR: vml{{[as]}}.f32 {{.*}}

				%7 = fsub float %3, %6

				ret float %7
				}

				; VECTOR-LABEL: test3
				; VECTOR-NOT: vml{{.*}}
				define <2 x float> @test3(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) {
				%1 = fmul <2 x float> %a, %c
				%2 = fmul <2 x float> %b, %d
				%3 = fsub <2 x float> %1, %2

				%4 = fmul <2 x float> %a, %d
				%5 = fmul <2 x float> %b, %c
				%6 = fadd <2 x float> %5, %4

				%7 = fsub <2 x float> %6, %3

				ret <2 x float> %7
				}

This is an archive of the discontinued LLVM Phabricator instance.

[ARM] Fix 26% performance regression on Cortex-A9 caused by not using VMLA/VMLS — Needs Review, Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 74250

lib/Target/ARM/ARMISelDAGToDAG.cpp

lib/Target/ARM/MLxExpansionPass.cpp

test/CodeGen/ARM/fmacs.ll

test/CodeGen/ARM/vmlx-fwd.ll

[ARM] Fix 26% performance regression on Cortex-A9 caused by not using VMLA/VMLS
Needs ReviewPublic