Hello,
Please review the patch that fuses MUL+ADDSUB operations into FMADDSUB
when AVX2 is available.
MUL+ADDSUB sequences are often generated by LLVM (with the -ffast-math flag)
for complex multiplication.
C code:
#include <complex.h>
_Complex double a, b, dst;
void cmul() {
dst = a * b;
}
asm without patch:
vmovupd b(%rip), %xmm0
vmovddup a(%rip), %xmm1 # xmm1 = mem[0,0]
vmulpd %xmm1, %xmm0, %xmm1 <<<<<<<<<<<<<<<<<<<<<<<
vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
vmovddup a+8(%rip), %xmm2 # xmm2 = mem[0,0]
vmulpd %xmm2, %xmm0, %xmm0
vaddsubpd %xmm0, %xmm1, %xmm0 <<<<<<<<<<<<<<<
vmovupd %xmm0, dst(%rip)
asm with the patch:
vmovupd b(%rip), %xmm0
vmovddup a(%rip), %xmm1 # xmm1 = mem[0,0]
vpermilpd $1, %xmm0, %xmm2 # xmm2 = xmm0[1,0]
vmovddup a+8(%rip), %xmm3 # xmm3 = mem[0,0]
vmulpd %xmm3, %xmm2, %xmm2
vfmaddsub231pd %xmm1, %xmm0, %xmm2 <<<<<<<<<<<<<<<<<<<
vmovupd %xmm2, dst(%rip)
Thank you,
Vyacheslav Klochkov