diff --git a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
--- a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
+++ b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
@@ -6,6 +6,7 @@
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
 declare float @llvm.fma.f32(float, float, float)
 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
 
 ; This test checks combinations of FNEG and FMA intrinsics
 
@@ -148,3 +149,114 @@
   %t3 = tail call nsz <8 x float> @llvm.fma.v8f32(<8 x float> %t2, <8 x float> %b, <8 x float> %c)
   ret <8 x float> %t3
 }
+
+define <4 x double> @test9(<4 x double> %a) {
+; X32-LABEL: test9:
+; X32: # %bb.0:
+; X32-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X32-NEXT: vbroadcastsd {{.*#+}} ymm2 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; X32-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
+; X32-NEXT: retl
+;
+; X64-LABEL: test9:
+; X64: # %bb.0:
+; X64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; X64-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
+; X64-NEXT: retq
+  %t = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>)
+  ret <4 x double> %t
+}
+
+define <4 x double> @test10(<4 x double> %a, <4 x double> %b) {
+; X32-LABEL: test10:
+; X32: # %bb.0:
+; X32-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X32-NEXT: vfmadd213pd {{.*#+}} ymm2 = (ymm0 * ymm2) + ymm1
+; X32-NEXT: vbroadcastsd {{.*#+}} ymm3 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; X32-NEXT: vfmadd213pd {{.*#+}} ymm3 = (ymm0 * ymm3) + ymm1
+; X32-NEXT: vaddpd %ymm3, %ymm2, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test10:
+; X64: # %bb.0:
+; X64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vfmadd213pd {{.*#+}} ymm2 = (ymm0 * ymm2) + ymm1
+; X64-NEXT: vbroadcastsd {{.*#+}} ymm3 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; X64-NEXT: vfmadd213pd {{.*#+}} ymm3 = (ymm0 * ymm3) + ymm1
+; X64-NEXT: vaddpd %ymm3, %ymm2, %ymm0
+; X64-NEXT: retq
+  %t0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>, <4 x double> %b)
+  %t1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double> %b)
+  %t2 = fadd <4 x double> %t0, %t1
+  ret <4 x double> %t2
+}
+
+define <4 x double> @test11(<4 x double> %a) {
+; X32-LABEL: test11:
+; X32: # %bb.0:
+; X32-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; X32-NEXT: vaddpd %ymm1, %ymm0, %ymm2
+; X32-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X32-NEXT: vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm2) + ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test11:
+; X64: # %bb.0:
+; X64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm2
+; X64-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm2) + ymm0
+; X64-NEXT: retq
+  %t0 = fadd <4 x double> %a, <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+  %t1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %t0, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>)
+  ret <4 x double> %t1
+}
+
+define <4 x double> @test12(<4 x double> %a) {
+; X32-LABEL: test12:
+; X32: # %bb.0:
+; X32-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; X32-NEXT: vaddpd %ymm1, %ymm0, %ymm2
+; X32-NEXT: vbroadcastsd {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X32-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; X32-NEXT: vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm2) + ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test12:
+; X64: # %bb.0:
+; X64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm2
+; X64-NEXT: vbroadcastsd {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; X64-NEXT: vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm2) + ymm0
+; X64-NEXT: retq
+  %t0 = fadd <4 x double> %a, <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+  %t1 = fadd <4 x double> %a, <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>
+  %t2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %t0, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double> %t1)
+  ret <4 x double> %t2
+}
+
+define <4 x double> @test13(<4 x double> %a, <4 x double> %b) {
+; X32-LABEL: test13:
+; X32: # %bb.0:
+; X32-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X32-NEXT: vbroadcastsd {{.*#+}} ymm3 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; X32-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm3 * ymm0) + ymm2
+; X32-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm2
+; X32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test13:
+; X64: # %bb.0:
+; X64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastsd {{.*#+}} ymm3 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; X64-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm3 * ymm0) + ymm2
+; X64-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm2
+; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %t0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>)
+  %t1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %b, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>)
+  %t2 = fadd <4 x double> %t0, %t1
+  ret <4 x double> %t2
+}
diff --git a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll
--- a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll
+++ b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll
@@ -126,5 +126,108 @@
   ret float %nfma
 }
+
+define <4 x double> @negated_constant_v4f64(<4 x double> %a) {
+; FMA3-LABEL: negated_constant_v4f64:
+; FMA3: # %bb.0:
+; FMA3-NEXT: vmovapd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; FMA3-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: negated_constant_v4f64:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; FMA4-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) + mem
+; FMA4-NEXT: retq
+  %t = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>)
+  ret <4 x double> %t
+}
+
+define <4 x double> @negated_constant_v4f64_2fmas(<4 x double> %a, <4 x double> %b) {
+; FMA3-LABEL: negated_constant_v4f64_2fmas:
+; FMA3: # %bb.0:
+; FMA3-NEXT: vmovapd {{.*#+}} ymm2 = <-5.0E-1,u,-5.0E-1,-5.0E-1>
+; FMA3-NEXT: vfmadd213pd {{.*#+}} ymm2 = (ymm0 * ymm2) + ymm1
+; FMA3-NEXT: vfmadd231pd {{.*#+}} ymm1 = (ymm0 * mem) + ymm1
+; FMA3-NEXT: vaddpd %ymm1, %ymm2, %ymm0
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: negated_constant_v4f64_2fmas:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vfmaddpd {{.*#+}} ymm2 = (ymm0 * mem) + ymm1
+; FMA4-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * mem) + ymm1
+; FMA4-NEXT: vaddpd %ymm0, %ymm2, %ymm0
+; FMA4-NEXT: retq
+  %t0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> <double -5.000000e-01, double undef, double -5.000000e-01, double -5.000000e-01>, <4 x double> %b)
+  %t1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double> %b)
+  %t2 = fadd <4 x double> %t0, %t1
+  ret <4 x double> %t2
+}
+
+define <4 x double> @negated_constant_v4f64_fadd(<4 x double> %a) {
+; FMA3-LABEL: negated_constant_v4f64_fadd:
+; FMA3: # %bb.0:
+; FMA3-NEXT: vmovapd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; FMA3-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; FMA3-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: negated_constant_v4f64_fadd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; FMA4-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; FMA4-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) + mem
+; FMA4-NEXT: retq
+  %t0 = fadd <4 x double> %a, <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+  %t1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %t0, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>)
+  ret <4 x double> %t1
+}
+
+define <4 x double> @negated_constant_v4f64_2fadd(<4 x double> %a) {
+; FMA3-LABEL: negated_constant_v4f64_2fadd:
+; FMA3: # %bb.0:
+; FMA3-NEXT: vmovapd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; FMA3-NEXT: vaddpd %ymm1, %ymm0, %ymm2
+; FMA3-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; FMA3-NEXT: vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm2) + ymm0
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: negated_constant_v4f64_2fadd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; FMA4-NEXT: vaddpd %ymm1, %ymm0, %ymm2
+; FMA4-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; FMA4-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm2 * ymm1) + ymm0
+; FMA4-NEXT: retq
+  %t0 = fadd <4 x double> %a, <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+  %t1 = fadd <4 x double> %a, <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>
+  %t2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %t0, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double> %t1)
+  ret <4 x double> %t2
+}
+
+define <4 x double> @negated_constant_v4f64_2fma_undefs(<4 x double> %a, <4 x double> %b) {
+; FMA3-LABEL: negated_constant_v4f64_2fma_undefs:
+; FMA3: # %bb.0:
+; FMA3-NEXT: vmovapd {{.*#+}} ymm2 = <-5.0E-1,u,-5.0E-1,-5.0E-1>
+; FMA3-NEXT: vmovapd {{.*#+}} ymm3 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; FMA3-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm3 * ymm0) + ymm2
+; FMA3-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm2
+; FMA3-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: negated_constant_v4f64_2fma_undefs:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd {{.*#+}} ymm2 = <-5.0E-1,u,-5.0E-1,-5.0E-1>
+; FMA4-NEXT: vmovapd {{.*#+}} ymm3 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; FMA4-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm2
+; FMA4-NEXT: vfmaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm2
+; FMA4-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; FMA4-NEXT: retq
+  %t0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double> <double -5.000000e-01, double undef, double -5.000000e-01, double -5.000000e-01>)
+  %t1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %b, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double> <double -5.000000e-01, double undef, double -5.000000e-01, double -5.000000e-01>)
+  %t2 = fadd <4 x double> %t0, %t1
+  ret <4 x double> %t2
+}
+
 
 declare float @llvm.fma.f32(float, float, float)
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)