diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3114,6 +3114,11 @@
 
 void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) {
   SDNodeFlags Flags;
+  // Copy the IR instruction's fast-math flags into the node flags; the
+  // accompanying fused-fma.ll test expects fmul/fadd to fuse into an FMA.
+  if (auto *FPOp = dyn_cast<FPMathOperator>(&I)) {
+    Flags.copyFMF(*FPOp);
+  }
   if (auto *OFBinOp = dyn_cast<OverflowingBinaryOperator>(&I)) {
     Flags.setNoSignedWrap(OFBinOp->hasNoSignedWrap());
     Flags.setNoUnsignedWrap(OFBinOp->hasNoUnsignedWrap());
diff --git a/llvm/test/CodeGen/X86/fused-fma.ll b/llvm/test/CodeGen/X86/fused-fma.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fused-fma.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=x86-64 -mcpu=haswell | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12:13"
+target triple = "x86_64--linux-gnu"
+
+%jl_value_t = type opaque
+
+; This function should fuse the `fmul contract`, `fadd fast` into a vfma.
+; CHECK-LABEL: julia_dotf
+define double @julia_dotf(%jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40) %0) {
+; CHECK-LABEL: julia_dotf:
+; CHECK:       # %bb.0: # %L13.lr.ph
+; CHECK-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vmovupd (%rax), %ymm1
+; CHECK-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm1) + ymm0
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %middle.block
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+L13.lr.ph:
+  br label %vector.ph
+
+vector.ph: ; preds = %L13.lr.ph
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %vec.phi10 = phi <4 x double> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
+  %wide.load13 = load <4 x double>, <4 x double> addrspace(13)* undef, align 8
+  %wide.load17 = load <4 x double>, <4 x double> addrspace(13)* undef, align 8
+  %1 = fmul contract <4 x double> %wide.load13, %wide.load17 ; multiply half of the expected FMA
+  %2 = fadd fast <4 x double> %vec.phi10, %1 ; accumulate; expected to fuse with %1 (see vfmadd231pd above)
+  %3 = icmp eq i64 undef, undef
+  br i1 %3, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+  %bin.rdx = fadd fast <4 x double> %2, undef
+  %bin.rdx20 = fadd fast <4 x double> undef, %bin.rdx
+  %bin.rdx21 = fadd fast <4 x double> undef, %bin.rdx20
+  %rdx.shuf = shufflevector <4 x double> %bin.rdx21, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx22 = fadd fast <4 x double> %bin.rdx21, %rdx.shuf
+  %rdx.shuf23 = shufflevector <4 x double> %bin.rdx22, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx24 = fadd fast <4 x double> %bin.rdx22, %rdx.shuf23
+  %4 = extractelement <4 x double> %bin.rdx24, i32 0
+  br label %L32
+
+L32: ; preds = %middle.block
+  ret double %4
+}