Index: lib/Target/AArch64/AArch64.td =================================================================== --- lib/Target/AArch64/AArch64.td +++ lib/Target/AArch64/AArch64.td @@ -149,6 +149,12 @@ "lsl-fast", "HasLSLFast", "true", "CPU has a fastpath logical shift of up to 3 places">; +def FeatureAggressiveFMA : + SubtargetFeature<"enable-aggressive-fma", + "HasAggressiveFMA", + "true", + "Enable Aggressive FMA for floating-point.">; + //===----------------------------------------------------------------------===// // Architectures. // @@ -399,7 +405,8 @@ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, FeatureLSE, - HasV8_1aOps]>; + HasV8_1aOps, + FeatureAggressiveFMA]>; def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX", "Cavium ThunderX processors", [ Index: lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.h +++ lib/Target/AArch64/AArch64ISelLowering.h @@ -456,6 +456,9 @@ return true; } + /// Return true for floating-point VTs when the subtarget has aggressive FMA. + bool enableAggressiveFMAFusion(EVT VT) const override; + /// Returns the size of the platform's va_list object. 
unsigned getVaListSizeInBits(const DataLayout &DL) const override; Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10988,6 +10988,10 @@ return OptSize && !VT.isVector(); } +bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const { + return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint(); +} + unsigned AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) Index: lib/Target/AArch64/AArch64Subtarget.h =================================================================== --- lib/Target/AArch64/AArch64Subtarget.h +++ lib/Target/AArch64/AArch64Subtarget.h @@ -80,6 +80,7 @@ bool HasLSLFast = false; bool HasSVE = false; bool HasRCPC = false; + bool HasAggressiveFMA = false; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. 
bool HasZeroCycleRegMove = false; @@ -269,6 +270,7 @@ bool hasLSLFast() const { return HasLSLFast; } bool hasSVE() const { return HasSVE; } bool hasRCPC() const { return HasRCPC; } + bool hasAggressiveFMA() const { return HasAggressiveFMA; } bool isLittleEndian() const { return IsLittle; } Index: test/CodeGen/AArch64/fma-aggressive.ll =================================================================== --- test/CodeGen/AArch64/fma-aggressive.ll +++ test/CodeGen/AArch64/fma-aggressive.ll @@ -0,0 +1,103 @@ +; RUN: llc -O2 -mtriple=aarch64-none-linux-gnu -mcpu=thunderx2t99 -fp-contract=fast < %s | FileCheck %s --check-prefix=CHECK-FMA +; RUN: llc -O2 -mtriple=aarch64-none-linux-gnu -mcpu=generic < %s | FileCheck %s --check-prefix=CHECK-GENERIC +; RUN: llc -O2 -mtriple=aarch64-none-linux-gnu -mcpu=generic -fp-contract=fast -mattr=+enable-aggressive-fma < %s | FileCheck %s --check-prefix=CHECK-FMA-FORCED +define double @test(double %x, double %y, double %z) { +; CHECK-FMA: fmul d3, d0, d1 +; CHECK-FMA: fmadd d0, d0, d1, d2 +; CHECK-FMA-FORCED: fmul d3, d0, d1 +; CHECK-FMA-FORCED: fmadd d0, d0, d1, d2 +; CHECK-GENERIC: fmul d0, d0, d1 +; CHECK-GENERIC: fadd d1, d0, d2 + %mul = fmul fast double %x, %y + %add = fadd fast double %mul, %z + %use2 = fdiv fast double %mul, %add ; second use of %mul: the aggressive-FMA runs expect fmadd plus a separate fmul + ret double %use2 +} + +define double @test1(double %a, double %b, double %conv, i32 %rem) { +entry: + %conv.neg = fsub fast double -0.000000e+00, %conv + %cmp3 = icmp eq i32 %rem, 0 + %. = select i1 %cmp3, double 1.000000e+00, double -1.000000e+00 +; CHECK-GENERIC: fmov d4, #-1.00000000 +; CHECK-GENERIC: fmov d1, #1.00000000 +; CHECK-FMA: fmov d3, #1.00000000 +; CHECK-FMA: fmov d1, #-1.00000000 +; CHECK-FMA-FORCED: fmov d4, #-1.00000000 +; CHECK-FMA-FORCED: fmov d1, #1.00000000 + %add = fadd fast double %a, 1.000000e+00 + %add8 = fadd fast double %add, %. 
+ %mul = fmul fast double %add8, %a + %add9 = fadd fast double %mul, 1.000000e+01 +; CHECK-GENERIC: fneg d5, d2 +; CHECK-GENERIC: fmov d6, #10.00000000 +; CHECK-GENERIC: fmul d0, d7, d0 +; CHECK-GENERIC: fadd d6, d0, d6 +; CHECK-FMA: fmadd d7, d5, d0, d7 +; CHECK-FMA: fnmadd d0, d5, d0, d2 +; CHECK-FMA-FORCED: fmadd d5, d6, d0, d5 +; CHECK-FMA-FORCED: fnmadd d0, d6, d0, d2 + %mul10 = fmul fast double %add9, 2.000000e+00 + %add11 = fsub fast double %conv.neg, %mul + %sub = fadd fast double %add11, %mul10 +; CHECK-GENERIC: fmov d7, #2.00000000 +; CHECK-GENERIC: fmadd d5, d6, d7, d5 +; CHECK-GENERIC: fmul d0, d5, d0 +; CHECK-FMA: fmov d5, #2.00000000 +; CHECK-FMA: fmadd d0, d7, d5, d0 +; CHECK-FMA-FORCED: fmov d6, #2.00000000 +; CHECK-FMA-FORCED: fmadd d0, d5, d6, d0 + %mul14 = fmul fast double %sub, %mul + %mul15 = fmul fast double %mul14, %. + %sub17 = fadd fast double %mul15, %mul14 +; CHECK-GENERIC: fmadd d0, d0, d3, d0 +; CHECK-FMA: fmul d0, d0, d6 +; CHECK-FMA: fmadd d0, d0, d4, d0 +; CHECK-FMA-FORCED: fmul d0, d0, d7 +; CHECK-FMA-FORCED: fmadd d0, d0, d3, d0 + br i1 %cmp3, label %if.then22, label %if.else27 + +if.then22: + %add23 = fadd fast double %sub17, -1.000000e+00 + %sub24 = fadd fast double %sub17, 1.000000e+00 + %mul25 = fmul fast double %add23, %sub24 + %sub26 = fsub fast double 1.000000e+00, %mul25 +; CHECK-GENERIC: fsub d4, d4, d0 +; CHECK-GENERIC: fadd d5, d0, d1 +; CHECK-GENERIC: fmadd d4, d4, d5, d1 +; CHECK-FMA: fmadd d5, d5, d6, d3 +; CHECK-FMA: fmsub d1, d1, d6, d3 +; CHECK-FMA-FORCED: fmadd d4, d4, d5, d1 +; CHECK-FMA-FORCED: b .LBB1_3 +; CHECK-FMA-FORCED: fadd d4, d0, d4 +; CHECK-FMA-FORCED: fadd d5, d0, d1 +; CHECK-FMA-FORCED: fmsub d4, d4, d5, d1 + br label %if.end32 + +if.else27: + %sub28 = fsub fast double -1.000000e+00, %sub17 + %sub29 = fadd fast double %sub17, 1.000000e+00 + %mul30 = fmul fast double %sub28, %sub29 + %add31 = fadd fast double %mul30, 1.000000e+00 +; CHECK-GENERIC: fadd d4, d0, d4 +; CHECK-GENERIC: fadd d5, d0, d1 +; 
CHECK-GENERIC: fmsub d4, d4, d5, d1 +; CHECK-FMA: fsub d2, d1, d4 +; CHECK-FMA: fmadd d0, d0, d3, d1 +; CHECK-FMA: fadd d0, d0, d2 +; CHECK-FMA-FORCED: fsub d1, d1, d0 +; CHECK-FMA-FORCED: fmadd d0, d0, d1, d4 +; CHECK-FMA-FORCED: fadd d0, d0, d2 + br label %if.end32 + +if.end32: ; preds = %if.else27, %if.then22 + %b.1 = phi double [ %sub26, %if.then22 ], [ %add31, %if.else27 ] + %sub33 = fsub fast double 1.000000e+00, %sub17 + %mul34 = fmul fast double %sub17, %conv + %mul35 = fmul fast double %mul34, %sub33 + %sub36 = fsub fast double %b.1, %. + %add37 = fadd fast double %b.1, %mul35 + %add38 = fadd fast double %add37, %sub36 + ret double %add38 +} +