Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -455,6 +455,9 @@
     return true;
   }
 
+  /// Returns true if FMA fusion should be applied aggressively for \p VT.
+  bool enableAggressiveFMAFusion(EVT VT) const override;
+
   /// Returns the size of the platform's va_list object.
   unsigned getVaListSizeInBits(const DataLayout &DL) const override;
 
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -113,6 +113,12 @@
                                   "optimization"),
                          cl::init(true));
 
+// Hidden opt-in for aggressive FMA fusion; off unless explicitly requested.
+static cl::opt<bool> EnableAggressiveFMA(
+    "aarch64-enable-aggressive-fma", cl::Hidden, cl::init(false),
+    cl::desc("Enable AArch64 aggressive fused "
+             "multiply-add"));
+
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;
 
@@ -10976,6 +10982,21 @@
   return OptSize && !VT.isVector();
 }
 
+/// Report whether FMA fusion should be aggressive for \p VT.
+///
+/// Only floating-point types qualify. ThunderX2T99 always opts in; every other
+/// processor family follows -aarch64-enable-aggressive-fma (off by default).
+bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+  if (!VT.isFloatingPoint())
+    return false;
+
+  // Always enabled on Cavium T99.
+  if (Subtarget->getProcFamily() == AArch64Subtarget::ThunderX2T99)
+    return true;
+
+  return EnableAggressiveFMA;
+}
+
 unsigned
 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
   if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
Index: test/CodeGen/AArch64/fma-aggressive.ll
===================================================================
--- test/CodeGen/AArch64/fma-aggressive.ll
+++ test/CodeGen/AArch64/fma-aggressive.ll
@@ -0,0 +1,346 @@
+; RUN: llc -O2 -mtriple=aarch64-none-linux-gnu -mcpu=thunderx2t99 -fp-contract=fast < %s | FileCheck %s --check-prefixes=CHECK,CHECK-FMA
+; RUN: llc -O2 -mtriple=aarch64-none-linux-gnu -mcpu=generic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GENERIC
+; /* This test program demonstrates the effects of enabling aggressive FMA
+;  * on AArch64. With aggressive FMA enabled, CodeGen will also fuse a multiply
+;  * whose SDValue has more than one use. With aggressive FMA disabled, such
+;  * multi-use multiplies are not fused.
+;  */
+;
+; /* clang -O2 -std=c99 -Wall -mcpu=thunderx2t99 -march=armv8.1-a+lse
+;  * -funroll-loops -ffast-math -Xclang -menable-unsafe-fp-math
+;  * -emit-llvm -S fma.c -o fma.ll
+;  */
+;
+; #include <stdio.h>
+; #include <stdlib.h>
+; #include <math.h>
+;
+; static const double AE[] = { -0.00, 0.00, 0.00, 0.00, 0.00, 0.00,
+;                              0.00, 0.00, 0.00, 0.00 };
+;
+; static const double BE[] = { 1102499.00, -2.00, -3.00, -4.00, -5.00,
+;                              -6.00, -7.00, -8.00, -9.00, -10.00 };
+;
+; double reset(double x, double y)
+; {
+;   double i;
+;   if (modf(x, &i) == 0.0)
+;     return x + y;
+;
+;   return x - y;
+; }
+;
+; int main(int argc, char* argv[])
+; {
+;   int z;
+;   if (argc >= 2)
+;     z = atoi(argv[1]);
+;   else
+;     z = 10;
+;
+;   double a = 3.0;
+;   double b = 5.0;
+;   double c = 10.0;
+;   (void) fprintf(stderr, "a=%lf b=%lf c=%lf\n", a, b, c);
+;
+;   for (int i = 0; i < z; ++i) {
+;     double x = (double) i;
+;     double p1 = 1.0;
+;     double n1 = -1.0;
+;     double y;
+;
+;     if ((i % 2) == 0)
+;       y = p1;
+;     else
+;       y = n1;
+;
+;     a *= y + p1 + a;
+;     a *= y + ((c + a) * 2.0) - (a + y + x);
+;     a -= n1 * (y * a);
+;
+;     (void) fprintf(stderr, "a=%lf b=%lf c=%lf\n", a, b, c);
+;
+;     if ((i % 2) == 0)
+;       b = p1 - ((n1 + a) * (a - n1));
+;     else
+;       b = p1 + ((n1 - a) * (a - n1));
+;
+;     a *= (p1 - a) * x;
+;     b -= reset(x, b - y + (a + b));
+;
+;     (void) fprintf(stderr, "a=%lf b=%lf c=%lf", a, b, c);
+;     if ((a == AE[i]) && (b == BE[i]))
+;       (void) fprintf(stderr, "\t-----> PASS.");
+;     else {
+;       (void) fprintf(stderr, "\t-----> FAIL: ");
+;       if (a != AE[i])
+;         (void) fprintf(stderr, "(a == %lf, expected %lf)", a, AE[i]);
+;       else if (b != BE[i])
+;         (void) fprintf(stderr, "(b == %lf, expected %lf)", b, BE[i]);
+;     }
+;
+;     (void) fprintf(stderr, "\n\n");
+;   }
+;
+;   return 0;
+; }
+;
+; ModuleID = 'fma.c'
+source_filename = "fma.c"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
+%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
+
+@stderr = external local_unnamed_addr global %struct._IO_FILE*, align 8
+@.str = private unnamed_addr constant [19 x i8] c"a=%lf b=%lf c=%lf\0A\00", align 1
+@.str.1 = private unnamed_addr constant [18 x i8] c"a=%lf b=%lf c=%lf\00", align 1
+@AE = internal unnamed_addr constant [10 x double] [double -0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00], align 8
+@BE = internal unnamed_addr constant [10 x double] [double 0x4130D2A300000000, double -2.000000e+00, double -3.000000e+00, double -4.000000e+00, double -5.000000e+00, double -6.000000e+00, double -7.000000e+00, double -8.000000e+00, double -9.000000e+00, double -1.000000e+01], align 8
+@.str.2 = private unnamed_addr constant [14 x i8] c"\09-----> PASS.\00", align 1
+@.str.3 = private unnamed_addr constant [15 x i8] c"\09-----> FAIL: \00", align 1
+@.str.4 = private unnamed_addr constant [24 x i8] c"(a == %lf, expected %lf\00", align 1
+@.str.5 = private unnamed_addr constant [24 x i8] c"(b == %lf, expected %lf\00", align 1
+@.str.6 = private unnamed_addr constant [3 x i8] c"\0A\0A\00", align 1
+
+; Function Attrs: nounwind
+define double @reset(double %x, double %y) local_unnamed_addr #0 {
+; CHECK-LABEL: reset:
+entry:
+  %i = alloca double, align 8 ; out-parameter slot handed to modf
+  %0 = bitcast double* %i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %0) #3
+  %call = call fast double @modf(double %x, double* nonnull %i) #3
+  %cmp = fcmp fast oeq double %call, 0.000000e+00 ; did modf return exactly 0.0?
+  %1 = fsub fast double -0.000000e+00, %y ; -y
+  %retval.0.p = select i1 %cmp, double %y, double %1 ; +y if %cmp, otherwise -y
+  %retval.0 = fadd fast double %retval.0.p, %x ; x + y or x - y
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %0) #3
+  ret double %retval.0
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind
+declare double @modf(double, double* nocapture) local_unnamed_addr #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind
+define i32 @main(i32 %argc, i8** nocapture readonly %argv) local_unnamed_addr #0 { ; for.body holds the multi-use fmuls under test
+entry:
+  %i.i = alloca double, align 8 ; out-parameter slot for the modf call in if.end32
+  %cmp = icmp sgt i32 %argc, 1
+  br i1 %cmp, label %if.end, label %if.end.thread
+
+if.end.thread:                                    ; preds = %entry
+  %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2
+  %call1140 = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str, i64 0, i64 0), double 3.000000e+00, double 5.000000e+00, double 1.000000e+01) #4
+  br label %for.body.lr.ph
+
+if.end:                                           ; preds = %entry
+  %arrayidx = getelementptr inbounds i8*, i8** %argv, i64 1
+  %1 = load i8*, i8** %arrayidx, align 8, !tbaa !2
+  %call.i = tail call i64 @strtol(i8* nocapture nonnull %1, i8** null, i32 10) #3
+  %conv.i = trunc i64 %call.i to i32
+  %2 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2
+  %call1 = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %2, i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str, i64 0, i64 0), double 3.000000e+00, double 5.000000e+00, double 1.000000e+01) #4
+  %cmp2136 = icmp sgt i32 %conv.i, 0
+  br i1 %cmp2136, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %if.end.thread, %if.end
+  %z.0142 = phi i64 [ 10, %if.end.thread ], [ %call.i, %if.end ]
+  %3 = bitcast double* %i.i to i8*
+  %wide.trip.count = and i64 %z.0142, 4294967295
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %if.end72, %if.end
+  ret i32 0
+
+for.body:                                         ; preds = %if.end72, %for.body.lr.ph
+; CHECK-FMA: fadd    d0, d9, d13
+; CHECK-FMA: tst     w26, #0x1
+; CHECK-FMA: fcsel   d15, d13, d12, eq
+; CHECK-GENERIC: tst     w26, #0x1
+; CHECK-GENERIC: fcsel   d15, d13, d12, eq
+; CHECK-GENERIC: fadd    d1, d13, d15
+; CHECK-GENERIC: fadd    d1, d9, d1
+; CHECK-GENERIC: fneg    d0, d11
+; CHECK-GENERIC: fmul    d1, d1, d9
+; CHECK-GENERIC: fadd    d2, d1, d8
+; CHECK-GENERIC: fsub    d0, d0, d1
+; CHECK-GENERIC: ldr     x0, [x28, :lo12:stderr]
+; CHECK-FMA: ldr     x0, [x28, :lo12:stderr]
+; CHECK-FMA: fadd    d0, d0, d15
+; CHECK-FMA: fmul    d1, d0, d9
+; CHECK-FMA: fmadd   d2, d0, d9, d8
+; CHECK-FMA: fnmadd  d0, d0, d9, d11
+; CHECK-FMA: mov     x1, x19
+; CHECK-FMA: fmadd   d0, d2, d14, d0
+; CHECK-GENERIC: fmadd   d0, d2, d14, d0
+; CHECK-FMA: mov     v2.16b, v8.16b
+; CHECK-FMA: fmul    d0, d0, d1
+; CHECK-GENERIC: fmul    d0, d0, d1
+; CHECK-FMA: mov     v1.16b, v10.16b
+; CHECK-FMA: fmadd   d9, d0, d15, d0
+; CHECK-FMA: mov     v0.16b, v9.16b
+; CHECK-GENERIC: fmadd   d9, d0, d15, d0
+; CHECK-GENERIC: tbnz    w26, #0, .LBB1_6
+; CHECK-GENERIC: fsub    d0, d13, d9
+; CHECK-GENERIC: b       .LBB1_7
+; CHECK-GENERIC: fmadd   d10, d0, d1, d13
+; CHECK-GENERIC: fsub    d0, d10, d15
+; CHECK-GENERIC: fadd    d0, d9, d0
+; CHECK-GENERIC: fadd    d15, d10, d0
+; CHECK-GENERIC: add     x0, sp, #8
+; CHECK-FMA: fsub    d0, d12, d9
+; CHECK-FMA: fadd    d1, d9, d13
+; CHECK-FMA: fsub    d2, d13, d9
+; CHECK-FMA: tst     w26, #0x1
+; CHECK-FMA: add     x0, sp, #8
+; CHECK-FMA: fmadd   d0, d0, d1, d13
+; CHECK-FMA: fmadd   d1, d2, d1, d13
+; CHECK-FMA: fcsel   d10, d0, d1, ne
+; CHECK-FMA: fmul    d0, d9, d11
+; CHECK-FMA: fsub    d1, d10, d15
+; CHECK-FMA: fmsub   d9, d9, d0, d0
+; CHECK-FMA: fmadd   d0, d0, d2, d10
+; CHECK-FMA: fadd    d15, d0, d1
+; CHECK-GENERIC: mov     x1, x20
+; CHECK-GENERIC: mov     v0.16b, v9.16b
+; CHECK-GENERIC: mov     v1.16b, v10.16b
+; CHECK-GENERIC: mov     v2.16b, v8.16b
+; CHECK-GENERIC: b.ne    .LBB1_10
+; CHECK-GENERIC: b.ne    .LBB1_10
+; CHECK-GENERIC: adrp    x1, .L.str.5
+; CHECK-GENERIC: mov     v0.16b, v10.16b
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %if.end72 ]
+  %a.0139 = phi double [ 3.000000e+00, %for.body.lr.ph ], [ %mul35, %if.end72 ]
+  %b.0138 = phi double [ 5.000000e+00, %for.body.lr.ph ], [ %sub40, %if.end72 ]
+  %4 = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %4 to double
+  %conv.neg = fsub fast double -0.000000e+00, %conv
+  %rem = and i32 %4, 1
+  %cmp3 = icmp eq i32 %rem, 0
+  %. = select i1 %cmp3, double 1.000000e+00, double -1.000000e+00 ; 1.0 on even iterations, -1.0 on odd ones
+  %add = fadd fast double %a.0139, 1.000000e+00
+  %add8 = fadd fast double %add, %.
+  %mul = fmul fast double %add8, %a.0139 ; three users: %add9, %add11, %mul14
+  %add9 = fadd fast double %mul, 1.000000e+01
+  %mul10 = fmul fast double %add9, 2.000000e+00
+  %add11 = fsub fast double %conv.neg, %mul
+  %sub = fadd fast double %add11, %mul10
+  %mul14 = fmul fast double %sub, %mul ; two users: %mul15, %sub17
+  %mul15 = fmul fast double %mul14, %.
+  %sub17 = fadd fast double %mul15, %mul14
+  %5 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2
+  %call18 = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %5, i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str, i64 0, i64 0), double %sub17, double %b.0138, double 1.000000e+01) #4
+  br i1 %cmp3, label %if.then22, label %if.else27
+
+if.then22:                                        ; preds = %for.body
+  %add23 = fadd fast double %sub17, -1.000000e+00
+  %sub24 = fadd fast double %sub17, 1.000000e+00
+  %mul25 = fmul fast double %add23, %sub24
+  %sub26 = fsub fast double 1.000000e+00, %mul25
+  br label %if.end32
+
+if.else27:                                        ; preds = %for.body
+  %sub28 = fsub fast double -1.000000e+00, %sub17
+  %sub29 = fadd fast double %sub17, 1.000000e+00
+  %mul30 = fmul fast double %sub28, %sub29
+  %add31 = fadd fast double %mul30, 1.000000e+00
+  br label %if.end32
+
+if.end32:                                         ; preds = %if.else27, %if.then22
+  %b.1 = phi double [ %sub26, %if.then22 ], [ %add31, %if.else27 ]
+  %sub33 = fsub fast double 1.000000e+00, %sub17
+  %mul34 = fmul fast double %sub17, %conv
+  %mul35 = fmul fast double %mul34, %sub33
+  %sub36 = fsub fast double %b.1, %.
+  %add37 = fadd fast double %b.1, %mul35
+  %add38 = fadd fast double %add37, %sub36
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %3) #3
+  %call.i135 = call fast double @modf(double %conv, double* nonnull %i.i) #3
+  %cmp.i = fcmp fast oeq double %call.i135, 0.000000e+00
+  %6 = fsub fast double -0.000000e+00, %add38
+  %retval.0.p.i = select i1 %cmp.i, double %add38, double %6
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %3) #3
+  %retval.0.i.neg = fsub fast double %b.1, %conv
+  %sub40 = fsub fast double %retval.0.i.neg, %retval.0.p.i
+  %7 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2
+  %call41 = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.1, i64 0, i64 0), double %mul35, double %sub40, double 1.000000e+01) #4
+  %arrayidx42 = getelementptr inbounds [10 x double], [10 x double]* @AE, i64 0, i64 %indvars.iv
+  %8 = load double, double* %arrayidx42, align 8, !tbaa !6
+  %cmp43 = fcmp fast oeq double %mul35, %8
+  br i1 %cmp43, label %land.lhs.true, label %if.else51
+
+land.lhs.true:                                    ; preds = %if.end32
+  %arrayidx46 = getelementptr inbounds [10 x double], [10 x double]* @BE, i64 0, i64 %indvars.iv
+  %9 = load double, double* %arrayidx46, align 8, !tbaa !6
+  %cmp47 = fcmp fast oeq double %sub40, %9
+  br i1 %cmp47, label %if.then49, label %if.else51
+
+if.then49:                                        ; preds = %land.lhs.true
+  %10 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2
+  %11 = tail call i64 @fwrite(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str.2, i64 0, i64 0), i64 13, i64 1, %struct._IO_FILE* %10) #4
+  br label %if.end72
+
+if.else51:                                        ; preds = %land.lhs.true, %if.end32
+  %12 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2
+  %13 = tail call i64 @fwrite(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.3, i64 0, i64 0), i64 14, i64 1, %struct._IO_FILE* %12) #4
+  %cmp55 = fcmp fast une double %mul35, %8
+  br i1 %cmp55, label %if.then57, label %if.else61
+
+if.then57:                                        ; preds = %if.else51
+  %14 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2
+  %call60 = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %14, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.4, i64 0, i64 0), double %mul35, double %8) #4
+  br label %if.end72
+
+if.else61:                                        ; preds = %if.else51
+  %arrayidx63 = getelementptr inbounds [10 x double], [10 x double]* @BE, i64 0, i64 %indvars.iv
+  %15 = load double, double* %arrayidx63, align 8, !tbaa !6
+  %cmp64 = fcmp fast une double %sub40, %15
+  br i1 %cmp64, label %if.then66, label %if.end72
+
+if.then66:                                        ; preds = %if.else61
+  %16 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2
+  %call69 = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %16, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.5, i64 0, i64 0), double %sub40, double %15) #4
+  br label %if.end72
+
+if.end72:                                         ; preds = %if.then57, %if.then66, %if.else61, %if.then49
+  %17 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2
+  %18 = tail call i64 @fwrite(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.6, i64 0, i64 0), i64 2, i64 1, %struct._IO_FILE* %17) #4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: nounwind
+declare i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr #2
+
+; Function Attrs: nounwind
+declare i64 @strtol(i8* readonly, i8** nocapture, i32) local_unnamed_addr #2
+
+; Function Attrs: nounwind
+declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) local_unnamed_addr #3
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="thunderx2t99" "target-features"="+lse,+neon,+v8.1a" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="thunderx2t99" "target-features"="+lse,+neon,+v8.1a" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { cold }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0 (http://llvm.org/git/clang.git 9f9177d3ef72580ca29e8844327f63d7aa1908af) (http://llvm.org/git/llvm.git 3e48a4f4584fcf21e300affe64eb228647f4bb13)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"any pointer", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"double", !4, i64 0}
Index: test/CodeGen/AArch64/fma-simple.ll
===================================================================
--- test/CodeGen/AArch64/fma-simple.ll
+++ test/CodeGen/AArch64/fma-simple.ll
@@ -0,0 +1,13 @@
+; RUN: llc -O2 -mtriple=aarch64-none-linux-gnu -mcpu=thunderx2t99 -fp-contract=fast < %s | FileCheck %s --check-prefix=CHECK-FMA
+; RUN: llc -O2 -mtriple=aarch64-none-linux-gnu -mcpu=generic < %s | FileCheck %s --check-prefix=CHECK-GENERIC
+define double @test(double %x, double %y, double %z) {
+; CHECK-FMA: fmul    d3, d0, d1
+; CHECK-FMA: fmadd   d0, d0, d1, d2
+; CHECK-GENERIC: fmul    d0, d0, d1
+; CHECK-GENERIC: fadd    d1, d0, d2
+  %mul = fmul fast double %x, %y ; two users: %add and %use2
+  %add = fadd fast double %mul, %z ; x*y + z, the add that can absorb %mul
+  %use2 = fdiv fast double %mul, %add ; second user of %mul
+  ret double %use2
+}
+