Index: lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.h +++ lib/Target/AArch64/AArch64ISelLowering.h @@ -455,6 +455,9 @@ return true; } + /// Enable aggressive FMA fusion on targets that want it. + bool enableAggressiveFMAFusion(EVT VT) const override; + /// Returns the size of the platform's va_list object. unsigned getVaListSizeInBits(const DataLayout &DL) const override; Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -113,6 +113,12 @@ "optimization"), cl::init(true)); +static cl::opt +EnableAggressiveFMA("aarch64-enable-aggressive-fma", cl::Hidden, + cl::desc("Enable AArch64 aggressive fused " + "multiply-add"), + cl::init(false)); + /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; @@ -10976,6 +10982,21 @@ return OptSize && !VT.isVector(); } +bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const { + unsigned PF = static_cast(Subtarget->getProcFamily()); + switch(PF) { + default: + return VT.isFloatingPoint() && EnableAggressiveFMA.getValue(); + break; + case AArch64Subtarget::ThunderX2T99: + // Always enabled on Cavium T99. 
+ return VT.isFloatingPoint(); + break; + } + + return false; +} + unsigned AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) Index: test/CodeGen/AArch64/fma-aggressive.ll =================================================================== --- test/CodeGen/AArch64/fma-aggressive.ll +++ test/CodeGen/AArch64/fma-aggressive.ll @@ -0,0 +1,346 @@ +; RUN: llc -O2 -mtriple=aarch64-none-linux-gnu -mcpu=thunderx2t99 -fp-contract=fast < %s | FileCheck %s --check-prefix=CHECK-FMA +; RUN: llc -O2 -mtriple=aarch64-none-linux-gnu -mcpu=generic < %s | FileCheck %s --check-prefix=CHECK-GENERIC +; /* This test program demonstrates the effects of enabling aggressive FMA +; * on AArch64. With aggressive FMA enabled, CodeGen will fuse instructions +; * for SDValues with one or more use. With aggressive FMA disabled, this +; * fusion does not happen. +; */ +; +; /* clang -O2 -std=c99 -Wall -mcpu=thunderx2t99 -march=armv8.1-a+lse +; * -funroll-loops -ffast-math -Xclang -menable-unsafe-fp-math +; * -emit-llvm -S fma.c -o fma.ll +; */ +; +; #include +; #include +; #include +; +; static const double AE[] = { -0.00, 0.00, 0.00, 0.00, 0.00, 0.00, +; 0.00, 0.00, 0.00, 0.00 }; +; +; static const double BE[] = { 1102499.00, -2.00, -3.00, -4.00, -5.00, +; -6.00, -7.00, -8.00, -9.00, -10.00 }; +; +; double reset(double x, double y) +; { +; double i; +; if (modf(x, &i) == 0.0) +; return x + y; +; +; return x - y; +; } +; +; int main(int argc, char* argv[]) +; { +; int z; +; if (argc >= 2) +; z = atoi(argv[1]); +; else +; z = 10; +; +; double a = 3.0; +; double b = 5.0; +; double c = 10.0; +; (void) fprintf(stderr, "a=%lf b=%lf c=%lf\n", a, b, c); +; +; for (int i = 0; i < z; ++i) { +; double x = (double) i; +; double p1 = 1.0; +; double n1 = -1.0; +; double y; +; +; if ((i % 2) == 0) +; y = p1; +; else +; y = n1; +; +; a *= y + p1 + a; +; a *= y + ((c + a) * 2.0) - (a + y + x); +; a -= n1 * (y * a); +; +; 
(void) fprintf(stderr, "a=%lf b=%lf c=%lf\n", a, b, c); +; +; if ((i % 2) == 0) +; b = p1 - ((n1 + a) * (a - n1)); +; else +; b = p1 + ((n1 - a) * (a - n1)); +; +; a *= (p1 - a) * x; +; b -= reset(x, b - y + (a + b)); +; +; (void) fprintf(stderr, "a=%lf b=%lf c=%lf", a, b, c); +; if ((a == AE[i]) && (b == BE[i])) +; (void) fprintf(stderr, "\t-----> PASS."); +; else { +; (void) fprintf(stderr, "\t-----> FAIL: "); +; if (a != AE[i]) +; (void) fprintf(stderr, "(a == %lf, expected %lf)", a, AE[i]); +; else if (b != BE[i]) +; (void) fprintf(stderr, "(b == %lf, expected %lf)", b, BE[i]); +; } +; +; (void) fprintf(stderr, "\n\n"); +; } +; +; return 0; +; } +; +; ModuleID = 'fma.c' +source_filename = "fma.c" +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } + +@stderr = external local_unnamed_addr global %struct._IO_FILE*, align 8 +@.str = private unnamed_addr constant [19 x i8] c"a=%lf b=%lf c=%lf\0A\00", align 1 +@.str.1 = private unnamed_addr constant [18 x i8] c"a=%lf b=%lf c=%lf\00", align 1 +@AE = internal unnamed_addr constant [10 x double] [double -0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00], align 8 +@BE = internal unnamed_addr constant [10 x double] [double 0x4130D2A300000000, double -2.000000e+00, double -3.000000e+00, double -4.000000e+00, double -5.000000e+00, double -6.000000e+00, double -7.000000e+00, double -8.000000e+00, double -9.000000e+00, double -1.000000e+01], align 8 +@.str.2 = private unnamed_addr constant [14 x i8] c"\09-----> PASS.\00", 
align 1 +@.str.3 = private unnamed_addr constant [15 x i8] c"\09-----> FAIL: \00", align 1 +@.str.4 = private unnamed_addr constant [24 x i8] c"(a == %lf, expected %lf\00", align 1 +@.str.5 = private unnamed_addr constant [24 x i8] c"(b == %lf, expected %lf\00", align 1 +@.str.6 = private unnamed_addr constant [3 x i8] c"\0A\0A\00", align 1 + +; Function Attrs: nounwind +define double @reset(double %x, double %y) local_unnamed_addr #0 { +; CHECK-LABEL: reset: +entry: + %i = alloca double, align 8 + %0 = bitcast double* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %0) #3 + %call = call fast double @modf(double %x, double* nonnull %i) #3 + %cmp = fcmp fast oeq double %call, 0.000000e+00 + %1 = fsub fast double -0.000000e+00, %y + %retval.0.p = select i1 %cmp, double %y, double %1 + %retval.0 = fadd fast double %retval.0.p, %x + call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %0) #3 + ret double %retval.0 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: nounwind +declare double @modf(double, double* nocapture) local_unnamed_addr #2 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: nounwind +define i32 @main(i32 %argc, i8** nocapture readonly %argv) local_unnamed_addr #0 { +entry: + %i.i = alloca double, align 8 + %cmp = icmp sgt i32 %argc, 1 + br i1 %cmp, label %if.end, label %if.end.thread + +if.end.thread: ; preds = %entry + %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2 + %call1140 = tail call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str, i64 0, i64 0), double 3.000000e+00, double 5.000000e+00, double 1.000000e+01) #4 + br label %for.body.lr.ph + +if.end: ; preds = %entry + %arrayidx = getelementptr inbounds i8*, i8** %argv, i64 1 + %1 = load i8*, i8** %arrayidx, align 8, !tbaa !2 + %call.i = tail call i64 @strtol(i8* nocapture nonnull %1, i8** null, i32 10) #3 + %conv.i = trunc i64 %call.i to i32 + %2 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2 + %call1 = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %2, i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str, i64 0, i64 0), double 3.000000e+00, double 5.000000e+00, double 1.000000e+01) #4 + %cmp2136 = icmp sgt i32 %conv.i, 0 + br i1 %cmp2136, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %if.end.thread, %if.end + %z.0142 = phi i64 [ 10, %if.end.thread ], [ %call.i, %if.end ] + %3 = bitcast double* %i.i to i8* + %wide.trip.count = and i64 %z.0142, 4294967295 + br label %for.body + +for.cond.cleanup: ; preds = %if.end72, %if.end + ret i32 0 + +for.body: ; preds = %if.end72, %for.body.lr.ph +; CHECK-FMA: fadd d0, d9, d13 +; CHECK-FMA: tst w26, #0x1 +; CHECK-FMA: fcsel d15, d13, d12, eq +; CHECK-GENERIC: tst w26, #0x1 +; CHECK-GENERIC: fcsel d15, d13, d12, eq +; CHECK-GENERIC: fadd d1, d13, d15 +; CHECK-GENERIC: fadd d1, d9, d1 +; CHECK-GENERIC: fneg d0, d11 +; CHECK-GENERIC: fmul d1, d1, d9 +; CHECK-GENERIC: fadd d2, d1, d8 +; CHECK-GENERIC: fsub d0, d0, d1 +; CHECK-GENERIC: ldr x0, [x28, :lo12:stderr] +; CHECK-FMA: ldr x0, [x28, :lo12:stderr] +; CHECK-FMA: fadd d0, d0, d15 +; CHECK-FMA: fmul d1, d0, d9 +; CHECK-FMA: fmadd d2, d0, d9, d8 +; CHECK-FMA: fnmadd d0, d0, d9, d11 +; CHECK-FMA: mov x1, x19 +; CHECK-FMA: fmadd d0, d2, d14, d0 +; CHECK-GENERIC: fmadd d0, d2, d14, d0 +; CHECK-FMA: mov v2.16b, v8.16b +; CHECK-FMA: fmul d0, d0, d1 +; CHECK-GENERIC: fmul d0, d0, d1 +; CHECK-FMA: 
mov v1.16b, v10.16b +; CHECK-FMA: fmadd d9, d0, d15, d0 +; CHECK-FMA: mov v0.16b, v9.16b +; CHECK-GENERIC: fmadd d9, d0, d15, d0 +; CHECK-GENERIC: tbnz w26, #0, .LBB1_6 +; CHECK-GENERIC: fsub d0, d13, d9 +; CHECK-GENERIC: b .LBB1_7 +; CHECK-GENERIC: fmadd d10, d0, d1, d13 +; CHECK-GENERIC: fsub d0, d10, d15 +; CHECK-GENERIC: fadd d0, d9, d0 +; CHECK-GENERIC: fadd d15, d10, d0 +; CHECK-GENERIC: add x0, sp, #8 +; CHECK-FMA: fsub d0, d12, d9 +; CHECK-FMA: fadd d1, d9, d13 +; CHECK-FMA: fsub d2, d13, d9 +; CHECK-FMA: tst w26, #0x1 +; CHECK-FMA: add x0, sp, #8 +; CHECK-FMA: fmadd d0, d0, d1, d13 +; CHECK-FMA: fmadd d1, d2, d1, d13 +; CHECK-FMA: fcsel d10, d0, d1, ne +; CHECK-FMA: fmul d0, d9, d11 +; CHECK-FMA: fsub d1, d10, d15 +; CHECK-FMA: fmsub d9, d9, d0, d0 +; CHECK-FMA: fmadd d0, d0, d2, d10 +; CHECK-FMA: fadd d15, d0, d1 +; CHECK-GENERIC: mov x1, x20 +; CHECK-GENERIC: mov v0.16b, v9.16b +; CHECK-GENERIC: mov v1.16b, v10.16b +; CHECK-GENERIC: mov v2.16b, v8.16b +; CHECK-GENERIC: b.ne .LBB1_10 +; CHECK-GENERIC: b.ne .LBB1_10 +; CHECK-GENERIC: adrp x1, .L.str.5 +; CHECK-GENERIC: mov v0.16b, v10.16b + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %if.end72 ] + %a.0139 = phi double [ 3.000000e+00, %for.body.lr.ph ], [ %mul35, %if.end72 ] + %b.0138 = phi double [ 5.000000e+00, %for.body.lr.ph ], [ %sub40, %if.end72 ] + %4 = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %4 to double + %conv.neg = fsub fast double -0.000000e+00, %conv + %rem = and i32 %4, 1 + %cmp3 = icmp eq i32 %rem, 0 + %. = select i1 %cmp3, double 1.000000e+00, double -1.000000e+00 + %add = fadd fast double %a.0139, 1.000000e+00 + %add8 = fadd fast double %add, %. + %mul = fmul fast double %add8, %a.0139 + %add9 = fadd fast double %mul, 1.000000e+01 + %mul10 = fmul fast double %add9, 2.000000e+00 + %add11 = fsub fast double %conv.neg, %mul + %sub = fadd fast double %add11, %mul10 + %mul14 = fmul fast double %sub, %mul + %mul15 = fmul fast double %mul14, %. 
+ %sub17 = fadd fast double %mul15, %mul14 + %5 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2 + %call18 = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %5, i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str, i64 0, i64 0), double %sub17, double %b.0138, double 1.000000e+01) #4 + br i1 %cmp3, label %if.then22, label %if.else27 + +if.then22: ; preds = %for.body + %add23 = fadd fast double %sub17, -1.000000e+00 + %sub24 = fadd fast double %sub17, 1.000000e+00 + %mul25 = fmul fast double %add23, %sub24 + %sub26 = fsub fast double 1.000000e+00, %mul25 + br label %if.end32 + +if.else27: ; preds = %for.body + %sub28 = fsub fast double -1.000000e+00, %sub17 + %sub29 = fadd fast double %sub17, 1.000000e+00 + %mul30 = fmul fast double %sub28, %sub29 + %add31 = fadd fast double %mul30, 1.000000e+00 + br label %if.end32 + +if.end32: ; preds = %if.else27, %if.then22 + %b.1 = phi double [ %sub26, %if.then22 ], [ %add31, %if.else27 ] + %sub33 = fsub fast double 1.000000e+00, %sub17 + %mul34 = fmul fast double %sub17, %conv + %mul35 = fmul fast double %mul34, %sub33 + %sub36 = fsub fast double %b.1, %. + %add37 = fadd fast double %b.1, %mul35 + %add38 = fadd fast double %add37, %sub36 + call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %3) #3 + %call.i135 = call fast double @modf(double %conv, double* nonnull %i.i) #3 + %cmp.i = fcmp fast oeq double %call.i135, 0.000000e+00 + %6 = fsub fast double -0.000000e+00, %add38 + %retval.0.p.i = select i1 %cmp.i, double %add38, double %6 + call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %3) #3 + %retval.0.i.neg = fsub fast double %b.1, %conv + %sub40 = fsub fast double %retval.0.i.neg, %retval.0.p.i + %7 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2 + %call41 = tail call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.1, i64 0, i64 0), double %mul35, double %sub40, double 1.000000e+01) #4 + %arrayidx42 = getelementptr inbounds [10 x double], [10 x double]* @AE, i64 0, i64 %indvars.iv + %8 = load double, double* %arrayidx42, align 8, !tbaa !6 + %cmp43 = fcmp fast oeq double %mul35, %8 + br i1 %cmp43, label %land.lhs.true, label %if.else51 + +land.lhs.true: ; preds = %if.end32 + %arrayidx46 = getelementptr inbounds [10 x double], [10 x double]* @BE, i64 0, i64 %indvars.iv + %9 = load double, double* %arrayidx46, align 8, !tbaa !6 + %cmp47 = fcmp fast oeq double %sub40, %9 + br i1 %cmp47, label %if.then49, label %if.else51 + +if.then49: ; preds = %land.lhs.true + %10 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2 + %11 = tail call i64 @fwrite(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str.2, i64 0, i64 0), i64 13, i64 1, %struct._IO_FILE* %10) #4 + br label %if.end72 + +if.else51: ; preds = %land.lhs.true, %if.end32 + %12 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2 + %13 = tail call i64 @fwrite(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.3, i64 0, i64 0), i64 14, i64 1, %struct._IO_FILE* %12) #4 + %cmp55 = fcmp fast une double %mul35, %8 + br i1 %cmp55, label %if.then57, label %if.else61 + +if.then57: ; preds = %if.else51 + %14 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2 + %call60 = tail call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %14, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.4, i64 0, i64 0), double %mul35, double %8) #4 + br label %if.end72 + +if.else61: ; preds = %if.else51 + %arrayidx63 = getelementptr inbounds [10 x double], [10 x double]* @BE, i64 0, i64 %indvars.iv + %15 = load double, double* %arrayidx63, align 8, !tbaa !6 + %cmp64 = fcmp fast une double %sub40, %15 + br i1 %cmp64, label %if.then66, label %if.end72 + +if.then66: ; preds = %if.else61 + %16 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2 + %call69 = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %16, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.5, i64 0, i64 0), double %sub40, double %15) #4 + br label %if.end72 + +if.end72: ; preds = %if.then57, %if.then66, %if.else61, %if.then49 + %17 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8, !tbaa !2 + %18 = tail call i64 @fwrite(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.6, i64 0, i64 0), i64 2, i64 1, %struct._IO_FILE* %17) #4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; Function Attrs: nounwind +declare i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) 
local_unnamed_addr #2 + +; Function Attrs: nounwind +declare i64 @strtol(i8* readonly, i8** nocapture, i32) local_unnamed_addr #2 + +; Function Attrs: nounwind +declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) local_unnamed_addr #3 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="thunderx2t99" "target-features"="+lse,+neon,+v8.1a" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="thunderx2t99" "target-features"="+lse,+neon,+v8.1a" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #3 = { nounwind } +attributes #4 = { cold } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 6.0.0 (http://llvm.org/git/clang.git 9f9177d3ef72580ca29e8844327f63d7aa1908af) (http://llvm.org/git/llvm.git 3e48a4f4584fcf21e300affe64eb228647f4bb13)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"any pointer", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} +!6 = !{!7, !7, i64 0} +!7 = !{!"double", !4, i64 0} Index: test/CodeGen/AArch64/fma-simple.ll =================================================================== --- test/CodeGen/AArch64/fma-simple.ll +++ test/CodeGen/AArch64/fma-simple.ll @@ -0,0 +1,13 @@ +; RUN: llc -O2 -mtriple=aarch64-none-linux-gnu 
-mcpu=thunderx2t99 -fp-contract=fast < %s | FileCheck %s --check-prefix=CHECK-FMA +; RUN: llc -O2 -mtriple=aarch64-none-linux-gnu -mcpu=generic < %s | FileCheck %s --check-prefix=CHECK-GENERIC +define double @test(double %x, double %y, double %z) { +; CHECK-FMA: fmul d3, d0, d1 +; CHECK-FMA: fmadd d0, d0, d1, d2 +; CHECK-GENERIC: fmul d0, d0, d1 +; CHECK-GENERIC: fadd d1, d0, d2 + %mul = fmul fast double %x, %y + %add = fadd fast double %mul, %z + %use2 = fdiv fast double %mul, %add + ret double %use2 +} +