transform fmin/fmax calls when possible (PR24314)

rotateright · rotateright · commit 57fd1dc5db88 · 2015-08-16T20:18:19.000Z
If we can ignore NaNs, fmin/fmax libcalls can become compare and select (this is what we turn std::min / std::max into). This IR should then be optimized in the backend to whatever is best for any given target. E.g., x86 can use minss/maxss instructions. This should solve PR24314: https://llvm.org/bugs/show_bug.cgi?id=24314 Differential Revision: http://reviews.llvm.org/D11866 llvm-svn: 245187
diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -131,6 +131,7 @@ class LibCallSimplifier {
   Value *optimizePow(CallInst *CI, IRBuilder<> &B);
   Value *optimizeExp2(CallInst *CI, IRBuilder<> &B);
   Value *optimizeFabs(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeFMinFMax(CallInst *CI, IRBuilder<> &B);
   Value *optimizeSqrt(CallInst *CI, IRBuilder<> &B);
   Value *optimizeSinCosPi(CallInst *CI, IRBuilder<> &B);
 
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1184,6 +1184,60 @@ Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) {
   return Ret;
 }
 
+Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+
+  // Try narrowing a double call to its float counterpart first, but only when
+  // TLI reports that float version as available; a successful shrink wins.
+  const bool CanTryFMinF =
+      Callee->getName() == "fmin" && TLI->has(LibFunc::fminf);
+  const bool CanTryFMaxF =
+      Callee->getName() == "fmax" && TLI->has(LibFunc::fmaxf);
+  if (CanTryFMinF || CanTryFMaxF)
+    if (Value *Shrunk = optimizeBinaryDoubleFP(CI, B))
+      return Shrunk;
+
+  // Require the prototype FPTy(FPTy, FPTy); anything else is left untouched.
+  FunctionType *Proto = Callee->getFunctionType();
+  if (Proto->getNumParams() != 2)
+    return nullptr;
+  Type *ArgTy = Proto->getParamType(0);
+  if (!ArgTy->isFloatingPointTy() || Proto->getParamType(1) != ArgTy ||
+      Proto->getReturnType() != ArgTy)
+    return nullptr;
+
+  // FIXME: For finer-grain optimization, we need intrinsics to have the same
+  // fast-math flag decorations that are applied to FP instructions. For now,
+  // the attributes of the enclosing function are the only available way to
+  // learn that the call may be relaxed.
+  IRBuilder<>::FastMathFlagGuard RestoreFlags(B);
+  Function *Caller = CI->getParent()->getParent();
+  FastMathFlags Flags;
+  if (Caller->getFnAttribute("unsafe-fp-math").getValueAsString() == "true") {
+    // Unsafe algebra sets all fast-math-flags to true.
+    Flags.setUnsafeAlgebra();
+  } else if (Caller->getFnAttribute("no-nans-fp-math").getValueAsString() ==
+             "true") {
+    // No-NaNs is the minimum requirement. No-signed-zeros is implied by the
+    // definitions of fmax/fmin themselves: "Ideally, fmax would be sensitive
+    // to the sign of zero, for example fmax(-0.0, +0.0) would return +0;
+    // however, implementation in software might be impractical."
+    Flags.setNoSignedZeros();
+    Flags.setNoNaNs();
+  } else {
+    return nullptr;
+  }
+  B.SetFastMathFlags(Flags);
+
+  // With NaN handling out of the way, emit an ordered compare plus a select.
+  // errno and exceptions need no attention: fmin/fmax do not have those.
+  Value *LHS = CI->getArgOperand(0);
+  Value *RHS = CI->getArgOperand(1);
+  const bool IsMin = Callee->getName().startswith("fmin");
+  Value *Cmp = IsMin ? B.CreateFCmpOLT(LHS, RHS) : B.CreateFCmpOGT(LHS, RHS);
+  return B.CreateSelect(Cmp, LHS, RHS);
+}
+
 Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {
   Function *Callee = CI->getCalledFunction();
   
@@ -2110,11 +2164,16 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
         return optimizeUnaryDoubleFP(CI, Builder, true);
       return nullptr;
     case LibFunc::copysign:
-    case LibFunc::fmin:
-    case LibFunc::fmax:
       if (hasFloatVersion(FuncName))
         return optimizeBinaryDoubleFP(CI, Builder);
       return nullptr;
+    case LibFunc::fminf:
+    case LibFunc::fmin:
+    case LibFunc::fminl:
+    case LibFunc::fmaxf:
+    case LibFunc::fmax:
+    case LibFunc::fmaxl:
+      return optimizeFMinFMax(CI, Builder);
     default:
       return nullptr;
     }
diff --git a/llvm/test/Transforms/InstCombine/fast-math.ll b/llvm/test/Transforms/InstCombine/fast-math.ll
@@ -716,3 +716,110 @@ define fp128 @sqrt_call_squared_f128(fp128 %x) #0 {
 ; CHECK-NEXT: ret fp128 %fabs
 }
 
+; =========================================================================
+;
+;   Test-cases for fmin / fmax
+;
+; =========================================================================
+
+declare double @fmax(double, double)
+declare double @fmin(double, double)
+declare float @fmaxf(float, float)
+declare float @fminf(float, float)
+declare fp128 @fmaxl(fp128, fp128)
+declare fp128 @fminl(fp128, fp128)
+
+; No-NaNs is the minimum requirement for replacing these calls.
+; It should always be set whenever unsafe-fp-math is true, but the tests
+; below alternate between the two attribute sets for additional coverage.
+; 'nsz' is implied by the definition of fmax or fmin itself.
+attributes #1 = { "no-nans-fp-math" = "true" }
+
+; Shrink and remove the call.
+define float @max1(float %a, float %b) #0 {
+  %c = fpext float %a to double
+  %d = fpext float %b to double
+  %e = call double @fmax(double %c, double %d)
+  %f = fptrunc double %e to float
+  ret float %f
+; Double fmax on fpext'd floats with #0: expect a float 'fast' ogt cmp+select.
+; CHECK-LABEL: max1(
+; CHECK-NEXT:  fcmp fast ogt float %a, %b 
+; CHECK-NEXT:  select {{.*}} float %a, float %b 
+; CHECK-NEXT:  ret
+}
+
+define float @max2(float %a, float %b) #1 {
+  %c = call float @fmaxf(float %a, float %b)
+  ret float %c
+; fmaxf with #1 (no-nans-fp-math): expect an 'nnan nsz' ogt compare and select.
+; CHECK-LABEL: max2(
+; CHECK-NEXT:  fcmp nnan nsz ogt float %a, %b 
+; CHECK-NEXT:  select {{.*}} float %a, float %b 
+; CHECK-NEXT:  ret
+}
+
+
+define double @max3(double %a, double %b) #0 {
+  %c = call double @fmax(double %a, double %b)
+  ret double %c
+; Plain double fmax with #0: expect a 'fast' ogt compare and select, no call.
+; CHECK-LABEL: max3(
+; CHECK-NEXT:  fcmp fast ogt double %a, %b 
+; CHECK-NEXT:  select {{.*}} double %a, double %b 
+; CHECK-NEXT:  ret
+}
+
+define fp128 @max4(fp128 %a, fp128 %b) #1 {
+  %c = call fp128 @fmaxl(fp128 %a, fp128 %b)
+  ret fp128 %c
+; fp128 fmaxl with #1 (no-nans-fp-math): expect 'nnan nsz' ogt compare+select.
+; CHECK-LABEL: max4(
+; CHECK-NEXT:  fcmp nnan nsz ogt fp128 %a, %b 
+; CHECK-NEXT:  select {{.*}} fp128 %a, fp128 %b 
+; CHECK-NEXT:  ret
+}
+
+; Shrink and remove the call.
+define float @min1(float %a, float %b) #1 {
+  %c = fpext float %a to double
+  %d = fpext float %b to double
+  %e = call double @fmin(double %c, double %d)
+  %f = fptrunc double %e to float
+  ret float %f
+; Double fmin on fpext'd floats with #1: expect float 'nnan nsz' olt cmp+select.
+; CHECK-LABEL: min1(
+; CHECK-NEXT:  fcmp nnan nsz olt float %a, %b 
+; CHECK-NEXT:  select {{.*}} float %a, float %b 
+; CHECK-NEXT:  ret
+}
+
+define float @min2(float %a, float %b) #0 {
+  %c = call float @fminf(float %a, float %b)
+  ret float %c
+; fminf with #0: expect a 'fast' olt compare and select in place of the call.
+; CHECK-LABEL: min2(
+; CHECK-NEXT:  fcmp fast olt float %a, %b 
+; CHECK-NEXT:  select {{.*}} float %a, float %b 
+; CHECK-NEXT:  ret
+}
+
+define double @min3(double %a, double %b) #1 {
+  %c = call double @fmin(double %a, double %b)
+  ret double %c
+; Plain double fmin with #1 (no-nans-fp-math): expect 'nnan nsz' olt cmp+select.
+; CHECK-LABEL: min3(
+; CHECK-NEXT:  fcmp nnan nsz olt double %a, %b 
+; CHECK-NEXT:  select {{.*}} double %a, double %b 
+; CHECK-NEXT:  ret
+}
+
+define fp128 @min4(fp128 %a, fp128 %b) #0 {
+  %c = call fp128 @fminl(fp128 %a, fp128 %b)
+  ret fp128 %c
+; fp128 fminl with #0: expect a 'fast' olt compare and select, no call.
+; CHECK-LABEL: min4(
+; CHECK-NEXT:  fcmp fast olt fp128 %a, %b 
+; CHECK-NEXT:  select {{.*}} fp128 %a, fp128 %b 
+; CHECK-NEXT:  ret
+}