diff --git a/flang/include/flang/Optimizer/Support/InitFIR.h b/flang/include/flang/Optimizer/Support/InitFIR.h
--- a/flang/include/flang/Optimizer/Support/InitFIR.h
+++ b/flang/include/flang/Optimizer/Support/InitFIR.h
@@ -28,7 +28,8 @@
   mlir::AffineDialect, FIROpsDialect, mlir::acc::OpenACCDialect,           \
       mlir::omp::OpenMPDialect, mlir::scf::SCFDialect,                     \
       mlir::arith::ArithmeticDialect, mlir::cf::ControlFlowDialect,        \
-      mlir::func::FuncDialect, mlir::vector::VectorDialect
+      mlir::func::FuncDialect, mlir::vector::VectorDialect,                \
+      mlir::math::MathDialect

 // The definitive list of dialects used by flang.
 #define FLANG_DIALECT_LIST                                                 \
diff --git a/flang/lib/Lower/IntrinsicCall.cpp b/flang/lib/Lower/IntrinsicCall.cpp
--- a/flang/lib/Lower/IntrinsicCall.cpp
+++ b/flang/lib/Lower/IntrinsicCall.cpp
@@ -35,6 +35,7 @@
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/Support/FatalError.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -951,6 +952,40 @@
 // Math runtime description and matching utility
 //===----------------------------------------------------------------------===//

+/// Command line option to control how math operations are lowered
+/// into MLIR.
+/// Going forward, most of the math operations have to be lowered
+/// to some MLIR dialect operations, which are converted to
+/// library calls at the end of the FIR pipeline. Basically,
+/// the library call generation for these math operations will
+/// happen late, during FIR conversion.
+///
+/// Exposing MLIR operations early can potentially enable more
+/// MLIR optimizations. At the same time, there are some issues
+/// with doing this, e.g. 'math' dialect operations do not model
+/// strict FP behavior right now, so the optimizations may change
+/// the program behavior compared to when we represent intrinsic
+/// mathematical operations with generic calls. In order to preserve
+/// strict FP behavior with late math lowering, we have to extend
+/// the dialects used by the late lowering so that they model strict
+/// FP behavior properly.
+enum MathLoweringMode {
+  // Lower math operations according to the mathRuntimeVersion selection
+  // defined below.
+  earlyLowering,
+
+  // Lower math operations into operations of MLIR dialects,
+  // such as mlir::math, mlir::complex, etc.
+  lateLowering,
+};
+
+llvm::cl::opt<MathLoweringMode> mathLowering(
+    "math-lowering", llvm::cl::desc("Select math operations lowering mode:"),
+    llvm::cl::values(
+        clEnumValN(earlyLowering, "early", "lower to library calls early"),
+        clEnumValN(lateLowering, "late", "lower to MLIR dialect operations")),
+    llvm::cl::init(earlyLowering));
+
 /// Command line option to modify math runtime version used to implement
 /// intrinsics.
 enum MathRuntimeVersion {
@@ -975,6 +1010,8 @@
   // Needed for implicit compare with keys.
   constexpr operator Key() const { return key; }
   Key key; // intrinsic name
+
+  // Name of a runtime function that implements the operation.
   llvm::StringRef symbol;
   fir::runtime::FuncTypeBuilderFunc typeGenerator;
 };
@@ -1041,9 +1078,160 @@
   return mlir::FunctionType::get(context, {t}, {r});
 }

-// TODO : Fill-up this table with more intrinsic.
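+// The two builders below produce function types of the form
+// (fN, i<Bits>) -> fN; e.g. the 'llvm.powi.f64.i32' entry in the
+// mathOperations table below is typed with genF64F64IntFuncType<32>.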
+template <int Bits>
+static mlir::FunctionType genF64F64IntFuncType(mlir::MLIRContext *context) {
+  auto ftype = mlir::FloatType::getF64(context);
+  auto itype = mlir::IntegerType::get(context, Bits);
+  return mlir::FunctionType::get(context, {ftype, itype}, {ftype});
+}
+
+template <int Bits>
+static mlir::FunctionType genF32F32IntFuncType(mlir::MLIRContext *context) {
+  auto ftype = mlir::FloatType::getF32(context);
+  auto itype = mlir::IntegerType::get(context, Bits);
+  return mlir::FunctionType::get(context, {ftype, itype}, {ftype});
+}
+
+/// Callback type for generating lowering for a math operation.
+using MathGeneratorTy = mlir::Value (*)(fir::FirOpBuilder &, mlir::Location,
+                                        llvm::StringRef name,
+                                        mlir::FunctionType funcType,
+                                        llvm::ArrayRef<mlir::Value>);
+
+struct MathOperation {
+  // llvm::StringRef comparison operators are not constexpr, so use
+  // std::string_view.
+  using Key = std::string_view;
+  // Needed for implicit compare with keys.
+  constexpr operator Key() const { return key; }
+  Key key; // intrinsic name
+
+  // Name of a runtime function that implements the operation.
+  llvm::StringRef symbol;
+  fir::runtime::FuncTypeBuilderFunc typeGenerator;
+
+  // If funcGenerator is non-null, it generates the lowering code;
+  // otherwise, the lowering is done as a call to the runtime
+  // function named by the 'symbol' member.
+  MathGeneratorTy funcGenerator;
+};
+
+static mlir::Value genLibCall(fir::FirOpBuilder &builder, mlir::Location loc,
+                              llvm::StringRef name, mlir::FunctionType funcType,
+                              llvm::ArrayRef<mlir::Value> args) {
+  LLVM_DEBUG(llvm::dbgs() << "Generating '" << name << "' call with type ";
+             funcType.dump(); llvm::dbgs() << "\n");
+  mlir::func::FuncOp funcOp = builder.addNamedFunction(loc, name, funcType);
+  // TODO: ensure 'strictfp' setting on the call for "precise/strict"
+  // FP mode. Set appropriate Fast-Math Flags otherwise.
+  // TODO: we should also mark as many libm functions as possible
+  // with the 'pure' attribute (of course, not in strict FP mode).
+  auto libCall = builder.create<fir::CallOp>(loc, funcOp, args);
+  LLVM_DEBUG(libCall.dump(); llvm::dbgs() << "\n");
+  return libCall.getResult(0);
+}
+
+template <typename T>
+static mlir::Value genMathOp(fir::FirOpBuilder &builder, mlir::Location loc,
+                             llvm::StringRef name, mlir::FunctionType funcType,
+                             llvm::ArrayRef<mlir::Value> args) {
+  // TODO: we have to annotate the math operations with flags
+  // that allow defining the FP accuracy/exception behavior
+  // per operation, so that after early multi-module MLIR
+  // inlining we can distinguish operations that were compiled
+  // with different settings.
+  // Suggestion:
+  //  * For "relaxed" FP mode set all Fast-Math Flags
+  //    (see "[RFC] FastMath flags support in MLIR (arith dialect)"
+  //    topic at discourse.llvm.org).
+  //  * For "fast" FP mode set all Fast-Math Flags except 'afn'.
+  //  * For "precise/strict" FP mode generate fir.calls to libm
+  //    entries and annotate them with an attribute that will
+  //    end up transformed into the 'strictfp' LLVM attribute (TBD).
+  //    Elsewhere, "precise/strict" FP mode should also set
+  //    'strictfp' for all user functions and calls so that
+  //    the LLVM backend does the right job.
+  //  * Operations that cannot be reasonably optimized in MLIR
+  //    can also be lowered to libm calls for the "fast" and
+  //    "relaxed" modes.
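+  // For example, with T = mlir::math::SinOp the non-precise modes
+  // produce a single 'math.sin' operation here, while preciseVersion
+  // goes through genLibCall and produces 'fir.call @sinf' instead
+  // (this is exactly what the late-math-lowering.f90 test below checks).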
+  mlir::Value result;
+  if (mathRuntimeVersion == preciseVersion) {
+    result = genLibCall(builder, loc, name, funcType, args);
+  } else {
+    LLVM_DEBUG(llvm::dbgs()
+                   << "Generating '" << name << "' operation with type ";
+               funcType.dump(); llvm::dbgs() << "\n");
+    result = builder.create<T>(loc, args);
+  }
+  LLVM_DEBUG(result.dump(); llvm::dbgs() << "\n");
+  return result;
+}
+
+/// Map mathematical intrinsic operations into MLIR operations
+/// of some appropriate dialect (math, complex, etc.) or libm
+/// calls.
+/// TODO: support more operations here.
+static constexpr MathOperation mathOperations[] = {
+    {"abs", "fabsf", genF32F32FuncType, genMathOp<mlir::math::AbsOp>},
+    {"abs", "fabs", genF64F64FuncType, genMathOp<mlir::math::AbsOp>},
+    // llvm.trunc behaves the same way as libm's trunc.
+    {"aint", "llvm.trunc.f32", genF32F32FuncType, genLibCall},
+    {"aint", "llvm.trunc.f64", genF64F64FuncType, genLibCall},
+    // llvm.round behaves the same way as libm's round.
+    {"anint", "llvm.round.f32", genF32F32FuncType,
+     genMathOp<mlir::LLVM::RoundOp>},
+    {"anint", "llvm.round.f64", genF64F64FuncType,
+     genMathOp<mlir::LLVM::RoundOp>},
+    {"atan", "atanf", genF32F32FuncType, genMathOp<mlir::math::AtanOp>},
+    {"atan", "atan", genF64F64FuncType, genMathOp<mlir::math::AtanOp>},
+    {"atan2", "atan2f", genF32F32F32FuncType, genMathOp<mlir::math::Atan2Op>},
+    {"atan2", "atan2", genF64F64F64FuncType, genMathOp<mlir::math::Atan2Op>},
+    // math::CeilOp returns a real, while Fortran CEILING returns an integer.
+    {"ceil", "ceilf", genF32F32FuncType, genMathOp<mlir::math::CeilOp>},
+    {"ceil", "ceil", genF64F64FuncType, genMathOp<mlir::math::CeilOp>},
+    {"cos", "cosf", genF32F32FuncType, genMathOp<mlir::math::CosOp>},
+    {"cos", "cos", genF64F64FuncType, genMathOp<mlir::math::CosOp>},
+    {"erf", "erff", genF32F32FuncType, genMathOp<mlir::math::ErfOp>},
+    {"erf", "erf", genF64F64FuncType, genMathOp<mlir::math::ErfOp>},
+    {"exp", "expf", genF32F32FuncType, genMathOp<mlir::math::ExpOp>},
+    {"exp", "exp", genF64F64FuncType, genMathOp<mlir::math::ExpOp>},
+    // math::FloorOp returns a real, while Fortran FLOOR returns an integer.
+    {"floor", "floorf", genF32F32FuncType, genMathOp<mlir::math::FloorOp>},
+    {"floor", "floor", genF64F64FuncType, genMathOp<mlir::math::FloorOp>},
+    {"hypot", "hypotf", genF32F32F32FuncType, genLibCall},
+    {"hypot", "hypot", genF64F64F64FuncType, genLibCall},
+    {"log", "logf", genF32F32FuncType, genMathOp<mlir::math::LogOp>},
+    {"log", "log", genF64F64FuncType, genMathOp<mlir::math::LogOp>},
+    {"log10", "log10f", genF32F32FuncType, genMathOp<mlir::math::Log10Op>},
+    {"log10", "log10", genF64F64FuncType, genMathOp<mlir::math::Log10Op>},
+    // llvm.lround behaves the same way as libm's lround.
+    {"nint", "llvm.lround.i64.f64", genIntF64FuncType<64>, genLibCall},
+    {"nint", "llvm.lround.i64.f32", genIntF32FuncType<64>, genLibCall},
+    {"nint", "llvm.lround.i32.f64", genIntF64FuncType<32>, genLibCall},
+    {"nint", "llvm.lround.i32.f32", genIntF32FuncType<32>, genLibCall},
+    {"pow", "powf", genF32F32F32FuncType, genMathOp<mlir::math::PowFOp>},
+    {"pow", "pow", genF64F64F64FuncType, genMathOp<mlir::math::PowFOp>},
+    // TODO: add PowIOp in math and complex dialects.
+    {"pow", "llvm.powi.f32.i32", genF32F32IntFuncType<32>, genLibCall},
+    {"pow", "llvm.powi.f64.i32", genF64F64IntFuncType<32>, genLibCall},
+    {"sign", "copysignf", genF32F32F32FuncType,
+     genMathOp<mlir::math::CopySignOp>},
+    {"sign", "copysign", genF64F64F64FuncType,
+     genMathOp<mlir::math::CopySignOp>},
+    {"sin", "sinf", genF32F32FuncType, genMathOp<mlir::math::SinOp>},
+    {"sin", "sin", genF64F64FuncType, genMathOp<mlir::math::SinOp>},
+    {"sqrt", "sqrtf", genF32F32FuncType, genMathOp<mlir::math::SqrtOp>},
+    {"sqrt", "sqrt", genF64F64FuncType, genMathOp<mlir::math::SqrtOp>},
+    {"tanh", "tanhf", genF32F32FuncType, genMathOp<mlir::math::TanhOp>},
+    {"tanh", "tanh", genF64F64FuncType, genMathOp<mlir::math::TanhOp>},
+};
+
 // Note: These are also defined as operations in LLVM dialect. See if this
 // can be use and has advantages.
+// TODO: remove this table, since the late math lowering should
+// replace it and generate proper MLIR operations rather
+// than llvm intrinsic calls, which still look like generic
+// calls to MLIR and do not enable many optimizations.
 static constexpr RuntimeFunction llvmIntrinsics[] = {
     {"abs", "llvm.fabs.f32", genF32F32FuncType},
     {"abs", "llvm.fabs.f64", genF64F64FuncType},
@@ -1242,7 +1430,7 @@
 /// function type and that will not imply narrowing arguments or extending the
 /// result.
 /// If nothing is found, the mlir::func::FuncOp will contain a nullptr.
-mlir::func::FuncOp searchFunctionInLibrary(
+static mlir::func::FuncOp searchFunctionInLibrary(
     mlir::Location loc, fir::FirOpBuilder &builder,
     const Fortran::common::StaticMultimapView<RuntimeFunction> &lib,
     llvm::StringRef name, mlir::FunctionType funcType,
@@ -1265,6 +1453,65 @@
   return {};
 }

+using RtMap = Fortran::common::StaticMultimapView<MathOperation>;
+static constexpr RtMap mathOps(mathOperations);
+static_assert(mathOps.Verify() && "map must be sorted");
+
+static const MathOperation *
+searchMathOperation(fir::FirOpBuilder &builder, llvm::StringRef name,
+                    mlir::FunctionType funcType,
+                    const MathOperation **bestNearMatch,
+                    FunctionDistance &bestMatchDistance) {
+  auto range = mathOps.equal_range(name);
+  for (auto iter = range.first; iter != range.second && iter; ++iter) {
+    const auto &impl = *iter;
+    auto implType = impl.typeGenerator(builder.getContext());
+    if (funcType == implType)
+      return &impl; // exact match
+
+    FunctionDistance distance(funcType, implType);
+    if (distance.isSmallerThan(bestMatchDistance)) {
+      *bestNearMatch = &impl;
+      bestMatchDistance = std::move(distance);
+    }
+  }
+  return nullptr;
+}
+
+/// The implementation of the operation defined by \p name with type
+/// \p funcType is not exact, and the actual available implementation
+/// is \p distance away from the requested one. If using the available
+/// implementation results in a precision loss, emit an error message
+/// at the given code location \p loc.
+static void diagPrecisionLoss(llvm::StringRef name, mlir::FunctionType funcType,
+                              const FunctionDistance &distance,
+                              mlir::Location loc) {
+  if (!distance.isLosingPrecision())
+    return;
+
+  // Using this runtime version requires narrowing the arguments
+  // or extending the result. It is not numerically safe. There
+  // is currently no quad math library that was described in
+  // lowering and could be used here. Emit an error and continue
+  // generating the code with the narrowing cast so that the user
+  // can get a complete list of the problematic intrinsic calls.
+  std::string message("TODO: no math runtime available for '");
+  llvm::raw_string_ostream sstream(message);
+  if (name == "pow") {
+    assert(funcType.getNumInputs() == 2 && "power operator has two arguments");
+    sstream << funcType.getInput(0) << " ** " << funcType.getInput(1);
+  } else {
+    sstream << name << "(";
+    if (funcType.getNumInputs() > 0)
+      sstream << funcType.getInput(0);
+    for (mlir::Type argType : funcType.getInputs().drop_front())
+      sstream << ", " << argType;
+    sstream << ")";
+  }
+  sstream << "'";
+  mlir::emitError(loc, message);
+}
+
 /// Search runtime for the best runtime function given an intrinsic name
 /// and interface. The interface may not be a perfect match in which case
 /// the caller is responsible to insert argument and return value conversions.
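Both searchMathOperation above and the runtime-function search below follow the same matching scheme: an exact signature match wins immediately, and otherwise the closest viable candidate is remembered for the near-match path. Here is a minimal, self-contained sketch of that scheme; the `Op` struct, the integer `width` stand-in for `mlir::FunctionType`/`FunctionDistance`, and all names below are illustrative assumptions, not flang's actual API.

```cpp
#include <climits>
#include <cstdlib>
#include <string_view>

struct Op {
  std::string_view key; // intrinsic name; the table must stay sorted by key
  int width;            // stand-in for the type built by typeGenerator
};

// Sorted by key, mirroring the static_assert(mathOps.Verify()) requirement.
constexpr Op table[] = {{"cos", 32}, {"cos", 64}, {"sin", 32}, {"sin", 64}};

// Returns the exact match, or nullptr while updating the best near match,
// just as searchMathOperation updates bestNearMatch/bestMatchDistance.
const Op *search(std::string_view name, int width, const Op *&bestNearMatch,
                 int &bestDistance) {
  for (const Op &op : table) {
    if (op.key != name)
      continue;
    if (op.width == width)
      return &op; // exact match
    int distance = std::abs(op.width - width); // cf. FunctionDistance
    if (distance < bestDistance) {
      bestNearMatch = &op;
      bestDistance = distance;
    }
  }
  return nullptr;
}

int main() {
  const Op *nearMatch = nullptr;
  int distance = INT_MAX;
  return search("sin", 64, nearMatch, distance) ? 0 : 1; // finds {"sin", 64}
}
```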
@@ -1283,6 +1530,7 @@
   static_assert(pgmathR.Verify() && "map must be sorted");
   static constexpr RtMap pgmathP(pgmathPrecise);
   static_assert(pgmathP.Verify() && "map must be sorted");
+
   if (mathRuntimeVersion == fastVersion) {
     match = searchFunctionInLibrary(loc, builder, pgmathF, name, funcType,
                                     &bestNearMatch, bestMatchDistance);
@@ -1308,30 +1556,7 @@
     return exactMatch;

   if (bestNearMatch != nullptr) {
-    if (bestMatchDistance.isLosingPrecision()) {
-      // Using this runtime version requires narrowing the arguments
-      // or extending the result. It is not numerically safe. There
-      // is currently no quad math library that was described in
-      // lowering and could be used here. Emit an error and continue
-      // generating the code with the narrowing cast so that the user
-      // can get a complete list of the problematic intrinsic calls.
-      std::string message("TODO: no math runtime available for '");
-      llvm::raw_string_ostream sstream(message);
-      if (name == "pow") {
-        assert(funcType.getNumInputs() == 2 &&
-               "power operator has two arguments");
-        sstream << funcType.getInput(0) << " ** " << funcType.getInput(1);
-      } else {
-        sstream << name << "(";
-        if (funcType.getNumInputs() > 0)
-          sstream << funcType.getInput(0);
-        for (mlir::Type argType : funcType.getInputs().drop_front())
-          sstream << ", " << argType;
-        sstream << ")";
-      }
-      sstream << "'";
-      mlir::emitError(loc, message);
-    }
+    diagPrecisionLoss(name, funcType, bestMatchDistance, loc);
     return getFuncOp(loc, builder, *bestNearMatch);
   }
   return {};
@@ -1540,7 +1765,7 @@
   IntrinsicLibrary::RuntimeCallGenerator runtimeCallGenerator =
       getRuntimeCallGenerator(name, soughtFuncType);
   return genElementalCall(runtimeCallGenerator, name, *resultType, args,
-                          /* outline */ true);
+                          /*outline=*/outlineAllIntrinsics);
 }

 mlir::Value
@@ -1692,29 +1917,58 @@
 IntrinsicLibrary::RuntimeCallGenerator
 IntrinsicLibrary::getRuntimeCallGenerator(llvm::StringRef name,
                                           mlir::FunctionType soughtFuncType) {
-  mlir::func::FuncOp funcOp =
-      getRuntimeFunction(loc, builder, name, soughtFuncType);
-  if (!funcOp) {
+  mlir::func::FuncOp funcOp;
+  mlir::FunctionType actualFuncType;
+  const MathOperation *mathOp = nullptr;
+  if (mathLowering == lateLowering) {
+    // Look for a dedicated math operation generator, which
+    // normally produces a single MLIR operation implementing
+    // the math operation.
+    // If none is found, fall back to a runtime function lookup.
+    const MathOperation *bestNearMatch = nullptr;
+    FunctionDistance bestMatchDistance;
+    mathOp = searchMathOperation(builder, name, soughtFuncType, &bestNearMatch,
+                                 bestMatchDistance);
+    if (!mathOp && bestNearMatch) {
+      // Use the best near match, optionally issuing an error
+      // if type conversions cause precision loss.
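+      // (E.g. a quad-precision call matched against the f64
+      // implementation: no quad math library is described in
+      // lowering currently.)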
+      diagPrecisionLoss(name, soughtFuncType, bestMatchDistance, loc);
+      mathOp = bestNearMatch;
+    }
+    if (mathOp)
+      actualFuncType = mathOp->typeGenerator(builder.getContext());
+  }
+  if (!mathOp)
+    if ((funcOp = getRuntimeFunction(loc, builder, name, soughtFuncType)))
+      actualFuncType = funcOp.getFunctionType();
+
+  if (!mathOp && !funcOp) {
     std::string buffer("not yet implemented: missing intrinsic lowering: ");
     llvm::raw_string_ostream sstream(buffer);
     sstream << name << "\nrequested type was: " << soughtFuncType << '\n';
     fir::emitFatalError(loc, buffer);
   }

-  mlir::FunctionType actualFuncType = funcOp.getFunctionType();
   assert(actualFuncType.getNumResults() == soughtFuncType.getNumResults() &&
          actualFuncType.getNumInputs() == soughtFuncType.getNumInputs() &&
          actualFuncType.getNumResults() == 1 && "Bad intrinsic match");

-  return [funcOp, actualFuncType,
+  return [funcOp, actualFuncType, mathOp,
           soughtFuncType](fir::FirOpBuilder &builder, mlir::Location loc,
                           llvm::ArrayRef<mlir::Value> args) {
     llvm::SmallVector<mlir::Value> convertedArguments;
     for (auto [fst, snd] : llvm::zip(actualFuncType.getInputs(), args))
       convertedArguments.push_back(builder.createConvert(loc, fst, snd));
-    auto call = builder.create<fir::CallOp>(loc, funcOp, convertedArguments);
+    mlir::Value result;
+    // Use the math operation generator, if available.
+    if (mathOp)
+      result = mathOp->funcGenerator(builder, loc, mathOp->symbol,
+                                     actualFuncType, convertedArguments);
+    else
+      result = builder.create<fir::CallOp>(loc, funcOp, convertedArguments)
+                   .getResult(0);
     mlir::Type soughtType = soughtFuncType.getResult(0);
-    return builder.createConvert(loc, soughtType, call.getResult(0));
+    return builder.createConvert(loc, soughtType, result);
   };
 }

@@ -1880,7 +2134,7 @@
                                     llvm::ArrayRef<mlir::Value> args) {
   assert(args.size() == 1);
   return fir::factory::Complex{builder, loc}.extractComplexPart(
-      args[0], true /* isImagPart */);
+      args[0], /*isImagPart=*/true);
 }

 // AINT
@@ -3896,6 +4150,13 @@
 mlir::Value Fortran::lower::genPow(fir::FirOpBuilder &builder,
                                    mlir::Location loc, mlir::Type type,
                                    mlir::Value x, mlir::Value y) {
+  // TODO: since there is no libm version of pow with an integer exponent,
+  // we have to provide an alternative implementation for
+  // "precise/strict" FP mode and (mathLowering == lateLowering).
+  // One option is to generate an internal function with an inlined
+  // implementation and mark it 'strictfp'.
+  // Another option is to implement it in the Fortran runtime library
+  // (just like matmul).
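+  // Note that 'x ** i' currently maps to an 'llvm.powi.*' call through
+  // the mathOperations table above in all modes, including precise
+  // (see the powi checks in the tests below), hence the need for an
+  // alternative implementation here.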
   return IntrinsicLibrary{builder, loc}.genRuntimeCall("pow", type, {x, y});
 }
diff --git a/flang/lib/Optimizer/CodeGen/CMakeLists.txt b/flang/lib/Optimizer/CodeGen/CMakeLists.txt
--- a/flang/lib/Optimizer/CodeGen/CMakeLists.txt
+++ b/flang/lib/Optimizer/CodeGen/CMakeLists.txt
@@ -17,6 +17,8 @@
   FIRBuilder
   FIRDialect
   FIRSupport
+  MLIRMathToLLVM
+  MLIRMathToLibm
   MLIROpenMPToLLVM
   MLIRLLVMToLLVMIRTranslation
   MLIRTargetLLVMIRExport
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -23,6 +23,8 @@
 #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
 #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
+#include "mlir/Conversion/MathToLibm/MathToLibm.h"
 #include "mlir/Conversion/OpenMPToLLVM/ConvertOpenMPToLLVM.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Matchers.h"
@@ -3380,6 +3382,10 @@
                                                                     pattern);
     mlir::cf::populateControlFlowToLLVMConversionPatterns(typeConverter,
                                                           pattern);
+    // Convert math-like dialect operations, which can be produced
+    // when the late math lowering mode is used, into the llvm dialect.
+    mlir::populateMathToLLVMConversionPatterns(typeConverter, pattern);
+    mlir::populateMathToLibmConversionPatterns(pattern, /*benefit=*/0);
     mlir::ConversionTarget target{*context};
     target.addLegalDialect<mlir::LLVM::LLVMDialect>();
     // The OpenMP dialect is legal for Operations without regions, for those
diff --git a/flang/test/Intrinsics/late-math-codegen.f90 b/flang/test/Intrinsics/late-math-codegen.f90
new file mode 100644
--- /dev/null
+++ b/flang/test/Intrinsics/late-math-codegen.f90
@@ -0,0 +1,178 @@
+! TODO: verify that Fast-Math Flags and 'strictfp' are properly set.
+! RUN: bbc -emit-fir %s -o - --math-lowering=late --math-runtime=fast | fir-opt --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu" | FileCheck --check-prefixes=ALL,FAST %s
+! RUN: bbc -emit-fir %s -o - --math-lowering=late --math-runtime=relaxed | fir-opt --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu" | FileCheck --check-prefixes=ALL,RELAXED %s
+! RUN: bbc -emit-fir %s -o - --math-lowering=late --math-runtime=precise | fir-opt --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu" | FileCheck --check-prefixes=ALL,PRECISE %s
+
+! ALL-LABEL: @_QPtest_real4
+! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.fabs"({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.fabs"({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @fabsf({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+
+! FAST: {{%[A-Za-z0-9._]+}} = llvm.call @hypotf({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32
+! RELAXED: {{%[A-Za-z0-9._]+}} = llvm.call @hypotf({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32
+! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @hypotf({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32
+
+! ALL: {{%[A-Za-z0-9._]+}} = llvm.call @llvm.trunc.f32({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+
+! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.round"({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.round"({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @llvm.round.f32({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+
+! FAST: {{%[A-Za-z0-9._]+}} = llvm.call @atanf({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+! RELAXED: {{%[A-Za-z0-9._]+}} = llvm.call @atanf({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+! 
PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @atanf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 + +! FAST: {{%[A-Za-z0-9._]+}} = llvm.call @atan2f({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = llvm.call @atan2f({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @atan2f({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32 + +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.ceil"({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.ceil"({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @ceilf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 + +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.cos"({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.cos"({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @cosf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 + +! FAST: {{%[A-Za-z0-9._]+}} = llvm.call @erff({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = llvm.call @erff({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @erff({{%[A-Za-z0-9._]+}}) : (f32) -> f32 + +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.exp"({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.exp"({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @expf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 + +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.floor"({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.floor"({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @floorf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 + +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.log"({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.log"({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @logf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 + +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.log10"({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.log10"({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @log10f({{%[A-Za-z0-9._]+}}) : (f32) -> f32 + +! ALL: {{%[A-Za-z0-9._]+}} = llvm.call @llvm.lround.i32.f32({{%[A-Za-z0-9._]+}}) : (f32) -> i32 + +! ALL: {{%[A-Za-z0-9._]+}} = llvm.call @llvm.lround.i64.f32({{%[A-Za-z0-9._]+}}) : (f32) -> i64 + +! ALL: [[STOI:%[A-Za-z0-9._]+]] = llvm.sext {{%[A-Za-z0-9._]+}} : i16 to i32 +! ALL: {{%[A-Za-z0-9._]+}} = llvm.call @llvm.powi.f32.i32({{%[A-Za-z0-9._]+}}, [[STOI]]) : (f32, i32) -> f32 + +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.pow"({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.pow"({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @powf({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32 + +! ALL: {{%[A-Za-z0-9._]+}} = llvm.call @llvm.powi.f32.i32({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, i32) -> f32 + +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.copysign"({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.copysign"({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @copysignf({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32 + +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.sin"({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! 
RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.sin"({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @sinf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 + +! FAST: {{%[A-Za-z0-9._]+}} = llvm.call @tanhf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = llvm.call @tanhf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @tanhf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 + +function test_real4(x, y, c, s, i) + real :: x, y, test_real4 + complex(4) :: c + integer(2) :: s + integer(4) :: i + test_real4 = abs(x) + abs(c) + aint(x) + anint(x) + atan(x) + atan2(x, y) + & + ceiling(x) + cos(x) + erf(x) + exp(x) + floor(x) + log(x) + log10(x) + & + nint(x, 4) + nint(x, 8) + x ** s + x ** y + x ** i + sign(x, y) + & + sin(x) + tanh(x) +end function + +! ALL-LABEL: @_QPtest_real8 +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.fabs"({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.fabs"({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @fabs({{%[A-Za-z0-9._]+}}) : (f64) -> f64 + +! FAST: {{%[A-Za-z0-9._]+}} = llvm.call @hypot({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = llvm.call @hypot({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @hypot({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64 + +! ALL: {{%[A-Za-z0-9._]+}} = llvm.call @llvm.trunc.f64({{%[A-Za-z0-9._]+}}) : (f64) -> f64 + +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.round"({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.round"({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @llvm.round.f64({{%[A-Za-z0-9._]+}}) : (f64) -> f64 + +! FAST: {{%[A-Za-z0-9._]+}} = llvm.call @atan({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = llvm.call @atan({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @atan({{%[A-Za-z0-9._]+}}) : (f64) -> f64 + +! FAST: {{%[A-Za-z0-9._]+}} = llvm.call @atan2({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = llvm.call @atan2({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @atan2({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64 + +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.ceil"({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.ceil"({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @ceil({{%[A-Za-z0-9._]+}}) : (f64) -> f64 + +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.cos"({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.cos"({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @cos({{%[A-Za-z0-9._]+}}) : (f64) -> f64 + +! FAST: {{%[A-Za-z0-9._]+}} = llvm.call @erf({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = llvm.call @erf({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @erf({{%[A-Za-z0-9._]+}}) : (f64) -> f64 + +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.exp"({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.exp"({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @exp({{%[A-Za-z0-9._]+}}) : (f64) -> f64 + +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.floor"({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! 
RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.floor"({{%[A-Za-z0-9._]+}}) : (f64) -> f64
+! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @floor({{%[A-Za-z0-9._]+}}) : (f64) -> f64
+
+! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.log"({{%[A-Za-z0-9._]+}}) : (f64) -> f64
+! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.log"({{%[A-Za-z0-9._]+}}) : (f64) -> f64
+! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @log({{%[A-Za-z0-9._]+}}) : (f64) -> f64
+
+! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.log10"({{%[A-Za-z0-9._]+}}) : (f64) -> f64
+! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.log10"({{%[A-Za-z0-9._]+}}) : (f64) -> f64
+! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @log10({{%[A-Za-z0-9._]+}}) : (f64) -> f64
+
+! ALL: {{%[A-Za-z0-9._]+}} = llvm.call @llvm.lround.i32.f64({{%[A-Za-z0-9._]+}}) : (f64) -> i32
+
+! ALL: {{%[A-Za-z0-9._]+}} = llvm.call @llvm.lround.i64.f64({{%[A-Za-z0-9._]+}}) : (f64) -> i64
+
+! ALL: [[STOI:%[A-Za-z0-9._]+]] = llvm.sext {{%[A-Za-z0-9._]+}} : i16 to i32
+! ALL: {{%[A-Za-z0-9._]+}} = llvm.call @llvm.powi.f64.i32({{%[A-Za-z0-9._]+}}, [[STOI]]) : (f64, i32) -> f64
+
+! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.pow"({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64
+! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.pow"({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64
+! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @pow({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64
+
+! ALL: {{%[A-Za-z0-9._]+}} = llvm.call @llvm.powi.f64.i32({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, i32) -> f64
+
+! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.copysign"({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64
+! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.copysign"({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64
+! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @copysign({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64
+
+! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.sin"({{%[A-Za-z0-9._]+}}) : (f64) -> f64
+! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.sin"({{%[A-Za-z0-9._]+}}) : (f64) -> f64
+! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @sin({{%[A-Za-z0-9._]+}}) : (f64) -> f64
+
+! FAST: {{%[A-Za-z0-9._]+}} = llvm.call @tanh({{%[A-Za-z0-9._]+}}) : (f64) -> f64
+! RELAXED: {{%[A-Za-z0-9._]+}} = llvm.call @tanh({{%[A-Za-z0-9._]+}}) : (f64) -> f64
+! PRECISE: {{%[A-Za-z0-9._]+}} = llvm.call @tanh({{%[A-Za-z0-9._]+}}) : (f64) -> f64
+
+function test_real8(x, y, c, s, i)
+  real(8) :: x, y, test_real8
+  complex(8) :: c
+  integer(2) :: s
+  integer(4) :: i
+  test_real8 = abs(x) + abs(c) + aint(x) + anint(x) + atan(x) + atan2(x, y) + &
+      ceiling(x) + cos(x) + erf(x) + exp(x) + floor(x) + log(x) + log10(x) + &
+      nint(x, 4) + nint(x, 8) + x ** s + x ** y + x ** i + sign(x, y) + &
+      sin(x) + tanh(x)
+end function
diff --git a/flang/test/Lower/Intrinsics/exp.f90 b/flang/test/Lower/Intrinsics/exp.f90
--- a/flang/test/Lower/Intrinsics/exp.f90
+++ b/flang/test/Lower/Intrinsics/exp.f90
@@ -1,5 +1,5 @@
-! RUN: bbc -emit-fir %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -outline-intrinsics %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -mllvm -outline-intrinsics %s -o - | FileCheck %s

 ! CHECK-LABEL: exp_testr
 ! CHECK-SAME: (%[[AREF:.*]]: !fir.ref<f32> {{.*}}, %[[BREF:.*]]: !fir.ref<f32> {{.*}})
diff --git a/flang/test/Lower/Intrinsics/log.f90 b/flang/test/Lower/Intrinsics/log.f90
--- a/flang/test/Lower/Intrinsics/log.f90
+++ b/flang/test/Lower/Intrinsics/log.f90
@@ -1,5 +1,5 @@
-! RUN: bbc -emit-fir %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -outline-intrinsics %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -mllvm -outline-intrinsics %s -o - | FileCheck %s

 ! CHECK-LABEL: log_testr
 ! CHECK-SAME: (%[[AREF:.*]]: !fir.ref<f32> {{.*}}, %[[BREF:.*]]: !fir.ref<f32> {{.*}})
diff --git a/flang/test/Lower/Intrinsics/math-runtime-options.f90 b/flang/test/Lower/Intrinsics/math-runtime-options.f90
--- a/flang/test/Lower/Intrinsics/math-runtime-options.f90
+++ b/flang/test/Lower/Intrinsics/math-runtime-options.f90
@@ -1,7 +1,7 @@
-! RUN: bbc -emit-fir --math-runtime=fast %s -o - | FileCheck %s --check-prefixes="FIR,FAST"
-! RUN: bbc -emit-fir --math-runtime=relaxed %s -o - | FileCheck %s --check-prefixes="FIR,RELAXED"
-! RUN: bbc -emit-fir --math-runtime=precise %s -o - | FileCheck %s --check-prefixes="FIR,PRECISE"
-! RUN: bbc -emit-fir --math-runtime=llvm %s -o - | FileCheck %s --check-prefixes="FIR,LLVM"
+! RUN: bbc -emit-fir --math-runtime=fast -outline-intrinsics %s -o - | FileCheck %s --check-prefixes="FIR,FAST"
+! RUN: bbc -emit-fir --math-runtime=relaxed -outline-intrinsics %s -o - | FileCheck %s --check-prefixes="FIR,RELAXED"
+! RUN: bbc -emit-fir --math-runtime=precise -outline-intrinsics %s -o - | FileCheck %s --check-prefixes="FIR,PRECISE"
+! RUN: bbc -emit-fir --math-runtime=llvm -outline-intrinsics %s -o - | FileCheck %s --check-prefixes="FIR,LLVM"

 ! CHECK-LABEL: cos_testr
 subroutine cos_testr(a, b)
diff --git a/flang/test/Lower/late-math-lowering.f90 b/flang/test/Lower/late-math-lowering.f90
new file mode 100644
--- /dev/null
+++ b/flang/test/Lower/late-math-lowering.f90
@@ -0,0 +1,134 @@
+! RUN: bbc -emit-fir %s -o - --math-lowering=late --math-runtime=fast | FileCheck --check-prefixes=ALL,FAST %s
+! 'relaxed' matches 'fast' exactly right now, but this will change:
+! RUN: bbc -emit-fir %s -o - --math-lowering=late --math-runtime=relaxed | FileCheck --check-prefixes=ALL,RELAXED %s
+! RUN: bbc -emit-fir %s -o - --math-lowering=late --math-runtime=precise | FileCheck --check-prefixes=ALL,PRECISE %s
+
+! ALL-LABEL: @_QPtest_real4
+! FAST: {{%[A-Za-z0-9._]+}} = math.abs {{%[A-Za-z0-9._]+}} : f32
+! RELAXED: {{%[A-Za-z0-9._]+}} = math.abs {{%[A-Za-z0-9._]+}} : f32
+! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @fabsf({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+! ALL: {{%[A-Za-z0-9._]+}} = fir.call @hypotf({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32
+! ALL: {{%[A-Za-z0-9._]+}} = fir.call @llvm.trunc.f32({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.round"({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.round"({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @llvm.round.f32({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+! FAST: {{%[A-Za-z0-9._]+}} = math.atan {{%[A-Za-z0-9._]+}} : f32
+! RELAXED: {{%[A-Za-z0-9._]+}} = math.atan {{%[A-Za-z0-9._]+}} : f32
+! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @atanf({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+! FAST: {{%[A-Za-z0-9._]+}} = math.atan2 {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} : f32
+! RELAXED: {{%[A-Za-z0-9._]+}} = math.atan2 {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} : f32
+! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @atan2f({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32
+! FAST: {{%[A-Za-z0-9._]+}} = math.ceil {{%[A-Za-z0-9._]+}} : f32
+! RELAXED: {{%[A-Za-z0-9._]+}} = math.ceil {{%[A-Za-z0-9._]+}} : f32
+! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @ceilf({{%[A-Za-z0-9._]+}}) : (f32) -> f32
+! 
FAST: {{%[A-Za-z0-9._]+}} = math.cos {{%[A-Za-z0-9._]+}} : f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.cos {{%[A-Za-z0-9._]+}} : f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @cosf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! FAST: {{%[A-Za-z0-9._]+}} = math.erf {{%[A-Za-z0-9._]+}} : f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.erf {{%[A-Za-z0-9._]+}} : f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @erff({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! FAST: {{%[A-Za-z0-9._]+}} = math.exp {{%[A-Za-z0-9._]+}} : f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.exp {{%[A-Za-z0-9._]+}} : f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @expf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! FAST: {{%[A-Za-z0-9._]+}} = math.floor {{%[A-Za-z0-9._]+}} : f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.floor {{%[A-Za-z0-9._]+}} : f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @floorf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! FAST: {{%[A-Za-z0-9._]+}} = math.log {{%[A-Za-z0-9._]+}} : f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.log {{%[A-Za-z0-9._]+}} : f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @logf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! FAST: {{%[A-Za-z0-9._]+}} = math.log10 {{%[A-Za-z0-9._]+}} : f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.log10 {{%[A-Za-z0-9._]+}} : f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @log10f({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! ALL: {{%[A-Za-z0-9._]+}} = fir.call @llvm.lround.i32.f32({{%[A-Za-z0-9._]+}}) : (f32) -> i32 +! ALL: {{%[A-Za-z0-9._]+}} = fir.call @llvm.lround.i64.f32({{%[A-Za-z0-9._]+}}) : (f32) -> i64 +! ALL: [[STOI:%[A-Za-z0-9._]+]] = fir.convert {{%[A-Za-z0-9._]+}} : (i16) -> i32 +! ALL: {{%[A-Za-z0-9._]+}} = fir.call @llvm.powi.f32.i32({{%[A-Za-z0-9._]+}}, [[STOI]]) : (f32, i32) -> f32 +! FAST: {{%[A-Za-z0-9._]+}} = math.powf {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} : f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.powf {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} : f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @powf({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32 +! ALL: {{%[A-Za-z0-9._]+}} = fir.call @llvm.powi.f32.i32({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, i32) -> f32 +! FAST: {{%[A-Za-z0-9._]+}} = math.copysign {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} : f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.copysign {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} : f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @copysignf({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f32, f32) -> f32 +! FAST: {{%[A-Za-z0-9._]+}} = math.sin {{%[A-Za-z0-9._]+}} : f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.sin {{%[A-Za-z0-9._]+}} : f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @sinf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 +! FAST: {{%[A-Za-z0-9._]+}} = math.tanh {{%[A-Za-z0-9._]+}} : f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.tanh {{%[A-Za-z0-9._]+}} : f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @tanhf({{%[A-Za-z0-9._]+}}) : (f32) -> f32 + +function test_real4(x, y, c, s, i) + real :: x, y, test_real4 + complex(4) :: c + integer(2) :: s + integer(4) :: i + test_real4 = abs(x) + abs(c) + aint(x) + anint(x) + atan(x) + atan2(x, y) + & + ceiling(x) + cos(x) + erf(x) + exp(x) + floor(x) + log(x) + log10(x) + & + nint(x, 4) + nint(x, 8) + x ** s + x ** y + x ** i + sign(x, y) + & + sin(x) + tanh(x) +end function + +! ALL-LABEL: @_QPtest_real8 +! FAST: {{%[A-Za-z0-9._]+}} = math.abs {{%[A-Za-z0-9._]+}} : f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.abs {{%[A-Za-z0-9._]+}} : f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @fabs({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! 
ALL: {{%[A-Za-z0-9._]+}} = fir.call @hypot({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64 +! ALL: {{%[A-Za-z0-9._]+}} = fir.call @llvm.trunc.f64({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = "llvm.intr.round"({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = "llvm.intr.round"({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @llvm.round.f64({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.atan {{%[A-Za-z0-9._]+}} : f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.atan {{%[A-Za-z0-9._]+}} : f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @atan({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.atan2 {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} : f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.atan2 {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} : f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @atan2({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.ceil {{%[A-Za-z0-9._]+}} : f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.ceil {{%[A-Za-z0-9._]+}} : f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @ceil({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.cos {{%[A-Za-z0-9._]+}} : f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.cos {{%[A-Za-z0-9._]+}} : f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @cos({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.erf {{%[A-Za-z0-9._]+}} : f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.erf {{%[A-Za-z0-9._]+}} : f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @erf({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.exp {{%[A-Za-z0-9._]+}} : f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.exp {{%[A-Za-z0-9._]+}} : f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @exp({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.floor {{%[A-Za-z0-9._]+}} : f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.floor {{%[A-Za-z0-9._]+}} : f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @floor({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.log {{%[A-Za-z0-9._]+}} : f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.log {{%[A-Za-z0-9._]+}} : f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @log({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.log10 {{%[A-Za-z0-9._]+}} : f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.log10 {{%[A-Za-z0-9._]+}} : f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @log10({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! ALL: {{%[A-Za-z0-9._]+}} = fir.call @llvm.lround.i32.f64({{%[A-Za-z0-9._]+}}) : (f64) -> i32 +! ALL: {{%[A-Za-z0-9._]+}} = fir.call @llvm.lround.i64.f64({{%[A-Za-z0-9._]+}}) : (f64) -> i64 +! ALL: [[STOI:%[A-Za-z0-9._]+]] = fir.convert {{%[A-Za-z0-9._]+}} : (i16) -> i32 +! ALL: {{%[A-Za-z0-9._]+}} = fir.call @llvm.powi.f64.i32({{%[A-Za-z0-9._]+}}, [[STOI]]) : (f64, i32) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.powf {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} : f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.powf {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} : f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @pow({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64 +! ALL: {{%[A-Za-z0-9._]+}} = fir.call @llvm.powi.f64.i32({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, i32) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.copysign {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} : f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.copysign {{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}} : f64 +! 
PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @copysign({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (f64, f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.sin {{%[A-Za-z0-9._]+}} : f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.sin {{%[A-Za-z0-9._]+}} : f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @sin({{%[A-Za-z0-9._]+}}) : (f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.tanh {{%[A-Za-z0-9._]+}} : f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.tanh {{%[A-Za-z0-9._]+}} : f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @tanh({{%[A-Za-z0-9._]+}}) : (f64) -> f64 + +function test_real8(x, y, c, s, i) + real(8) :: x, y, test_real8 + complex(8) :: c + integer(2) :: s + integer(4) :: i + test_real8 = abs(x) + abs(c) + aint(x) + anint(x) + atan(x) + atan2(x, y) + & + ceiling(x) + cos(x) + erf(x) + exp(x) + floor(x) + log(x) + log10(x) + & + nint(x, 4) + nint(x, 8) + x ** s + x ** y + x ** i + sign(x, y) + & + sin(x) + tanh(x) +end function diff --git a/flang/test/Lower/llvm-math.f90 b/flang/test/Lower/llvm-math.f90 --- a/flang/test/Lower/llvm-math.f90 +++ b/flang/test/Lower/llvm-math.f90 @@ -1,4 +1,4 @@ -! RUN: bbc -emit-fir %s -o - --math-runtime=llvm | FileCheck %s +! RUN: bbc -emit-fir %s -o - --math-runtime=llvm --outline-intrinsics | FileCheck %s SUBROUTINE POW_WRAPPER(IN, IN2, OUT) DOUBLE PRECISION IN, IN2 diff --git a/flang/test/Lower/sqrt.f90 b/flang/test/Lower/sqrt.f90 --- a/flang/test/Lower/sqrt.f90 +++ b/flang/test/Lower/sqrt.f90 @@ -1,5 +1,5 @@ -! RUN: bbc -emit-fir %s -o - | FileCheck %s -! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s +! RUN: bbc -emit-fir -outline-intrinsics %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir -mllvm -outline-intrinsics %s -o - | FileCheck %s ! CHECK-LABEL: sqrt_testr subroutine sqrt_testr(a, b) diff --git a/flang/test/Lower/trigonometric-intrinsics.f90 b/flang/test/Lower/trigonometric-intrinsics.f90 --- a/flang/test/Lower/trigonometric-intrinsics.f90 +++ b/flang/test/Lower/trigonometric-intrinsics.f90 @@ -1,5 +1,5 @@ -! RUN: bbc -emit-fir %s -o - | FileCheck %s -! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s +! RUN: bbc -emit-fir -outline-intrinsics %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir -mllvm -outline-intrinsics %s -o - | FileCheck %s ! 
CHECK-LABEL: atan_testr
 subroutine atan_testr(a, b)
diff --git a/mlir/lib/Conversion/MathToLibm/MathToLibm.cpp b/mlir/lib/Conversion/MathToLibm/MathToLibm.cpp
--- a/mlir/lib/Conversion/MathToLibm/MathToLibm.cpp
+++ b/mlir/lib/Conversion/MathToLibm/MathToLibm.cpp
@@ -143,11 +143,15 @@
   patterns.add<VecOpToScalarOp<math::Atan2Op>, VecOpToScalarOp<math::ErfOp>,
                VecOpToScalarOp<math::ExpM1Op>, VecOpToScalarOp<math::TanhOp>,
                VecOpToScalarOp<math::CosOp>, VecOpToScalarOp<math::SinOp>,
-               VecOpToScalarOp<math::RoundOp>>(patterns.getContext(), benefit);
+               VecOpToScalarOp<math::RoundOp>, VecOpToScalarOp<math::AtanOp>>(
+      patterns.getContext(), benefit);
   patterns.add<PromoteOpToF32<math::Atan2Op>, PromoteOpToF32<math::ErfOp>,
                PromoteOpToF32<math::ExpM1Op>, PromoteOpToF32<math::TanhOp>,
                PromoteOpToF32<math::CosOp>, PromoteOpToF32<math::SinOp>,
-               PromoteOpToF32<math::RoundOp>>(patterns.getContext(), benefit);
+               PromoteOpToF32<math::RoundOp>, PromoteOpToF32<math::AtanOp>>(
+      patterns.getContext(), benefit);
+  patterns.add<ScalarOpToLibmCall<math::AtanOp>>(patterns.getContext(), "atanf",
+                                                 "atan", benefit);
   patterns.add<ScalarOpToLibmCall<math::Atan2Op>>(patterns.getContext(),
                                                   "atan2f", "atan2", benefit);
   patterns.add<ScalarOpToLibmCall<math::ErfOp>>(patterns.getContext(), "erff",
diff --git a/mlir/test/Conversion/MathToLibm/convert-to-libm.mlir b/mlir/test/Conversion/MathToLibm/convert-to-libm.mlir
--- a/mlir/test/Conversion/MathToLibm/convert-to-libm.mlir
+++ b/mlir/test/Conversion/MathToLibm/convert-to-libm.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s -convert-math-to-libm -canonicalize | FileCheck %s

+// CHECK-DAG: @atan(f64) -> f64
+// CHECK-DAG: @atanf(f32) -> f32
 // CHECK-DAG: @erf(f64) -> f64
 // CHECK-DAG: @erff(f32) -> f32
 // CHECK-DAG: @expm1(f64) -> f64
@@ -15,6 +17,53 @@
 // CHECK-DAG: @sin(f64) -> f64
 // CHECK-DAG: @sinf(f32) -> f32

+// CHECK-LABEL: func @atan_caller
+// CHECK-SAME: %[[FLOAT:.*]]: f32
+// CHECK-SAME: %[[DOUBLE:.*]]: f64
+// CHECK-SAME: %[[HALF:.*]]: f16
+// CHECK-SAME: %[[BFLOAT:.*]]: bf16
+func.func @atan_caller(%float: f32, %double: f64, %half: f16, %bfloat: bf16) -> (f32, f64, f16, bf16) {
+  // CHECK: %[[FLOAT_RESULT:.*]] = call @atanf(%[[FLOAT]]) : (f32) -> f32
+  %float_result = math.atan %float : f32
+  // CHECK: %[[DOUBLE_RESULT:.*]] = call @atan(%[[DOUBLE]]) : (f64) -> f64
+  %double_result = math.atan %double : f64
+  // CHECK: %[[HALF_PROMOTED:.*]] = arith.extf %[[HALF]] : f16 to f32
+  // CHECK: %[[HALF_CALL:.*]] = call @atanf(%[[HALF_PROMOTED]]) : (f32) -> f32
+  // CHECK: %[[HALF_RESULT:.*]] = arith.truncf %[[HALF_CALL]] : f32 to f16
+  %half_result = math.atan %half : f16
+  // CHECK: %[[BFLOAT_PROMOTED:.*]] = arith.extf %[[BFLOAT]] : bf16 to f32
+  // CHECK: %[[BFLOAT_CALL:.*]] = call @atanf(%[[BFLOAT_PROMOTED]]) : (f32) -> f32
+  // CHECK: %[[BFLOAT_RESULT:.*]] = arith.truncf %[[BFLOAT_CALL]] : f32 to bf16
+  %bfloat_result = math.atan %bfloat : bf16
+  // CHECK: return %[[FLOAT_RESULT]], %[[DOUBLE_RESULT]], %[[HALF_RESULT]], %[[BFLOAT_RESULT]]
+  return %float_result, %double_result, %half_result, %bfloat_result : f32, f64, f16, bf16
+}
+
+// CHECK-LABEL: func @atan_vec_caller(
+// CHECK-SAME: %[[VAL_0:.*]]: vector<2xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: vector<2xf64>) -> (vector<2xf32>, vector<2xf64>) {
+// CHECK-DAG: %[[CVF:.*]] = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK-DAG: %[[CVD:.*]] = arith.constant dense<0.000000e+00> : vector<2xf64>
+// CHECK: %[[IN0_F32:.*]] = vector.extract %[[VAL_0]][0] : vector<2xf32>
+// CHECK: %[[OUT0_F32:.*]] = call @atanf(%[[IN0_F32]]) : (f32) -> f32
+// CHECK: %[[VAL_8:.*]] = vector.insert %[[OUT0_F32]], %[[CVF]] [0] : f32 into vector<2xf32>
+// CHECK: %[[IN1_F32:.*]] = vector.extract %[[VAL_0]][1] : vector<2xf32>
+// CHECK: %[[OUT1_F32:.*]] = call @atanf(%[[IN1_F32]]) : (f32) -> f32
+// CHECK: %[[VAL_11:.*]] = vector.insert %[[OUT1_F32]], %[[VAL_8]] [1] : f32 into vector<2xf32>
+// CHECK: 
%[[IN0_F64:.*]] = vector.extract %[[VAL_1]][0] : vector<2xf64> +// CHECK: %[[OUT0_F64:.*]] = call @atan(%[[IN0_F64]]) : (f64) -> f64 +// CHECK: %[[VAL_14:.*]] = vector.insert %[[OUT0_F64]], %[[CVD]] [0] : f64 into vector<2xf64> +// CHECK: %[[IN1_F64:.*]] = vector.extract %[[VAL_1]][1] : vector<2xf64> +// CHECK: %[[OUT1_F64:.*]] = call @atan(%[[IN1_F64]]) : (f64) -> f64 +// CHECK: %[[VAL_17:.*]] = vector.insert %[[OUT1_F64]], %[[VAL_14]] [1] : f64 into vector<2xf64> +// CHECK: return %[[VAL_11]], %[[VAL_17]] : vector<2xf32>, vector<2xf64> +// CHECK: } +func.func @atan_vec_caller(%float: vector<2xf32>, %double: vector<2xf64>) -> (vector<2xf32>, vector<2xf64>) { + %float_result = math.atan %float : vector<2xf32> + %double_result = math.atan %double : vector<2xf64> + return %float_result, %double_result : vector<2xf32>, vector<2xf64> +} + // CHECK-LABEL: func @tanh_caller // CHECK-SAME: %[[FLOAT:.*]]: f32 // CHECK-SAME: %[[DOUBLE:.*]]: f64
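Stepping back from the individual hunks: in the fir-to-llvm-ir pipeline, the two pattern sets registered in CodeGen.cpp above divide the work by benefit. Below is a minimal sketch of that registration order; the helper name addMathLoweringPatterns is hypothetical, and only the two populate entry points used by this patch are assumed.

```cpp
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
#include "mlir/Conversion/MathToLibm/MathToLibm.h"
#include "mlir/IR/PatternMatch.h"

// Hypothetical helper mirroring the registration done in the
// fir-to-llvm-ir pass above.
static void addMathLoweringPatterns(mlir::LLVMTypeConverter &typeConverter,
                                    mlir::RewritePatternSet &patterns) {
  // Ops with an LLVM intrinsic form (math.sin, math.exp, math.log, ...)
  // are converted straight into the llvm dialect at the default benefit.
  mlir::populateMathToLLVMConversionPatterns(typeConverter, patterns);
  // Benefit 0 makes the libm lowering a fallback: it only fires for ops
  // the set above does not handle (math.atan, math.erf, math.tanh, ...),
  // which then become calls to libm entry points.
  mlir::populateMathToLibmConversionPatterns(patterns, /*benefit=*/0);
}
```

This matches the expectations encoded in late-math-codegen.f90: for the fast and relaxed modes, sin becomes "llvm.intr.sin" while tanh becomes a call to @tanh/@tanhf.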