diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -92,6 +92,7 @@
 FEATURE(thread_sanitizer, LangOpts.Sanitize.has(SanitizerKind::Thread))
 FEATURE(dataflow_sanitizer, LangOpts.Sanitize.has(SanitizerKind::DataFlow))
 FEATURE(scudo, LangOpts.Sanitize.hasOneOf(SanitizerKind::Scudo))
+FEATURE(numericalstability_sanitizer, LangOpts.Sanitize.has(SanitizerKind::NumericalStability))
 // Objective-C features
 FEATURE(objc_arr, LangOpts.ObjCAutoRefCount) // FIXME: REMOVE?
 FEATURE(objc_arc, LangOpts.ObjCAutoRefCount)
diff --git a/clang/include/clang/Basic/Sanitizers.def b/clang/include/clang/Basic/Sanitizers.def
--- a/clang/include/clang/Basic/Sanitizers.def
+++ b/clang/include/clang/Basic/Sanitizers.def
@@ -73,6 +73,9 @@
 // ThreadSanitizer
 SANITIZER("thread", Thread)
 
+// Numerical stability sanitizer.
+SANITIZER("numerical", NumericalStability)
+
 // LeakSanitizer
 SANITIZER("leak", Leak)
 
diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h
--- a/clang/include/clang/Driver/SanitizerArgs.h
+++ b/clang/include/clang/Driver/SanitizerArgs.h
@@ -86,6 +86,7 @@
   bool needsCfiDiagRt() const;
   bool needsStatsRt() const { return Stats; }
   bool needsScudoRt() const { return Sanitizers.has(SanitizerKind::Scudo); }
+  bool needsNsanRt() const { return Sanitizers.has(SanitizerKind::NumericalStability); }
 
   bool requiresPIE() const;
   bool needsUnwindTables() const;
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -72,6 +72,7 @@
 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
 #include "llvm/Transforms/Instrumentation/MemProfiler.h"
 #include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
+#include "llvm/Transforms/Instrumentation/NumericalStabilitySanitizer.h"
 #include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"
 #include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
 #include "llvm/Transforms/ObjCARC.h"
@@ -359,6 +360,11 @@
   PM.add(createThreadSanitizerLegacyPassPass());
 }
 
+static void addNumericalStabilitySanitizerPass(const PassManagerBuilder &Builder,
+                                               legacy::PassManagerBase &PM) {
+  PM.add(createNumericalStabilitySanitizerLegacyPassPass());
+}
+
 static void addDataFlowSanitizerPass(const PassManagerBuilder &Builder,
                                      legacy::PassManagerBase &PM) {
   const PassManagerBuilderWrapper &BuilderWrapper =
@@ -776,6 +782,13 @@
                            addThreadSanitizerPass);
   }
 
+  if (LangOpts.Sanitize.has(SanitizerKind::NumericalStability)) {
+    PMBuilder.addExtension(PassManagerBuilder::EP_OptimizerLast,
+                           addNumericalStabilitySanitizerPass);
+    PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0,
+                           addNumericalStabilitySanitizerPass);
+  }
+
   if (LangOpts.Sanitize.has(SanitizerKind::DataFlow)) {
     PMBuilder.addExtension(PassManagerBuilder::EP_OptimizerLast,
                            addDataFlowSanitizerPass);
@@ -1107,6 +1120,10 @@
           MPM.addPass(createModuleToFunctionPassAdaptor(ThreadSanitizerPass()));
         }
 
+        if (LangOpts.Sanitize.has(SanitizerKind::NumericalStability)) {
+          MPM.addPass(NumericalStabilitySanitizerPass());
+        }
+
         auto ASanPass = [&](SanitizerMask Mask, bool CompileKernel) {
           if (LangOpts.Sanitize.has(Mask)) {
             bool Recover = CodeGenOpts.SanitizeRecover.has(Mask);
diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp
---
a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -411,6 +411,10 @@ !isInNoSanitizeList(SanitizerKind::Thread, Fn, Loc)) Fn->addFnAttr(llvm::Attribute::SanitizeThread); + if (getLangOpts().Sanitize.has(SanitizerKind::NumericalStability) && + !isInNoSanitizeList(SanitizerKind::NumericalStability, Fn, Loc)) + Fn->addFnAttr(llvm::Attribute::SanitizeNumericalStability); + if (getLangOpts().Sanitize.has(SanitizerKind::Memory) && !isInNoSanitizeList(SanitizerKind::Memory, Fn, Loc)) Fn->addFnAttr(llvm::Attribute::SanitizeMemory); diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -751,6 +751,8 @@ Fn->addFnAttr(llvm::Attribute::SanitizeMemTag); if (SanOpts.has(SanitizerKind::Thread)) Fn->addFnAttr(llvm::Attribute::SanitizeThread); + if (SanOpts.has(SanitizerKind::NumericalStability)) + Fn->addFnAttr(llvm::Attribute::SanitizeNumericalStability); if (SanOpts.hasOneOf(SanitizerKind::Memory | SanitizerKind::KernelMemory)) Fn->addFnAttr(llvm::Attribute::SanitizeMemory); if (SanOpts.has(SanitizerKind::SafeStack)) diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -38,7 +38,7 @@ SanitizerKind::DataFlow | SanitizerKind::HWAddress | SanitizerKind::Scudo; static const SanitizerMask NeedsUnwindTables = SanitizerKind::Address | SanitizerKind::HWAddress | SanitizerKind::Thread | - SanitizerKind::Memory | SanitizerKind::DataFlow; + SanitizerKind::Memory | SanitizerKind::DataFlow | SanitizerKind::NumericalStability; static const SanitizerMask SupportsCoverage = SanitizerKind::Address | SanitizerKind::HWAddress | SanitizerKind::KernelAddress | SanitizerKind::KernelHWAddress | @@ -49,7 +49,7 @@ SanitizerKind::DataFlow | SanitizerKind::Fuzzer | SanitizerKind::FuzzerNoLink | SanitizerKind::FloatDivideByZero | SanitizerKind::SafeStack | SanitizerKind::ShadowCallStack | - SanitizerKind::Thread | SanitizerKind::ObjCCast; + SanitizerKind::Thread | SanitizerKind::ObjCCast | SanitizerKind::NumericalStability; static const SanitizerMask RecoverableByDefault = SanitizerKind::Undefined | SanitizerKind::Integer | SanitizerKind::ImplicitConversion | SanitizerKind::Nullability | @@ -143,6 +143,7 @@ {"memtag_blacklist.txt", SanitizerKind::MemTag}, {"msan_blacklist.txt", SanitizerKind::Memory}, {"tsan_blacklist.txt", SanitizerKind::Thread}, + {"nsan_blacklist.txt", SanitizerKind::NumericalStability}, {"dfsan_abilist.txt", SanitizerKind::DataFlow}, {"cfi_blacklist.txt", SanitizerKind::CFI}, {"ubsan_blacklist.txt", diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -855,6 +855,8 @@ if (SanArgs.linkCXXRuntimes()) StaticRuntimes.push_back("tsan_cxx"); } + if (SanArgs.needsNsanRt() && SanArgs.linkRuntimes()) + StaticRuntimes.push_back("nsan"); if (!SanArgs.needsSharedRt() && SanArgs.needsUbsanRt() && SanArgs.linkRuntimes()) { if (SanArgs.requiresMinimalRuntime()) { StaticRuntimes.push_back("ubsan_minimal"); diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -891,8 +891,10 @@ Res |= SanitizerKind::Leak; if (IsX86_64 || IsMIPS64 || IsAArch64 || IsPowerPC64) Res |= SanitizerKind::Thread; - 
if (IsX86_64) + if (IsX86_64) { Res |= SanitizerKind::KernelMemory; + Res |= SanitizerKind::NumericalStability; + } if (IsX86 || IsX86_64) Res |= SanitizerKind::Function; if (IsX86_64 || IsMIPS64 || IsAArch64 || IsX86 || IsMIPS || IsArmArch || diff --git a/clang/runtime/CMakeLists.txt b/clang/runtime/CMakeLists.txt --- a/clang/runtime/CMakeLists.txt +++ b/clang/runtime/CMakeLists.txt @@ -115,7 +115,7 @@ COMPONENT compiler-rt) # Add top-level targets that build specific compiler-rt runtimes. - set(COMPILER_RT_RUNTIMES fuzzer asan builtins dfsan lsan msan profile tsan ubsan ubsan-minimal) + set(COMPILER_RT_RUNTIMES fuzzer asan builtins dfsan lsan msan nsan profile tsan ubsan ubsan-minimal) foreach(runtime ${COMPILER_RT_RUNTIMES}) get_ext_project_build_command(build_runtime_cmd ${runtime}) add_custom_target(${runtime} @@ -132,7 +132,7 @@ # Add top-level targets for various compiler-rt test suites. set(COMPILER_RT_TEST_SUITES check-fuzzer check-asan check-hwasan check-asan-dynamic check-dfsan - check-lsan check-msan check-sanitizer check-tsan check-ubsan check-ubsan-minimal + check-lsan check-msan check-sanitizer check-nsan check-tsan check-ubsan check-ubsan-minimal check-profile check-cfi check-cfi-and-supported check-safestack check-gwp_asan) foreach(test_suite ${COMPILER_RT_TEST_SUITES}) get_ext_project_build_command(run_test_suite ${test_suite}) diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -325,6 +325,7 @@ set(ALL_LSAN_SUPPORTED_ARCH ${X86} ${X86_64} ${MIPS64} ${ARM64} ${ARM32} ${PPC64} ${S390X} ${RISCV64}) endif() set(ALL_MSAN_SUPPORTED_ARCH ${X86_64} ${MIPS64} ${ARM64} ${PPC64} ${S390X}) +set(ALL_NSAN_SUPPORTED_ARCH ${X86} ${X86_64}) set(ALL_HWASAN_SUPPORTED_ARCH ${X86_64} ${ARM64}) set(ALL_MEMPROF_SUPPORTED_ARCH ${X86_64}) set(ALL_PROFILE_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${PPC32} ${PPC64} @@ -551,6 +552,9 @@ list_intersect(MSAN_SUPPORTED_ARCH ALL_MSAN_SUPPORTED_ARCH SANITIZER_COMMON_SUPPORTED_ARCH) + list_intersect(NSAN_SUPPORTED_ARCH + ALL_NSAN_SUPPORTED_ARCH + SANITIZER_COMMON_SUPPORTED_ARCH) list_intersect(HWASAN_SUPPORTED_ARCH ALL_HWASAN_SUPPORTED_ARCH SANITIZER_COMMON_SUPPORTED_ARCH) @@ -618,6 +622,7 @@ filter_available_targets(SHADOWCALLSTACK_SUPPORTED_ARCH ${ALL_SHADOWCALLSTACK_SUPPORTED_ARCH}) filter_available_targets(GWP_ASAN_SUPPORTED_ARCH ${ALL_GWP_ASAN_SUPPORTED_ARCH}) + filter_available_targets(NSAN_SUPPORTED_ARCH ${ALL_NSAN_SUPPORTED_ARCH}) endif() if (MSVC) @@ -640,7 +645,7 @@ endif() message(STATUS "Compiler-RT supported architectures: ${COMPILER_RT_SUPPORTED_ARCH}") -set(ALL_SANITIZERS asan;dfsan;msan;hwasan;tsan;safestack;cfi;scudo;ubsan_minimal;gwp_asan) +set(ALL_SANITIZERS asan;dfsan;msan;hwasan;tsan;safestack;cfi;scudo;ubsan_minimal;gwp_asan;nsan) set(COMPILER_RT_SANITIZERS_TO_BUILD all CACHE STRING "sanitizers to build if supported on the target (all;${ALL_SANITIZERS})") list_replace(COMPILER_RT_SANITIZERS_TO_BUILD all "${ALL_SANITIZERS}") @@ -803,4 +808,11 @@ else() set(COMPILER_RT_HAS_GWP_ASAN FALSE) endif() + +if (COMPILER_RT_HAS_SANITIZER_COMMON AND NSAN_SUPPORTED_ARCH AND + OS_NAME MATCHES "Linux") + set(COMPILER_RT_HAS_NSAN TRUE) +else() + set(COMPILER_RT_HAS_NSAN FALSE) +endif() pythonize_bool(COMPILER_RT_HAS_GWP_ASAN) diff --git a/compiler-rt/include/sanitizer/nsan_interface.h b/compiler-rt/include/sanitizer/nsan_interface.h new file mode 100644 --- /dev/null +++ b/compiler-rt/include/sanitizer/nsan_interface.h @@ -0,0 
+1,75 @@
+//===-- sanitizer/nsan_interface.h ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Public interface for nsan.
+//
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_NSAN_INTERFACE_H
+#define SANITIZER_NSAN_INTERFACE_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// User-provided default option settings.
+///
+/// You can provide your own implementation of this function to return a string
+/// containing NSan runtime options (for example,
+/// verbosity=1:halt_on_error=0).
+///
+/// \returns Default options string.
+const char *__nsan_default_options(void);
+
+// Dumps nsan shadow data for a block of `size_bytes` bytes of application
+// memory at location `addr`.
+//
+// Each line contains the application address, then the shadow types, then the
+// shadow values. Unknown types are shown as `__`, while known values are shown
+// as `f`, `d`, `l` for float, double, and long double respectively. Position
+// is shown as a single hex digit. The shadow value itself appears on the line
+// that contains the first byte of the value.
+// FIXME: Show both the shadow and the application value.
+//
+// Example: `__nsan_dump_shadow_mem(addr, 32, 8, 0)` might print:
+//
+//   0x0add7359:  __ f0 f1 f2 f3 __ __ __   (42.000)
+//   0x0add7361:  __ d1 d2 d3 d4 d5 d6 d7
+//   0x0add7369:  d8 f0 f1 f2 f3 __ __ f2   (-1.000) (12.5)
+//   0x0add7371:  f3 __ __ __ __ __ __ __
+//
+// This means that there is:
+//   - a shadow double for the float at address 0x0add7360, with value 42;
+//   - a shadow float128 for the double at address 0x0add7362, with value -1;
+//   - a shadow double for the float at address 0x0add736a, with value 12.5.
+// There was also a shadow double for the float at address 0x0add736e, but
+// bytes f0 and f1 were overwritten by one or several stores, so that the
+// shadow value is no longer valid.
+// The argument `reserved` can be any value. Its true value is provided by the
+// instrumentation.
+void __nsan_dump_shadow_mem(const char *addr, size_t size_bytes,
+                            size_t bytes_per_line, size_t reserved);
+
+// Explicitly dumps a value.
+// FIXME: vector versions?
+void __nsan_dump_float(float value);
+void __nsan_dump_double(double value);
+void __nsan_dump_longdouble(long double value);
+
+// Explicitly checks a value.
+// FIXME: vector versions?
+void __nsan_check_float(float value);
+void __nsan_check_double(double value);
+void __nsan_check_longdouble(long double value);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // SANITIZER_NSAN_INTERFACE_H
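A minimal usage sketch for the interface above (illustrative only, not part of the patch; assumes the translation unit is compiled with -fsanitize=numerical so that shadow values exist, and uses only the hooks declared in this header — flag names come from nsan_flags.inc later in the patch):

    #include <sanitizer/nsan_interface.h>

    // Optional override of the weak hook above; parsed like NSAN_OPTIONS.
    extern "C" const char *__nsan_default_options(void) {
      return "halt_on_error=0:print_stats_on_exit=1";
    }

    double sum(const double *a, size_t n) {
      double acc = 0.0;
      for (size_t i = 0; i < n; ++i)
        acc += a[i];
      __nsan_check_double(acc); // Compare against the higher-precision shadow.
      __nsan_dump_shadow_mem((const char *)&acc, sizeof(acc), sizeof(acc), 0);
      return acc;
    }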
diff --git a/compiler-rt/lib/nsan/CMakeLists.txt b/compiler-rt/lib/nsan/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/compiler-rt/lib/nsan/CMakeLists.txt
@@ -0,0 +1,61 @@
+add_compiler_rt_component(nsan)
+
+include_directories(..)
+
+set(NSAN_SOURCES
+  nsan.cc
+  nsan_flags.cc
+  nsan_interceptors.cc
+  nsan_stats.cc
+  nsan_suppressions.cc
+)
+
+set(NSAN_HEADERS
+  nsan.h
+  nsan_flags.h
+  nsan_flags.inc
+  nsan_platform.h
+  nsan_stats.h
+  nsan_suppressions.h
+)
+
+set(NSAN_CFLAGS ${SANITIZER_COMMON_CFLAGS})
+append_list_if(COMPILER_RT_HAS_FPIC_FLAG -fPIC NSAN_CFLAGS)
+# FIXME: consider -fno-rtti -fno-exceptions -nostdinc++ -pthread
+# -fno-omit-frame-pointer, and removing -stdlib= (unused when passing
+# -nostdinc++), e.g.:
+# string(REGEX REPLACE "-stdlib=[a-zA-Z+]*" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+
+set(NSAN_DYNAMIC_LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS})
+
+if (COMPILER_RT_HAS_NSAN)
+  foreach(arch ${NSAN_SUPPORTED_ARCH})
+    add_compiler_rt_runtime(
+      clang_rt.nsan
+      STATIC
+      ARCHS ${arch}
+      SOURCES ${NSAN_SOURCES}
+              $<TARGET_OBJECTS:RTInterception.${arch}>
+              $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
+              $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>
+              $<TARGET_OBJECTS:RTSanitizerCommonCoverage.${arch}>
+              $<TARGET_OBJECTS:RTSanitizerCommonSymbolizer.${arch}>
+              $<TARGET_OBJECTS:RTUbsan.${arch}>
+      ADDITIONAL_HEADERS ${NSAN_HEADERS}
+      CFLAGS ${NSAN_CFLAGS}
+      PARENT_TARGET nsan
+    )
+  endforeach()
+
+  add_compiler_rt_object_libraries(RTNsan
+    ARCHS ${NSAN_SUPPORTED_ARCH}
+    SOURCES ${NSAN_SOURCES}
+    ADDITIONAL_HEADERS ${NSAN_HEADERS}
+    CFLAGS ${NSAN_CFLAGS})
+endif()
+
+if(COMPILER_RT_INCLUDE_TESTS)
+  add_subdirectory(tests)
+endif()
diff --git a/compiler-rt/lib/nsan/nsan.h b/compiler-rt/lib/nsan/nsan.h
new file mode 100644
--- /dev/null
+++ b/compiler-rt/lib/nsan/nsan.h
@@ -0,0 +1,224 @@
+//===-- nsan.h -------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of NumericalStabilitySanitizer.
+//
+// Private NSan header.
+//===----------------------------------------------------------------------===//
+
+#ifndef NSAN_H
+#define NSAN_H
+
+#include "sanitizer_common/sanitizer_internal_defs.h"
+
+using __sanitizer::sptr;
+using __sanitizer::u16;
+using __sanitizer::uptr;
+
+#include "nsan_platform.h"
+
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+// Private nsan interface. Used e.g. by interceptors.
+extern "C" {
+
+// This marks the shadow type of the given block of application memory as
+// unknown.
+// printf-free (see comment in nsan_interceptors.cc).
+void __nsan_set_value_unknown(const char *addr, uptr size);
+
+// Copies annotations in the shadow memory for a block of application memory to
+// a new address. This function is used together with memory-copying functions
+// in application memory, e.g. the instrumentation inserts
+// `__nsan_copy_values(dest, src, size)` after builtin calls to
+// `memcpy(dest, src, size)`. Intercepted memcpy calls also call this function.
+// printf-free (see comment in nsan_interceptors.cc).
+void __nsan_copy_values(const char *daddr, const char *saddr, uptr size);
+
+SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE const char *
+__nsan_default_options();
+}
+
+namespace __nsan {
+
+extern bool NsanInitialized;
+extern bool NsanInitIsRunning;
+
+void initializeInterceptors();
+
+// See notes in nsan_platform.
+// printf-free (see comment in nsan_interceptors.cc).
+inline char *getShadowAddrFor(char *Ptr) {
+  uptr AppOffset = ((uptr)Ptr) & ShadowMask();
+  return (char *)(AppOffset * kShadowScale + ShadowAddr());
+}
+
+// printf-free (see comment in nsan_interceptors.cc).
+inline const char *getShadowAddrFor(const char *Ptr) {
+  return getShadowAddrFor(const_cast<char *>(Ptr));
+}
+
+// printf-free (see comment in nsan_interceptors.cc).
+inline unsigned char *getShadowTypeAddrFor(char *Ptr) {
+  uptr AppOffset = ((uptr)Ptr) & ShadowMask();
+  return (unsigned char *)(AppOffset + TypesAddr());
+}
+
+// printf-free (see comment in nsan_interceptors.cc).
+inline const unsigned char *getShadowTypeAddrFor(const char *Ptr) {
+  return getShadowTypeAddrFor(const_cast<char *>(Ptr));
+}
+
+// Information about value types and their shadow counterparts.
+template <typename FT> struct FTInfo {};
+
+template <> struct FTInfo<float> {
+  using orig_type = float;
+  using orig_bits_type = __sanitizer::u32;
+  using mantissa_bits_type = __sanitizer::u32;
+  using shadow_type = double;
+  static const char *kCppTypeName;
+  static constexpr unsigned kMantissaBits = 23;
+  static constexpr const int kExponentBits = 8;
+  static constexpr const int kExponentBias = 127;
+  static constexpr const int kValueType = kFloatValueType;
+  static constexpr const char kTypePattern[sizeof(float)] = {
+      static_cast<char>(kValueType | (0 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (1 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (2 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (3 << kValueSizeSizeBits)),
+  };
+  static constexpr const float kEpsilon = FLT_EPSILON;
+};
+
+template <> struct FTInfo<double> {
+  using orig_type = double;
+  using orig_bits_type = __sanitizer::u64;
+  using mantissa_bits_type = __sanitizer::u64;
+  using shadow_type = __float128;
+  static const char *kCppTypeName;
+  static constexpr unsigned kMantissaBits = 52;
+  static constexpr const int kExponentBits = 11;
+  static constexpr const int kExponentBias = 1023;
+  static constexpr const int kValueType = kDoubleValueType;
+  static constexpr char kTypePattern[sizeof(double)] = {
+      static_cast<char>(kValueType | (0 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (1 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (2 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (3 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (4 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (5 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (6 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (7 << kValueSizeSizeBits)),
+  };
+  static constexpr const double kEpsilon = DBL_EPSILON;
+};
+
+template <> struct FTInfo<long double> {
+  using orig_type = long double;
+  using mantissa_bits_type = __sanitizer::u64;
+  using shadow_type = __float128;
+  static const char *kCppTypeName;
+  static constexpr unsigned kMantissaBits = 63;
+  static constexpr const int kExponentBits = 15;
+  static constexpr const int kExponentBias = (1 << (kExponentBits - 1)) - 1;
+  static constexpr const int kValueType = kFp80ValueType;
+  static constexpr char kTypePattern[sizeof(long double)] = {
+      static_cast<char>(kValueType | (0 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (1 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (2 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (3 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (4 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (5 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (6 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (7 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (8 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (9 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (10 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (11 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (12 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (13 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (14 << kValueSizeSizeBits)),
+      static_cast<char>(kValueType | (15 << kValueSizeSizeBits)),
+  };
+  static constexpr const long double kEpsilon = LDBL_EPSILON;
+};
+
+template <> struct FTInfo<__float128> {
+  using orig_type = __float128;
+  using orig_bits_type = __uint128_t;
+  using mantissa_bits_type = __uint128_t;
+  static const char *kCppTypeName;
+  static constexpr unsigned kMantissaBits = 112;
+  static constexpr const int kExponentBits = 15;
+  static constexpr const int kExponentBias = (1 << (kExponentBits - 1)) - 1;
+};
+
+constexpr double kMaxULPDiff = INFINITY;
+
+// Helper for getULPDiff that works on bit representations.
+template <typename BT> double getULPDiffBits(BT V1Bits, BT V2Bits) {
+  // If the integer representations of two same-sign floats are subtracted
+  // then the absolute value of the result is equal to one plus the number of
+  // representable floats between them.
+  return V1Bits >= V2Bits ? V1Bits - V2Bits : V2Bits - V1Bits;
+}
+
+// Returns the number of floating-point values between V1 and V2, capped to
+// u64max. Returns 0 for (-0.0, 0.0).
+template <typename FT> double getULPDiff(FT V1, FT V2) {
+  if (V1 == V2) {
+    return 0; // Typically, -0.0 and 0.0.
+  }
+  using BT = typename FTInfo<FT>::orig_bits_type;
+  static_assert(sizeof(FT) == sizeof(BT), "not implemented");
+  static_assert(sizeof(BT) <= 8, "not implemented");
+  BT V1Bits;
+  __builtin_memcpy(&V1Bits, &V1, sizeof(BT));
+  BT V2Bits;
+  __builtin_memcpy(&V2Bits, &V2, sizeof(BT));
+  // Check whether the signs differ. IEEE-754 float types always store the
+  // sign in the most significant bit. NaNs and infinities are handled by the
+  // calling code.
+  constexpr const BT kSignMask = BT{1} << (CHAR_BIT * sizeof(BT) - 1);
+  if ((V1Bits ^ V2Bits) & kSignMask) {
+    // Signs differ. We can get the ULPs as `getULPDiff(negative_number, -0.0)
+    // + getULPDiff(0.0, positive_number)`.
+    if (V1Bits & kSignMask) {
+      return getULPDiffBits<BT>(V1Bits, kSignMask) +
+             getULPDiffBits<BT>(0, V2Bits);
+    } else {
+      return getULPDiffBits<BT>(V2Bits, kSignMask) +
+             getULPDiffBits<BT>(0, V1Bits);
+    }
+  }
+  return getULPDiffBits<BT>(V1Bits, V2Bits);
+}
+
+// FIXME: This needs more work: because there is no 80-bit integer type, we
+// have to go through __uint128_t. Therefore the assumptions about the sign
+// bit do not hold.
+template <> inline double getULPDiff(long double V1, long double V2) {
+  using BT = __uint128_t;
+  BT V1Bits = 0;
+  __builtin_memcpy(&V1Bits, &V1, sizeof(long double));
+  BT V2Bits = 0;
+  __builtin_memcpy(&V2Bits, &V2, sizeof(long double));
+  if ((V1Bits ^ V2Bits) & (BT{1} << (CHAR_BIT * sizeof(BT) - 1)))
+    return (V1 == V2) ? __sanitizer::u64{0} : kMaxULPDiff; // Signs differ.
+  // If the integer representations of two same-sign floats are subtracted
+  // then the absolute value of the result is equal to one plus the number of
+  // representable floats between them.
+  BT Diff = V1Bits >= V2Bits ? V1Bits - V2Bits : V2Bits - V1Bits;
+  return Diff >= kMaxULPDiff ? kMaxULPDiff : Diff;
+}
+
+} // end namespace __nsan
+
+#endif // NSAN_H
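To make the bit trick in getULPDiff concrete, here is a self-contained float version of the same-sign case (a sketch mirroring the logic above, not code from the patch; it assumes IEEE-754 binary32 and no NaNs or infinities, which the runtime handles separately):

    #include <stdint.h>
    #include <string.h>

    // Number of representable floats between a and b (same sign).
    static double ulpDiffFloat(float a, float b) {
      if (a == b)
        return 0; // Also covers -0.0 vs 0.0.
      uint32_t aBits, bBits;
      memcpy(&aBits, &a, sizeof(aBits));
      memcpy(&bBits, &b, sizeof(bBits));
      return aBits >= bBits ? aBits - bBits : bBits - aBits;
    }
    // Example: ulpDiffFloat(1.0f, nextafterf(1.0f, 2.0f)) == 1.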
diff --git a/compiler-rt/lib/nsan/nsan.cc b/compiler-rt/lib/nsan/nsan.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/lib/nsan/nsan.cc
@@ -0,0 +1,832 @@
+//===-- nsan.cc -----------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// NumericalStabilitySanitizer runtime.
+//
+// This implements:
+// - The public nsan interface (include/sanitizer/nsan_interface.h).
+// - The private nsan interface (./nsan.h).
+// - The internal instrumentation interface. These are functions emitted by
+//   the instrumentation pass:
+//   * __nsan_get_shadow_ptr_for_{float,double,longdouble}_load
+//     These return the shadow memory pointer for loading the shadow value,
+//     after checking that the types are consistent. If the types are not
+//     consistent, returns nullptr.
+//   * __nsan_get_shadow_ptr_for_{float,double,longdouble}_store
+//     Sets the shadow types appropriately and returns the shadow memory
+//     pointer for storing the shadow value.
+//   * __nsan_internal_check_{float,double,longdouble}_{d,l,q} checks the
+//     accuracy of a value against its shadow and emits a warning depending
+//     on the runtime configuration. The middle part indicates the type of
+//     the application value; the suffix (d, l, q for double, long double,
+//     __float128) indicates the type of the shadow, and depends on the
+//     instrumentation configuration.
+//   * __nsan_fcmp_fail_* emits a warning for an fcmp instruction whose
+//     corresponding shadow fcmp result differs.
+//
+//===----------------------------------------------------------------------===//
+
+#include <assert.h>
+#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "sanitizer_common/sanitizer_report_decorator.h"
+#include "sanitizer_common/sanitizer_stacktrace.h"
+#include "sanitizer_common/sanitizer_symbolizer.h"
+
+#include "nsan/nsan.h"
+#include "nsan/nsan_flags.h"
+#include "nsan/nsan_stats.h"
+#include "nsan/nsan_suppressions.h"
+
+using namespace __sanitizer;
+using namespace __nsan;
+
+static constexpr const int kMaxVectorWidth = 8;
+
+// When copying application memory, we also copy its shadow and shadow type.
+// FIXME: We could provide fixed-size versions that would nicely
+// vectorize for known sizes.
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__nsan_copy_values(const char *daddr, const char *saddr, uptr size) {
+  internal_memmove((void *)getShadowTypeAddrFor(daddr),
+                   getShadowTypeAddrFor(saddr), size);
+  internal_memmove((void *)getShadowAddrFor(daddr), getShadowAddrFor(saddr),
+                   size * kShadowScale);
+}
+
+// FIXME: We could provide fixed-size versions that would nicely
+// vectorize for known sizes.
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__nsan_set_value_unknown(const char *addr, uptr size) {
+  internal_memset((void *)getShadowTypeAddrFor(addr), 0, size);
+}
+
+namespace __nsan {
+
+const char *FTInfo<float>::kCppTypeName = "float";
+const char *FTInfo<double>::kCppTypeName = "double";
+const char *FTInfo<long double>::kCppTypeName = "long double";
+const char *FTInfo<__float128>::kCppTypeName = "__float128";
+
+const char FTInfo<float>::kTypePattern[sizeof(float)];
+const char FTInfo<double>::kTypePattern[sizeof(double)];
+const char FTInfo<long double>::kTypePattern[sizeof(long double)];
+
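The kTypePattern tables encode, per application byte, the value type in the low bits and the byte's position within the value above them. A sketch of the encoding (kValueSizeSizeBits and the k*ValueType constants live in nsan_platform.h, which is not part of this excerpt, so the value 2 is an assumption — it matches the `c & 0x3` / `c >> kValueSizeSizeBits` decoding later in this file):

    // Assumed from nsan_platform.h: type in the low 2 bits, position above.
    constexpr unsigned kAssumedValueSizeSizeBits = 2;

    constexpr unsigned char tagByte(unsigned char ValueType, unsigned Pos) {
      return static_cast<unsigned char>(
          ValueType | (Pos << kAssumedValueSizeSizeBits));
    }
    // A well-formed double at address p has tag bytes
    // tagByte(kDoubleValueType, 0) .. tagByte(kDoubleValueType, 7) at p[0..7],
    // which __nsan_dump_shadow_mem prints back as "d0 d1 ... d7".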
+// Helper for __nsan_dump_shadow_mem: Reads the value at address `Ptr`,
+// identified by its type id.
+template <typename ShadowFT>
+__float128 readShadowInternal(const char *Ptr) {
+  ShadowFT Shadow;
+  __builtin_memcpy(&Shadow, Ptr, sizeof(Shadow));
+  return Shadow;
+}
+
+__float128 readShadow(const char *Ptr, const char ShadowTypeId) {
+  switch (ShadowTypeId) {
+  case 'd':
+    return readShadowInternal<double>(Ptr);
+  case 'l':
+    return readShadowInternal<long double>(Ptr);
+  case 'q':
+    return readShadowInternal<__float128>(Ptr);
+  default:
+    return 0.0;
+  }
+}
+
+class Decorator : public __sanitizer::SanitizerCommonDecorator {
+public:
+  Decorator() : SanitizerCommonDecorator() {}
+  const char *Warning() { return Red(); }
+  const char *Name() { return Green(); }
+  const char *End() { return Default(); }
+};
+
+namespace {
+
+// Workaround for the fact that Printf() does not support floats.
+struct PrintBuffer {
+  char Buffer[64];
+};
+
+template <typename FT> struct FTPrinter {};
+
+template <> struct FTPrinter<double> {
+  static PrintBuffer dec(double Value) {
+    PrintBuffer Result;
+    snprintf(Result.Buffer, sizeof(Result.Buffer) - 1, "%.20f", Value);
+    return Result;
+  }
+  static PrintBuffer hex(double Value) {
+    PrintBuffer Result;
+    snprintf(Result.Buffer, sizeof(Result.Buffer) - 1, "%.20a", Value);
+    return Result;
+  }
+};
+
+template <> struct FTPrinter<float> : FTPrinter<double> {};
+
+template <> struct FTPrinter<long double> {
+  static PrintBuffer dec(long double Value) {
+    PrintBuffer Result;
+    snprintf(Result.Buffer, sizeof(Result.Buffer) - 1, "%.20Lf", Value);
+    return Result;
+  }
+  static PrintBuffer hex(long double Value) {
+    PrintBuffer Result;
+    snprintf(Result.Buffer, sizeof(Result.Buffer) - 1, "%.20La", Value);
+    return Result;
+  }
+};
+
+// FIXME: print with full precision.
+template <> struct FTPrinter<__float128> : FTPrinter<long double> {};
+
+// This is a template so that there are no implicit conversions.
+template <typename FT> inline FT ftAbs(FT V);
+
+template <> inline long double ftAbs(long double V) { return fabsl(V); }
+template <> inline double ftAbs(double V) { return fabs(V); }
+
+// We don't care about nans.
+// std::abs(__float128) code is suboptimal and generates a function call to
+// __getf2().
+template <typename FT> inline FT ftAbs(FT V) { return V >= FT{0} ? V : -V; }
+
+template <typename FT1, typename FT2, bool>
+struct LargestFTImpl {
+  using type = FT2;
+};
+
+template <typename FT1, typename FT2>
+struct LargestFTImpl<FT1, FT2, true> {
+  using type = FT1;
+};
+
+template <typename FT1, typename FT2>
+using LargestFT =
+    typename LargestFTImpl<FT1, FT2, (sizeof(FT1) > sizeof(FT2))>::type;
+
+template <typename T> T max(T a, T b) { return a < b ? b : a; }
+
+} // end anonymous namespace
+
+} // end namespace __nsan
+
+void __sanitizer::BufferedStackTrace::UnwindImpl(uptr pc, uptr bp,
+                                                 void *context,
+                                                 bool request_fast,
+                                                 u32 max_depth) {
+  using namespace __nsan;
+  return Unwind(max_depth, pc, bp, context, 0, 0, false);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__nsan_print_accumulated_stats() {
+  if (nsan_stats)
+    nsan_stats->print();
+}
+
+static void nsanAtexit() {
+  Printf("Numerical Sanitizer exit stats:\n");
+  __nsan_print_accumulated_stats();
+  nsan_stats = nullptr;
+}
+
+// The next three functions return a pointer for storing a shadow value for
+// `n` values, after setting the shadow types. We return the pointer instead
+// of storing ourselves because it avoids having to rely on the calling
+// convention around long double being the same for nsan and the target
+// application. We have to have 3 versions because we need to know which type
+// we are storing since we are setting the type shadow memory.
+template <typename FT>
+static char *getShadowPtrForStore(char *StoreAddr, uptr N) {
+  unsigned char *ShadowType = getShadowTypeAddrFor(StoreAddr);
+  for (uptr I = 0; I < N; ++I) {
+    __builtin_memcpy(ShadowType + I * sizeof(FT), FTInfo<FT>::kTypePattern,
+                     sizeof(FTInfo<FT>::kTypePattern));
+  }
+  return getShadowAddrFor(StoreAddr);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE char *
+__nsan_get_shadow_ptr_for_float_store(char *store_addr, uptr n) {
+  return getShadowPtrForStore<float>(store_addr, n);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE char *
+__nsan_get_shadow_ptr_for_double_store(char *store_addr, uptr n) {
+  return getShadowPtrForStore<double>(store_addr, n);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE char *
+__nsan_get_shadow_ptr_for_longdouble_store(char *store_addr, uptr n) {
+  return getShadowPtrForStore<long double>(store_addr, n);
+}
+
+template <typename FT>
+static bool isValidShadowType(const unsigned char *ShadowType) {
+  return __builtin_memcmp(ShadowType, FTInfo<FT>::kTypePattern, sizeof(FT)) ==
+         0;
+}
+
+template <int kSize, typename T> static bool isZero(const T *Ptr) {
+  constexpr const char kZeros[kSize] = {}; // Zero initialized.
+  return __builtin_memcmp(Ptr, kZeros, kSize) == 0;
+}
+
+template <typename FT>
+static bool isUnknownShadowType(const unsigned char *ShadowType) {
+  return isZero<sizeof(FTInfo<FT>::kTypePattern)>(ShadowType);
+}
+
+// The three following functions check that the address stores a complete
+// shadow value of the given type and return a pointer for loading.
+// They return nullptr if the type of the value is unknown or incomplete.
+template <typename FT>
+static const char *getShadowPtrForLoad(const char *LoadAddr, uptr N) {
+  const unsigned char *const ShadowType = getShadowTypeAddrFor(LoadAddr);
+  for (uptr I = 0; I < N; ++I) {
+    if (!isValidShadowType<FT>(ShadowType + I * sizeof(FT))) {
+      // If loadtracking stats are enabled, log loads with invalid types
+      // (tampered with through type punning).
+      if (flags().enable_loadtracking_stats) {
+        if (isUnknownShadowType<FT>(ShadowType + I * sizeof(FT))) {
+          // Warn only if the value is non-zero. Zero is special because
+          // applications typically initialize large buffers to zero in an
+          // untyped way.
+          if (!isZero<sizeof(FT)>(LoadAddr)) {
+            GET_CALLER_PC_BP;
+            nsan_stats->addUnknownLoadTrackingEvent(pc, bp);
+          }
+        } else {
+          GET_CALLER_PC_BP;
+          nsan_stats->addInvalidLoadTrackingEvent(pc, bp);
+        }
+      }
+      return nullptr;
+    }
+  }
+  return getShadowAddrFor(LoadAddr);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE const char *
+__nsan_get_shadow_ptr_for_float_load(const char *load_addr, uptr n) {
+  return getShadowPtrForLoad<float>(load_addr, n);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE const char *
+__nsan_get_shadow_ptr_for_double_load(const char *load_addr, uptr n) {
+  return getShadowPtrForLoad<double>(load_addr, n);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE const char *
+__nsan_get_shadow_ptr_for_longdouble_load(const char *load_addr, uptr n) {
+  return getShadowPtrForLoad<long double>(load_addr, n);
+}
+
+// Returns the raw shadow pointer. The returned pointer should be considered
+// opaque.
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE char *
+__nsan_internal_get_raw_shadow_ptr(const char *addr) {
+  return getShadowAddrFor(const_cast<char *>(addr));
+}
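Instrumented code pairs every application load/store with a shadow access through the entry points above. A hand-written sketch of what the pass emits for a single float (illustrative only, not actual compiler output; the `uptr` alias is an assumption matching `__sanitizer::uptr` on 64-bit Linux):

    using uptr = __SIZE_TYPE__;
    extern "C" {
    char *__nsan_get_shadow_ptr_for_float_store(char *store_addr, uptr n);
    const char *__nsan_get_shadow_ptr_for_float_load(const char *load_addr,
                                                     uptr n);
    }

    void storeThenLoad(float *P, float V, double VShadow) {
      *P = V; // Application store.
      char *S = __nsan_get_shadow_ptr_for_float_store((char *)P, 1);
      __builtin_memcpy(S, &VShadow, sizeof(VShadow)); // Shadow store.

      float Reloaded = *P;              // Application load.
      double ReloadedShadow = Reloaded; // Fallback: resume from the app value.
      if (const char *L =
              __nsan_get_shadow_ptr_for_float_load((const char *)P, 1))
        __builtin_memcpy(&ReloadedShadow, L, sizeof(ReloadedShadow));
      (void)ReloadedShadow;
    }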
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE char * +__nsan_internal_get_raw_shadow_type_ptr(const char *addr) { + return reinterpret_cast( + getShadowTypeAddrFor(const_cast(addr))); +} + +static ValueType getValueType(unsigned char c) { + return static_cast(c & 0x3); +} + +static int getValuePos(unsigned char c) { return c >> kValueSizeSizeBits; } + +// Checks the consistency of the value types at the given type pointer. +// If the value is inconsistent, returns ValueType::kUnknown. Else, return the +// consistent type. +template +static bool checkValueConsistency(const unsigned char *ShadowType) { + const int Pos = getValuePos(*ShadowType); + // Check that all bytes from the start of the value are ordered. + for (uptr I = 0; I < sizeof(FT); ++I) { + const unsigned char T = *(ShadowType - Pos + I); + if (!(getValueType(T) == FTInfo::kValueType && getValuePos(T) == I)) { + return false; + } + } + return true; +} + +// The instrumentation automatically appends `shadow_value_type_ids`, see +// maybeAddSuffixForNsanInterface. +extern "C" SANITIZER_INTERFACE_ATTRIBUTE void +__nsan_dump_shadow_mem(const char *addr, size_t size_bytes, + size_t bytes_per_line, size_t shadow_value_type_ids) { + const unsigned char *const ShadowType = getShadowTypeAddrFor(addr); + const char *const Shadow = getShadowAddrFor(addr); + + constexpr const int kMaxNumDecodedValues = 16; + __float128 DecodedValues[kMaxNumDecodedValues]; + int NumDecodedValues = 0; + if (bytes_per_line > 4 * kMaxNumDecodedValues) { + bytes_per_line = 4 * kMaxNumDecodedValues; + } + + // We keep track of the current type and position as we go. + ValueType LastValueTy = kUnknownValueType; + int LastPos = -1; + size_t Offset = 0; + for (size_t R = 0; R < (size_bytes + bytes_per_line - 1) / bytes_per_line; ++R) { + printf("%p: ", (void*)(addr + R * bytes_per_line)); + for (size_t C = 0; C < bytes_per_line && Offset < size_bytes; ++C) { + const ValueType ValueTy = getValueType(ShadowType[Offset]); + const int pos = getValuePos(ShadowType[Offset]); + if (ValueTy == LastValueTy && pos == LastPos + 1) { + ++LastPos; + } else { + LastValueTy = ValueTy; + LastPos = pos == 0 ? 
+        LastPos = pos == 0 ? 0 : -1;
+      }
+
+      switch (ValueTy) {
+      case kUnknownValueType:
+        printf("__ ");
+        break;
+      case kFloatValueType:
+        printf("f%x ", pos);
+        if (LastPos == sizeof(float) - 1) {
+          DecodedValues[NumDecodedValues] =
+              readShadow(Shadow + kShadowScale * (Offset + 1 - sizeof(float)),
+                         static_cast<char>(shadow_value_type_ids & 0xff));
+          ++NumDecodedValues;
+        }
+        break;
+      case kDoubleValueType:
+        printf("d%x ", pos);
+        if (LastPos == sizeof(double) - 1) {
+          DecodedValues[NumDecodedValues] = readShadow(
+              Shadow + kShadowScale * (Offset + 1 - sizeof(double)),
+              static_cast<char>((shadow_value_type_ids >> 8) & 0xff));
+          ++NumDecodedValues;
+        }
+        break;
+      case kFp80ValueType:
+        printf("l%x ", pos);
+        if (LastPos == sizeof(long double) - 1) {
+          DecodedValues[NumDecodedValues] = readShadow(
+              Shadow + kShadowScale * (Offset + 1 - sizeof(long double)),
+              static_cast<char>((shadow_value_type_ids >> 16) & 0xff));
+          ++NumDecodedValues;
+        }
+        break;
+      }
+      ++Offset;
+    }
+    for (int I = 0; I < NumDecodedValues; ++I) {
+      printf("  (%s)", FTPrinter<__float128>::dec(DecodedValues[I]).Buffer);
+    }
+    NumDecodedValues = 0;
+    printf("\n");
+  }
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE
+ALIGNED(16)
+THREADLOCAL
+uptr __nsan_shadow_ret_tag = 0;
+
+SANITIZER_INTERFACE_ATTRIBUTE
+ALIGNED(16)
+THREADLOCAL
+char __nsan_shadow_ret_ptr[kMaxVectorWidth * sizeof(__float128)];
+
+SANITIZER_INTERFACE_ATTRIBUTE
+ALIGNED(16)
+THREADLOCAL
+uptr __nsan_shadow_args_tag = 0;
+
+// Maximum number of args. This should be enough for anyone (tm). An
+// alternate scheme is to have the generated code create an alloca and make
+// __nsan_shadow_args_ptr point to the alloca.
+constexpr const int kMaxNumArgs = 128;
+SANITIZER_INTERFACE_ATTRIBUTE
+ALIGNED(16)
+THREADLOCAL
+char __nsan_shadow_args_ptr[kMaxVectorWidth * kMaxNumArgs *
+                            sizeof(__float128)];
+
+enum ContinuationType { // Keep in sync with instrumentation pass.
+  kContinueWithShadow = 0,
+  kResumeFromValue = 1,
+};
+
+// Checks the consistency between the application value and its shadow.
+// Returns kResumeFromValue when the instrumented code should resume
+// computations from the original value rather than the shadow value. This
+// prevents one error from propagating to all subsequent operations. This
+// behaviour is tunable with flags.
+template <typename FT, typename ShadowFT>
+int32_t checkFT(const FT Value, ShadowFT Shadow, CheckTypeT CheckType,
+                uptr CheckArg) {
+  // We do all comparisons in the InternalFT domain, which is the largest FT
+  // type.
+  using InternalFT = LargestFT<FT, ShadowFT>;
+  const InternalFT CheckValue = Value;
+  const InternalFT CheckShadow = Shadow;
+
+  // See this article for an interesting discussion of how to compare floats:
+  // https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
+  static constexpr const FT Eps = FTInfo<FT>::kEpsilon;
+
+  const InternalFT AbsErr = ftAbs(CheckValue - CheckShadow);
+
+  if (flags().enable_check_stats) {
+    GET_CALLER_PC_BP;
+    // We are re-computing `Largest` here because this is a cold branch, and
+    // we want to avoid having to move the computation of `Largest` before
+    // the absolute value check when this branch is not taken.
+    const InternalFT Largest = max(ftAbs(CheckValue), ftAbs(CheckShadow));
+    nsan_stats->addCheck(CheckType, pc, bp, AbsErr / Largest);
+  }
+  // Note: writing the comparison that way ensures that when `AbsErr` is NaN
+  // (value and shadow are inf or -inf), we pass the test.
+  if (!(AbsErr >= flags().cached_absolute_error_threshold))
+    return kContinueWithShadow;
+
+  const InternalFT Largest = max(ftAbs(CheckValue), ftAbs(CheckShadow));
+  if (AbsErr * (1ull << flags().log2_max_relative_error) <= Largest)
+    return kContinueWithShadow; // No problem here.
+
+  if (!flags().disable_warnings) {
+    GET_CALLER_PC_BP;
+    BufferedStackTrace stack;
+    stack.Unwind(pc, bp, nullptr, false);
+    if (GetSuppressionForStack(&stack, kSuppressionConsistency)) {
+      // FIXME: optionally print.
+      return flags().resume_after_suppression ? kResumeFromValue
+                                              : kContinueWithShadow;
+    }
+
+    Decorator D;
+    Printf("%s", D.Warning());
+    // Printf does not support float formatting.
+    char RelErrBuf[64] = "inf";
+    if (Largest > Eps) {
+      snprintf(RelErrBuf, sizeof(RelErrBuf) - 1, "%.20Lf%% (2^%.0Lf epsilons)",
+               static_cast<long double>(100.0 * AbsErr / Largest),
+               log2l(static_cast<long double>(AbsErr / Largest / Eps)));
+    }
+    char UlpErrBuf[128] = "";
+    const double ShadowUlpDiff = getULPDiff(CheckValue, CheckShadow);
+    if (ShadowUlpDiff != kMaxULPDiff) {
+      // This is the ULP diff in the internal domain. The user actually cares
+      // about that in the original domain.
+      const double UlpDiff =
+          ShadowUlpDiff / (u64{1} << (FTInfo<InternalFT>::kMantissaBits -
+                                      FTInfo<FT>::kMantissaBits));
+      snprintf(UlpErrBuf, sizeof(UlpErrBuf) - 1,
+               "(%.0f ULPs == %.1f digits == %.1f bits)", UlpDiff,
+               log10(UlpDiff), log2(UlpDiff));
+    }
+    Printf("WARNING: NumericalStabilitySanitizer: inconsistent shadow results");
+    switch (CheckType) {
+    case CheckTypeT::kUnknown:
+    case CheckTypeT::kFcmp:
+    case CheckTypeT::kMaxCheckType:
+      break;
+    case CheckTypeT::kRet:
+      Printf(" while checking return value");
+      break;
+    case CheckTypeT::kArg:
+      Printf(" while checking call argument #%d", static_cast<int>(CheckArg));
+      break;
+    case CheckTypeT::kLoad:
+      Printf(" while checking load from address 0x%zx. This is due to "
+             "incorrect shadow memory tracking, typically due to "
+             "uninstrumented code writing to memory.",
+             CheckArg);
+      break;
+    case CheckTypeT::kStore:
+      Printf(" while checking store to address 0x%zx", CheckArg);
+      break;
+    case CheckTypeT::kInsert:
+      Printf(" while checking vector insert");
+      break;
+    case CheckTypeT::kUser:
+      Printf(" in user-initiated check");
+      break;
+    }
+    using ValuePrinter = FTPrinter<FT>;
+    using ShadowPrinter = FTPrinter<ShadowFT>;
+    Printf("\n"
+           "%-12s precision  (native): dec: %s  hex: %s\n"
+           "%-12s precision  (shadow): dec: %s  hex: %s\n"
+           "shadow truncated to %-12s: dec: %s  hex: %s\n"
+           "Relative error: %s\n"
+           "Absolute error: %s\n"
+           "%s\n",
+           FTInfo<FT>::kCppTypeName, ValuePrinter::dec(Value).Buffer,
+           ValuePrinter::hex(Value).Buffer,
+           FTInfo<ShadowFT>::kCppTypeName, ShadowPrinter::dec(Shadow).Buffer,
+           ShadowPrinter::hex(Shadow).Buffer,
+           FTInfo<FT>::kCppTypeName, ValuePrinter::dec(Shadow).Buffer,
+           ValuePrinter::hex(Shadow).Buffer, RelErrBuf,
+           ValuePrinter::hex(AbsErr).Buffer, UlpErrBuf, D.End());
+    stack.Print();
+  }
+
+  if (flags().enable_warning_stats) {
+    GET_CALLER_PC_BP;
+    nsan_stats->addWarning(CheckType, pc, bp, AbsErr / Largest);
+  }
+
+  if (flags().halt_on_error) {
+    Printf("Exiting\n");
+    Die();
+  }
+  return flags().resume_after_warning ? kResumeFromValue : kContinueWithShadow;
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE int32_t
+__nsan_internal_check_float_d(float value, double shadow, int32_t check_type,
+                              uptr check_arg) {
+  return checkFT(value, shadow, static_cast<CheckTypeT>(check_type),
+                 check_arg);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE int32_t
+__nsan_internal_check_double_l(double value, long double shadow,
+                               int32_t check_type, uptr check_arg) {
+  return checkFT(value, shadow, static_cast<CheckTypeT>(check_type),
+                 check_arg);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE int32_t
+__nsan_internal_check_double_q(double value, __float128 shadow,
+                               int32_t check_type, uptr check_arg) {
+  return checkFT(value, shadow, static_cast<CheckTypeT>(check_type),
+                 check_arg);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE int32_t
+__nsan_internal_check_longdouble_q(long double value, __float128 shadow,
+                                   int32_t check_type, uptr check_arg) {
+  return checkFT(value, shadow, static_cast<CheckTypeT>(check_type),
+                 check_arg);
+}
+
+static const char *getTruthValueName(bool v) { return v ? "true" : "false"; }
+
+// This uses the same values as CmpInst::Predicate.
+static const char *getPredicateName(int v) {
+  switch (v) {
+  case 0:
+    return "(false)";
+  case 1:
+    return "==";
+  case 2:
+    return ">";
+  case 3:
+    return ">=";
+  case 4:
+    return "<";
+  case 5:
+    return "<=";
+  case 6:
+    return "!=";
+  case 7:
+    return "(ordered)";
+  case 8:
+    return "(unordered)";
+  case 9:
+    return "==";
+  case 10:
+    return ">";
+  case 11:
+    return ">=";
+  case 12:
+    return "<";
+  case 13:
+    return "<=";
+  case 14:
+    return "!=";
+  case 15:
+    return "(true)";
+  }
+  return "??";
+}
+
+template <typename FT, typename ShadowFT>
+void fCmpFailFT(const FT Lhs, const FT Rhs, ShadowFT LhsShadow,
+                ShadowFT RhsShadow, int Predicate, bool Result,
+                bool ShadowResult) {
+  if (Result == ShadowResult) {
+    // When a vector comparison fails, we fail each element of the comparison
+    // to simplify instrumented code. Skip elements where the shadow
+    // comparison gave the same result as the original one.
+    return;
+  }
+
+  GET_CALLER_PC_BP;
+  BufferedStackTrace Stack;
+  Stack.Unwind(pc, bp, nullptr, false);
+
+  if (GetSuppressionForStack(&Stack, kSuppressionFcmp)) {
+    // FIXME: optionally print.
+    return;
+  }
+
+  if (flags().enable_warning_stats) {
+    nsan_stats->addWarning(CheckTypeT::kFcmp, pc, bp, 0.0);
+  }
+
+  if (flags().disable_warnings) {
+    return;
+  }
+
+  // FIXME: ideally we would print the shadow value as FP128. Right now
+  // because we truncate to long double we can sometimes see stuff like:
+  //   shadow == (false)
+  using ValuePrinter = FTPrinter<FT>;
+  using ShadowPrinter = FTPrinter<ShadowFT>;
+  Decorator D;
+  const char *const PredicateName = getPredicateName(Predicate);
+  Printf("%s", D.Warning());
+  Printf("WARNING: NumericalStabilitySanitizer: floating-point comparison "
+         "results depend on precision\n"
+         "%-12s precision dec (native): %s %s %s (%s)\n"
+         "%-12s precision dec (shadow): %s %s %s (%s)\n"
+         "%-12s precision hex (native): %s %s %s (%s)\n"
+         "%-12s precision hex (shadow): %s %s %s (%s)\n"
+         "%s",
+         // Native, decimal.
+         FTInfo<FT>::kCppTypeName, ValuePrinter::dec(Lhs).Buffer,
+         PredicateName, ValuePrinter::dec(Rhs).Buffer,
+         getTruthValueName(Result),
+         // Shadow, decimal.
+         FTInfo<ShadowFT>::kCppTypeName, ShadowPrinter::dec(LhsShadow).Buffer,
+         PredicateName, ShadowPrinter::dec(RhsShadow).Buffer,
+         getTruthValueName(ShadowResult),
+         // Native, hex.
+         FTInfo<FT>::kCppTypeName, ValuePrinter::hex(Lhs).Buffer,
+         PredicateName, ValuePrinter::hex(Rhs).Buffer,
+         getTruthValueName(Result),
+         // Shadow, hex.
+         FTInfo<ShadowFT>::kCppTypeName, ShadowPrinter::hex(LhsShadow).Buffer,
+         PredicateName, ShadowPrinter::hex(RhsShadow).Buffer,
+         getTruthValueName(ShadowResult), D.End());
+  Printf("%s", D.Default());
+  Stack.Print();
+  if (flags().halt_on_error) {
+    Printf("Exiting\n");
+    Die();
+  }
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__nsan_fcmp_fail_float_d(float lhs, float rhs, double lhs_shadow,
+                         double rhs_shadow, int predicate, bool result,
+                         bool shadow_result) {
+  fCmpFailFT(lhs, rhs, lhs_shadow, rhs_shadow, predicate, result,
+             shadow_result);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__nsan_fcmp_fail_double_q(double lhs, double rhs, __float128 lhs_shadow,
+                          __float128 rhs_shadow, int predicate, bool result,
+                          bool shadow_result) {
+  fCmpFailFT(lhs, rhs, lhs_shadow, rhs_shadow, predicate, result,
+             shadow_result);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__nsan_fcmp_fail_double_l(double lhs, double rhs, long double lhs_shadow,
+                          long double rhs_shadow, int predicate, bool result,
+                          bool shadow_result) {
+  fCmpFailFT(lhs, rhs, lhs_shadow, rhs_shadow, predicate, result,
+             shadow_result);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__nsan_fcmp_fail_longdouble_q(long double lhs, long double rhs,
+                              __float128 lhs_shadow, __float128 rhs_shadow,
+                              int predicate, bool result, bool shadow_result) {
+  fCmpFailFT(lhs, rhs, lhs_shadow, rhs_shadow, predicate, result,
+             shadow_result);
+}
+
+template <typename FT> void checkFTFromShadowStack(const FT Value) {
+  // Get the shadow 2FT value from the shadow stack. Note that
+  // __nsan_check_{float,double,longdouble} is a function like any other, so
+  // the instrumentation will have placed the shadow value on the shadow
+  // stack.
+  using ShadowFT = typename FTInfo<FT>::shadow_type;
+  ShadowFT Shadow;
+  __builtin_memcpy(&Shadow, __nsan_shadow_args_ptr, sizeof(ShadowFT));
+  checkFT(Value, Shadow, CheckTypeT::kUser, 0);
+}
+
+// FIXME: Add suffixes and let the instrumentation pass automatically add
+// suffixes.
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void __nsan_check_float(float Value) {
+  assert(__nsan_shadow_args_tag == (uptr)&__nsan_check_float &&
+         "__nsan_check_float called from non-instrumented function");
+  checkFTFromShadowStack(Value);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__nsan_check_double(double Value) {
+  assert(__nsan_shadow_args_tag == (uptr)&__nsan_check_double &&
+         "__nsan_check_double called from non-instrumented function");
+  checkFTFromShadowStack(Value);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__nsan_check_longdouble(long double Value) {
+  assert(__nsan_shadow_args_tag == (uptr)&__nsan_check_longdouble &&
+         "__nsan_check_longdouble called from non-instrumented function");
+  checkFTFromShadowStack(Value);
+}
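The asserts above encode the shadow-argument TLS protocol: before the call, the instrumented caller tags __nsan_shadow_args_tag with the callee's address and spills the shadow arguments to __nsan_shadow_args_ptr. A sketch of the caller side that the pass normally emits (illustrative; the declarations are abbreviated versions of the thread-local definitions earlier in this file, and the `uptr` alias is an assumption):

    using uptr = __SIZE_TYPE__;
    extern "C" {
    extern __thread uptr __nsan_shadow_args_tag;
    extern __thread char __nsan_shadow_args_ptr[];
    void __nsan_check_float(float value);
    }

    void checkWithShadow(float Value, double Shadow) {
      __builtin_memcpy(__nsan_shadow_args_ptr, &Shadow, sizeof(Shadow));
      __nsan_shadow_args_tag = (uptr)&__nsan_check_float; // Satisfies assert.
      __nsan_check_float(Value); // Callee reads the double shadow from TLS.
    }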
+template <typename FT> static void dumpFTFromShadowStack(const FT Value) {
+  // Get the shadow 2FT value from the shadow stack. Note that
+  // __nsan_dump_{float,double,longdouble} is a function like any other, so
+  // the instrumentation will have placed the shadow value on the shadow
+  // stack.
+  using ShadowFT = typename FTInfo<FT>::shadow_type;
+  ShadowFT Shadow;
+  __builtin_memcpy(&Shadow, __nsan_shadow_args_ptr, sizeof(ShadowFT));
+  using ValuePrinter = FTPrinter<FT>;
+  using ShadowPrinter = FTPrinter<typename FTInfo<FT>::shadow_type>;
+  printf("value  dec:%s  hex:%s\n"
+         "shadow dec:%s  hex:%s\n",
+         ValuePrinter::dec(Value).Buffer, ValuePrinter::hex(Value).Buffer,
+         ShadowPrinter::dec(Shadow).Buffer, ShadowPrinter::hex(Shadow).Buffer);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void __nsan_dump_float(float Value) {
+  assert(__nsan_shadow_args_tag == (uptr)&__nsan_dump_float &&
+         "__nsan_dump_float called from non-instrumented function");
+  dumpFTFromShadowStack(Value);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__nsan_dump_double(double Value) {
+  assert(__nsan_shadow_args_tag == (uptr)&__nsan_dump_double &&
+         "__nsan_dump_double called from non-instrumented function");
+  dumpFTFromShadowStack(Value);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__nsan_dump_longdouble(long double Value) {
+  assert(__nsan_shadow_args_tag == (uptr)&__nsan_dump_longdouble &&
+         "__nsan_dump_longdouble called from non-instrumented function");
+  dumpFTFromShadowStack(Value);
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void __nsan_dump_shadow_ret() {
+  printf("ret tag: %lx\n", __nsan_shadow_ret_tag);
+  double V;
+  __builtin_memcpy(&V, __nsan_shadow_ret_ptr, sizeof(double));
+  printf("double value: %f\n", V);
+  // FIXME: float128 value.
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void __nsan_dump_shadow_args() {
+  printf("args tag: %lx\n", __nsan_shadow_args_tag);
+}
+
+namespace __nsan {
+bool NsanInitialized = false;
+bool NsanInitIsRunning;
+} // end namespace __nsan
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void __nsan_init() {
+  CHECK(!NsanInitIsRunning);
+  if (NsanInitialized)
+    return;
+  NsanInitIsRunning = true;
+
+  InitializeFlags();
+  InitializeSuppressions();
+  InitializePlatformEarly();
+
+  if (!MmapFixedNoReserve(TypesAddr(), UnusedAddr() - TypesAddr()))
+    Die();
+
+  initializeInterceptors();
+
+  initializeStats();
+  if (flags().print_stats_on_exit)
+    Atexit(nsanAtexit);
+
+  NsanInitIsRunning = false;
+  NsanInitialized = true;
+}
+
+#if SANITIZER_CAN_USE_PREINIT_ARRAY
+__attribute__((section(".preinit_array"),
+               used)) static void (*nsan_init_ptr)() = __nsan_init;
+#endif
diff --git a/compiler-rt/lib/nsan/nsan.syms.extra b/compiler-rt/lib/nsan/nsan.syms.extra
new file mode 100644
--- /dev/null
+++ b/compiler-rt/lib/nsan/nsan.syms.extra
@@ -0,0 +1,2 @@
+nsan_*
+__nsan_*
\ No newline at end of file
diff --git a/compiler-rt/lib/nsan/nsan_flags.h b/compiler-rt/lib/nsan/nsan_flags.h
new file mode 100644
--- /dev/null
+++ b/compiler-rt/lib/nsan/nsan_flags.h
@@ -0,0 +1,35 @@
+//===-- nsan_flags.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of NumericalStabilitySanitizer.
+//===----------------------------------------------------------------------===// + +#ifndef NSAN_FLAGS_H +#define NSAN_FLAGS_H + +namespace __nsan { + +struct Flags { +#define NSAN_FLAG(Type, Name, DefaultValue, Description) Type Name; +#include "nsan_flags.inc" +#undef NSAN_FLAG + + double cached_absolute_error_threshold = 0.0; + + void SetDefaults(); + void PopulateCache(); +}; + +extern Flags flags_data; +inline Flags &flags() { return flags_data; } + +void InitializeFlags(); + +} // namespace __nsan + +#endif diff --git a/compiler-rt/lib/nsan/nsan_flags.cc b/compiler-rt/lib/nsan/nsan_flags.cc new file mode 100644 --- /dev/null +++ b/compiler-rt/lib/nsan/nsan_flags.cc @@ -0,0 +1,78 @@ +//===-- nsan_flags.cc -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of NumericalStabilitySanitizer. +// +//===----------------------------------------------------------------------===// + +#include "nsan_flags.h" + +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_flags.h" + +namespace __nsan { + +SANITIZER_INTERFACE_WEAK_DEF(const char *, __nsan_default_options, void) { + return ""; +} + +using namespace __sanitizer; + +Flags flags_data; + +void Flags::SetDefaults() { +#define NSAN_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue; +#include "nsan_flags.inc" +#undef NSAN_FLAG +} + +void Flags::PopulateCache() { + cached_absolute_error_threshold = + 1.0 / (1ull << log2_absolute_error_threshold); +} + +static void RegisterNSanFlags(FlagParser *parser, Flags *f) { +#define NSAN_FLAG(Type, Name, DefaultValue, Description) \ + RegisterFlag(parser, #Name, Description, &f->Name); +#include "nsan_flags.inc" +#undef NSAN_FLAG +} + +static const char *MaybeCallNsanDefaultOptions() { + return (&__nsan_default_options) ? __nsan_default_options() : ""; +} + +void InitializeFlags() { + SetCommonFlagsDefaults(); + { + CommonFlags cf; + cf.CopyFrom(*common_flags()); + cf.external_symbolizer_path = GetEnv("NSAN_SYMBOLIZER_PATH"); + OverrideCommonFlags(cf); + } + + flags().SetDefaults(); + + FlagParser parser; + RegisterCommonFlags(&parser); + RegisterNSanFlags(&parser, &flags()); + + const char *nsan_default_options = MaybeCallNsanDefaultOptions(); + parser.ParseString(nsan_default_options); + + parser.ParseString(GetEnv("NSAN_OPTIONS")); + InitializeCommonFlags(); + if (Verbosity()) + ReportUnrecognizedFlags(); + if (common_flags()->help) + parser.PrintFlagDescriptions(); + + flags().PopulateCache(); +} + +} // namespace __nsan diff --git a/compiler-rt/lib/nsan/nsan_flags.inc b/compiler-rt/lib/nsan/nsan_flags.inc new file mode 100644 --- /dev/null +++ b/compiler-rt/lib/nsan/nsan_flags.inc @@ -0,0 +1,49 @@ +//===-- nsan_flags.inc ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// NSan runtime flags. 
+//
+//===----------------------------------------------------------------------===//
+#ifndef NSAN_FLAG
+# error "Define NSAN_FLAG prior to including this file!"
+#endif
+
+// NSAN_FLAG(Type, Name, DefaultValue, Description)
+// See COMMON_FLAG in sanitizer_flags.inc for more details.
+
+NSAN_FLAG(bool, halt_on_error, true, "If true, halt after the first error.")
+NSAN_FLAG(bool, resume_after_warning, true,
+          "If true, we resume the computation from the original "
+          "application floating-point value after a warning. If false, "
+          "computations continue with the shadow value.")
+NSAN_FLAG(const char *, suppressions, "", "Suppressions file name.")
+NSAN_FLAG(bool, resume_after_suppression, true,
+          "If true, a suppression will also resume the computation from the FT"
+          " domain. If false, output is suppressed but the shadow value is"
+          " retained.")
+// FIXME: should this be specified in units of epsilon instead?
+NSAN_FLAG(int, log2_max_relative_error, 19,
+          "Log2 maximum admissible relative error, e.g. 19 means max relative "
+          "error of 1/2^19 ~= 0.000002.")
+NSAN_FLAG(int, log2_absolute_error_threshold, 32,
+          "Log2 maximum admissible absolute error. Any numbers closer than "
+          "1/2^n are considered to be the same.")
+NSAN_FLAG(bool, disable_warnings, false,
+          "If true, disable warning printing. This is useful to only compute "
+          "stats.")
+NSAN_FLAG(bool, enable_check_stats, false,
+          "If true, compute check stats, i.e. for each line, the number of "
+          "times a check was performed on this line.")
+NSAN_FLAG(bool, enable_warning_stats, false,
+          "If true, compute warning stats, i.e. for each line, the number of "
+          "times a warning was emitted for this line.")
+NSAN_FLAG(bool, enable_loadtracking_stats, false,
+          "If true, compute load tracking stats, i.e. for each load from "
+          "memory, the number of times nsan resumed from the original value "
+          "due to invalid or unknown types.")
+NSAN_FLAG(bool, print_stats_on_exit, false, "If true, print stats on exit.")
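How the two error thresholds above combine at check time, in a simplified model of checkFT from nsan.cc (illustrative sketch that ignores the NaN subtleties; the same knobs can also be set at run time through the NSAN_OPTIONS environment variable parsed in nsan_flags.cc, e.g. NSAN_OPTIONS=log2_max_relative_error=10:halt_on_error=0):

    #include <cmath>

    // Warn only when the error is large both in absolute and relative terms.
    bool wouldWarn(double value, double shadow, int log2Abs = 32,
                   int log2Rel = 19) {
      const double absErr = std::fabs(value - shadow);
      if (absErr < 1.0 / (1ull << log2Abs))
        return false; // Closer than 1/2^32: considered equal.
      const double largest = std::fmax(std::fabs(value), std::fabs(shadow));
      return absErr * (1ull << log2Rel) > largest; // Relative error > 1/2^19.
    }
    // wouldWarn(1.0, 1.0 + 1e-9) -> false; wouldWarn(1.0, 1.01) -> true.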
+//
+//===----------------------------------------------------------------------===//
+
+#include "interception/interception.h"
+#include "nsan/nsan.h"
+#include "sanitizer_common/sanitizer_common.h"
+
+#include <wchar.h>
+
+#if SANITIZER_LINUX
+extern "C" int mallopt(int param, int value);
+#endif
+
+using namespace __sanitizer;
+using __nsan::NsanInitialized;
+using __nsan::NsanInitIsRunning;
+
+static constexpr uptr kEarlyAllocBufSize = 16384;
+static uptr AllocatedBytes;
+static char EarlyAllocBuf[kEarlyAllocBufSize];
+
+static bool isInEarlyAllocBuf(const void *Ptr) {
+  return ((uptr)Ptr >= (uptr)EarlyAllocBuf &&
+          ((uptr)Ptr - (uptr)EarlyAllocBuf) < sizeof(EarlyAllocBuf));
+}
+
+static char *toCharPtr(wchar_t *ptr) { return reinterpret_cast<char *>(ptr); }
+static const char *toCharPtr(const wchar_t *ptr) {
+  return reinterpret_cast<const char *>(ptr);
+}
+
+template <typename T>
+T min(T a, T b) {
+  return a < b ? a : b;
+}
+
+// Handle allocation requests early (before all interceptors are setup). dlsym,
+// for example, calls calloc.
+static void *handleEarlyAlloc(uptr Size) {
+  void *const Mem = (void *)&EarlyAllocBuf[AllocatedBytes];
+  AllocatedBytes += Size;
+  CHECK_LT(AllocatedBytes, kEarlyAllocBufSize);
+  return Mem;
+}
+
+INTERCEPTOR(void *, memset, void *Dst, int V, uptr Size) {
+  // NOTE: This guard is needed because nsan's initialization code might call
+  // memset.
+  if (!NsanInitialized && REAL(memset) == nullptr)
+    return internal_memset(Dst, V, Size);
+
+  void *const Res = REAL(memset)(Dst, V, Size);
+  __nsan_set_value_unknown(static_cast<char *>(Dst), Size);
+  return Res;
+}
+
+INTERCEPTOR(wchar_t *, wmemset, wchar_t *Dst, wchar_t V, uptr Size) {
+  wchar_t *const Res = REAL(wmemset)(Dst, V, Size);
+  __nsan_set_value_unknown(toCharPtr(Dst), sizeof(wchar_t) * Size);
+  return Res;
+}
+
+INTERCEPTOR(void *, memmove, void *Dst, const void *Src, uptr Size) {
+  // NOTE: This guard is needed because nsan's initialization code might call
+  // memmove.
+  if (!NsanInitialized && REAL(memmove) == nullptr)
+    return internal_memmove(Dst, Src, Size);
+
+  void *const Res = REAL(memmove)(Dst, Src, Size);
+  __nsan_copy_values(static_cast<char *>(Dst), static_cast<const char *>(Src),
+                     Size);
+  return Res;
+}
+
+INTERCEPTOR(wchar_t *, wmemmove, wchar_t *Dst, const wchar_t *Src, uptr Size) {
+  wchar_t *const Res = REAL(wmemmove)(Dst, Src, Size);
+  __nsan_copy_values(toCharPtr(Dst), toCharPtr(Src), sizeof(wchar_t) * Size);
+  return Res;
+}
+
+INTERCEPTOR(void *, memcpy, void *Dst, const void *Src, uptr Size) {
+  // NOTE: This guard is needed because nsan's initialization code might call
+  // memcpy.
+  if (!NsanInitialized && REAL(memcpy) == nullptr) {
+    // memmove is used here because on some platforms this will also
+    // intercept the memmove implementation.
+    return internal_memmove(Dst, Src, Size);
+  }
+
+  void *const Res = REAL(memcpy)(Dst, Src, Size);
+  __nsan_copy_values(static_cast<char *>(Dst), static_cast<const char *>(Src),
+                     Size);
+  return Res;
+}
+
+INTERCEPTOR(wchar_t *, wmemcpy, wchar_t *Dst, const wchar_t *Src, uptr Size) {
+  wchar_t *const Res = REAL(wmemcpy)(Dst, Src, Size);
+  __nsan_copy_values(toCharPtr(Dst), toCharPtr(Src), sizeof(wchar_t) * Size);
+  return Res;
+}
+
+INTERCEPTOR(void *, malloc, uptr Size) {
+  // NOTE: This guard is needed because nsan's initialization code might call
+  // malloc.
+  if (NsanInitIsRunning && REAL(malloc) == nullptr)
+    return handleEarlyAlloc(Size);
+
+  void *const Res = REAL(malloc)(Size);
+  if (Res)
+    __nsan_set_value_unknown(static_cast<char *>(Res), Size);
+  return Res;
+}
+
+INTERCEPTOR(void *, realloc, void *Ptr, uptr Size) {
+  void *const Res = REAL(realloc)(Ptr, Size);
+  // FIXME: We might want to copy the types from the original allocation
+  // (although that would require that we know its size).
+  if (Res)
+    __nsan_set_value_unknown(static_cast<char *>(Res), Size);
+  return Res;
+}
+
+INTERCEPTOR(void *, calloc, uptr Nmemb, uptr Size) {
+  // NOTE: This guard is needed because nsan's initialization code might call
+  // calloc.
+  if (NsanInitIsRunning && REAL(calloc) == nullptr) {
+    // Note: EarlyAllocBuf is initialized with zeros.
+    return handleEarlyAlloc(Nmemb * Size);
+  }
+
+  void *const Res = REAL(calloc)(Nmemb, Size);
+  if (Res)
+    __nsan_set_value_unknown(static_cast<char *>(Res), Nmemb * Size);
+  return Res;
+}
+
+INTERCEPTOR(void, free, void *P) {
+  // There are only a few early allocation requests, so we simply skip the free.
+  if (isInEarlyAllocBuf(P))
+    return;
+  REAL(free)(P);
+}
+
+INTERCEPTOR(void *, valloc, uptr Size) {
+  void *const Res = REAL(valloc)(Size);
+  if (Res)
+    __nsan_set_value_unknown(static_cast<char *>(Res), Size);
+  return Res;
+}
+
+INTERCEPTOR(void *, memalign, uptr Alignment, uptr Size) {
+  void *const Res = REAL(memalign)(Alignment, Size);
+  if (Res)
+    __nsan_set_value_unknown(static_cast<char *>(Res), Size);
+  return Res;
+}
+
+INTERCEPTOR(void *, __libc_memalign, uptr Alignment, uptr Size) {
+  void *const Res = REAL(__libc_memalign)(Alignment, Size);
+  if (Res)
+    __nsan_set_value_unknown(static_cast<char *>(Res), Size);
+  return Res;
+}
+
+INTERCEPTOR(void *, pvalloc, uptr Size) {
+  void *const Res = REAL(pvalloc)(Size);
+  if (Res)
+    __nsan_set_value_unknown(static_cast<char *>(Res), Size);
+  return Res;
+}
+
+INTERCEPTOR(void *, aligned_alloc, uptr Alignment, uptr Size) {
+  void *const Res = REAL(aligned_alloc)(Alignment, Size);
+  if (Res)
+    __nsan_set_value_unknown(static_cast<char *>(Res), Size);
+  return Res;
+}
+
+INTERCEPTOR(int, posix_memalign, void **Memptr, uptr Alignment, uptr Size) {
+  int Res = REAL(posix_memalign)(Memptr, Alignment, Size);
+  if (Res == 0 && *Memptr)
+    __nsan_set_value_unknown(static_cast<char *>(*Memptr), Size);
+  return Res;
+}
+
+INTERCEPTOR(char *, strfry, char *S) {
+  const auto Len = internal_strlen(S);
+  char *const Res = REAL(strfry)(S);
+  if (Res)
+    __nsan_set_value_unknown(S, Len);
+  return Res;
+}
+
+INTERCEPTOR(char *, strsep, char **Stringp, const char *Delim) {
+  char *const OrigStringp = REAL(strsep)(Stringp, Delim);
+  // *Stringp is null when no delimiter was found.
+  if (Stringp != nullptr && *Stringp != nullptr) {
+    // The previous character has been overwritten with a '\0' char.
+    __nsan_set_value_unknown(*Stringp - 1, 1);
+  }
+  return OrigStringp;
+}
+
+INTERCEPTOR(char *, strtok, char *Str, const char *Delim) {
+  // This is overly conservative, but the probability that modern code is using
+  // strtok on double data is essentially zero anyway.
+  if (Str)
+    __nsan_set_value_unknown(Str, internal_strlen(Str));
+  return REAL(strtok)(Str, Delim);
+}
+
+static void nsanCopyZeroTerminated(const char *Dst, const char *Src, uptr N) {
+  __nsan_copy_values(Dst, Src, N);      // Data.
+  __nsan_set_value_unknown(Dst + N, 1); // Terminator.
+} + +static void nsanWCopyZeroTerminated(const wchar_t *Dst, const wchar_t *Src, + uptr N) { + __nsan_copy_values(toCharPtr(Dst), toCharPtr(Src), sizeof(wchar_t) * N); + __nsan_set_value_unknown(toCharPtr(Dst + N), sizeof(wchar_t)); +} + +INTERCEPTOR(char *, strdup, const char *S) { + char *const Res = REAL(strdup)(S); + if (Res) { + nsanCopyZeroTerminated(Res, S, internal_strlen(S)); + } + return Res; +} + +INTERCEPTOR(wchar_t *, wcsdup, const wchar_t *S) { + wchar_t *const Res = REAL(wcsdup)(S); + if (Res) { + nsanWCopyZeroTerminated(Res, S, wcslen(S)); + } + return Res; +} + +INTERCEPTOR(char *, strndup, const char *S, uptr Size) { + char *const Res = REAL(strndup)(S, Size); + if (Res) { + nsanCopyZeroTerminated(Res, S, min(internal_strlen(S), Size)); + } + return Res; +} + +INTERCEPTOR(char *, strcpy, char *Dst, const char *Src) { + char *const Res = REAL(strcpy)(Dst, Src); + nsanCopyZeroTerminated(Dst, Src, internal_strlen(Src)); + return Res; +} + +INTERCEPTOR(wchar_t *, wcscpy, wchar_t *Dst, const wchar_t *Src) { + wchar_t *const Res = REAL(wcscpy)(Dst, Src); + nsanWCopyZeroTerminated(Dst, Src, wcslen(Src)); + return Res; +} + +INTERCEPTOR(char *, strncpy, char *Dst, const char *Src, uptr Size) { + char *const Res = REAL(strncpy)(Dst, Src, Size); + nsanCopyZeroTerminated(Dst, Src, min(Size, internal_strlen(Src))); + return Res; +} + +INTERCEPTOR(char *, strcat, char *Dst, const char *Src) { + const auto DstLenBeforeCat = internal_strlen(Dst); + char *const Res = REAL(strcat)(Dst, Src); + nsanCopyZeroTerminated(Dst + DstLenBeforeCat, Src, internal_strlen(Src)); + return Res; +} + +INTERCEPTOR(wchar_t *, wcscat, wchar_t *Dst, const wchar_t *Src) { + const auto DstLenBeforeCat = wcslen(Dst); + wchar_t *const Res = REAL(wcscat)(Dst, Src); + nsanWCopyZeroTerminated(Dst + DstLenBeforeCat, Src, wcslen(Src)); + return Res; +} + +INTERCEPTOR(char *, strncat, char *Dst, const char *Src, uptr Size) { + const auto DstLen = internal_strlen(Dst); + char *const Res = REAL(strncat)(Dst, Src, Size); + nsanCopyZeroTerminated(Dst + DstLen, Src, + min(Size, internal_strlen(Src))); + return Res; +} + +INTERCEPTOR(char *, stpcpy, char *Dst, const char *Src) { + char *const Res = REAL(stpcpy)(Dst, Src); + nsanCopyZeroTerminated(Dst, Src, internal_strlen(Src)); + return Res; +} + +INTERCEPTOR(wchar_t *, wcpcpy, wchar_t *Dst, const wchar_t *Src) { + wchar_t *const Res = REAL(wcpcpy)(Dst, Src); + nsanWCopyZeroTerminated(Dst, Src, wcslen(Src)); + return Res; +} + +INTERCEPTOR(uptr, strxfrm, char *Dst, const char *Src, uptr Size) { + // This is overly conservative, but this function should very rarely be used. + __nsan_set_value_unknown(Dst, internal_strlen(Dst)); + const uptr Res = REAL(strxfrm)(Dst, Src, Size); + return Res; +} + +namespace __nsan { +void initializeInterceptors() { + static bool Initialized = false; + CHECK(!Initialized); + + // Instruct libc malloc to consume less memory. 
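+  // In glibc's <malloc.h>, parameter 1 is M_MXFAST and -3 is M_MMAP_THRESHOLD:
+  // setting M_MXFAST to 0 disables fastbins, and lowering M_MMAP_THRESHOLD to
+  // 32KiB makes glibc serve larger allocations through mmap, so that freed
+  // blocks are returned to the OS instead of being cached.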
+#if SANITIZER_LINUX
+  mallopt(1, 0);          // M_MXFAST
+  mallopt(-3, 32 * 1024); // M_MMAP_THRESHOLD
+#endif
+
+  INTERCEPT_FUNCTION(malloc);
+  INTERCEPT_FUNCTION(calloc);
+  INTERCEPT_FUNCTION(free);
+  INTERCEPT_FUNCTION(realloc);
+  INTERCEPT_FUNCTION(valloc);
+  INTERCEPT_FUNCTION(memalign);
+  INTERCEPT_FUNCTION(__libc_memalign);
+  INTERCEPT_FUNCTION(pvalloc);
+  INTERCEPT_FUNCTION(aligned_alloc);
+  INTERCEPT_FUNCTION(posix_memalign);
+
+  INTERCEPT_FUNCTION(memset);
+  INTERCEPT_FUNCTION(wmemset);
+  INTERCEPT_FUNCTION(memmove);
+  INTERCEPT_FUNCTION(wmemmove);
+  INTERCEPT_FUNCTION(memcpy);
+  INTERCEPT_FUNCTION(wmemcpy);
+
+  INTERCEPT_FUNCTION(strdup);
+  INTERCEPT_FUNCTION(wcsdup);
+  INTERCEPT_FUNCTION(strndup);
+  INTERCEPT_FUNCTION(stpcpy);
+  INTERCEPT_FUNCTION(wcpcpy);
+  INTERCEPT_FUNCTION(strcpy);
+  INTERCEPT_FUNCTION(wcscpy);
+  INTERCEPT_FUNCTION(strncpy);
+  INTERCEPT_FUNCTION(strcat);
+  INTERCEPT_FUNCTION(wcscat);
+  INTERCEPT_FUNCTION(strncat);
+  INTERCEPT_FUNCTION(strxfrm);
+
+  INTERCEPT_FUNCTION(strfry);
+  INTERCEPT_FUNCTION(strsep);
+  INTERCEPT_FUNCTION(strtok);
+
+  Initialized = true;
+}
+} // end namespace __nsan
diff --git a/compiler-rt/lib/nsan/nsan_platform.h b/compiler-rt/lib/nsan/nsan_platform.h
new file mode 100644
--- /dev/null
+++ b/compiler-rt/lib/nsan/nsan_platform.h
@@ -0,0 +1,144 @@
+//===------------------------ nsan_platform.h -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Platform specific information for NSan.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NSAN_PLATFORM_H
+#define NSAN_PLATFORM_H
+
+namespace __nsan {
+
+// NSan uses two regions of memory to store information:
+// - 'shadow memory' stores the shadow copies of numerical values stored in
+//   application memory.
+// - 'shadow types' is used to determine which value type each byte of memory
+//   belongs to. This makes sure that we always know whether a shadow value is
+//   valid. Shadow values may be tampered with via accesses through pointers of
+//   other types (type punning). Each byte stores:
+//   - bit 1-0: whether the corresponding value is of unknown (00),
+//     float (01), double (10), or long double (11) type.
+//   - bit 5-2: the index of this byte in the value, or 0000 if type is
+//     unknown.
+//   This allows handling unaligned float load/stores by checking that a load
+//   with a given alignment corresponds to the alignment of the store.
+//   Any store of a non-floating point type invalidates the corresponding
+//   bytes, so that subsequent overlapping loads (aligned or not) know that
+//   the corresponding shadow value is no longer valid.
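+//
+//   For example, a float stored at address A is encoded in the shadow types
+//   region as the four bytes 0b000001, 0b000101, 0b001001, 0b001101 at
+//   [A, A+4): type bits 01 (float) with byte indices 0 through 3 in bits 5-2.
+//   This is what __nsan_dump_shadow_mem prints as `f0 f1 f2 f3` in the tests
+//   further down in this patch.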
+
+// On Linux/x86_64, memory is laid out as follows:
+//
+// +--------------------+ 0x800000000000 (top of memory)
+// | application memory |
+// +--------------------+ 0x700000008000 (kAppAddr)
+// |                    |
+// |       unused       |
+// |                    |
+// +--------------------+ 0x400000000000 (kUnusedAddr)
+// |   shadow memory    |
+// +--------------------+ 0x200000000000 (kShadowAddr)
+// |    shadow types    |
+// +--------------------+ 0x100000000000 (kTypesAddr)
+// | reserved by kernel |
+// +--------------------+ 0x000000000000
+//
+//
+// To derive a shadow memory address from an application memory address,
+// bits 44-46 are cleared to bring the address into the range
+// [0x000000000000,0x100000000000). We scale to account for the fact that a
+// shadow value takes twice as much space as the original value.
+// Then we add kShadowAddr to put the shadow relative offset into the shadow
+// memory. See getShadowAddrFor().
+// The process is similar for the shadow types.
+
+// The ratio of app to shadow memory.
+enum {
+  kShadowScale = 2
+};
+
+// The original value type of a byte in app memory. Uses LLVM terminology:
+// https://llvm.org/docs/LangRef.html#floating-point-types
+// FIXME: support half and bfloat.
+enum ValueType {
+  kUnknownValueType = 0,
+  kFloatValueType = 1,  // LLVM float, shadow type double.
+  kDoubleValueType = 2, // LLVM double, shadow type fp128.
+  kFp80ValueType = 3,   // LLVM x86_fp80, shadow type fp128.
+};
+
+// The size of ValueType encoding, in bits.
+enum {
+  kValueSizeSizeBits = 2,
+};
+
+#if defined(__x86_64__)
+struct Mapping {
+  // FIXME: kAppAddr == 0x700000000000 ?
+  static const uptr kAppAddr = 0x700000008000;
+  static const uptr kUnusedAddr = 0x400000000000;
+  static const uptr kShadowAddr = 0x200000000000;
+  static const uptr kTypesAddr = 0x100000000000;
+  static const uptr kShadowMask = ~0x700000000000;
+};
+#else
+# error "NSan not supported for this platform!"
+#endif
+
+enum MappingType {
+  MAPPING_APP_ADDR,
+  MAPPING_UNUSED_ADDR,
+  MAPPING_SHADOW_ADDR,
+  MAPPING_TYPES_ADDR,
+  MAPPING_SHADOW_MASK
+};
+
+template <typename Mapping, int Type>
+uptr MappingImpl() {
+  switch (Type) {
+  case MAPPING_APP_ADDR: return Mapping::kAppAddr;
+  case MAPPING_UNUSED_ADDR: return Mapping::kUnusedAddr;
+  case MAPPING_SHADOW_ADDR: return Mapping::kShadowAddr;
+  case MAPPING_TYPES_ADDR: return Mapping::kTypesAddr;
+  case MAPPING_SHADOW_MASK: return Mapping::kShadowMask;
+  }
+}
+
+template <int Type>
+uptr MappingArchImpl() {
+  return MappingImpl<Mapping, Type>();
+}
+
+ALWAYS_INLINE
+uptr AppAddr() {
+  return MappingArchImpl<MAPPING_APP_ADDR>();
+}
+
+ALWAYS_INLINE
+uptr UnusedAddr() {
+  return MappingArchImpl<MAPPING_UNUSED_ADDR>();
+}
+
+ALWAYS_INLINE
+uptr ShadowAddr() {
+  return MappingArchImpl<MAPPING_SHADOW_ADDR>();
+}
+
+ALWAYS_INLINE
+uptr TypesAddr() {
+  return MappingArchImpl<MAPPING_TYPES_ADDR>();
+}
+
+ALWAYS_INLINE
+uptr ShadowMask() {
+  return MappingArchImpl<MAPPING_SHADOW_MASK>();
+}
+
+} // end namespace __nsan
+
+#endif
diff --git a/compiler-rt/lib/nsan/nsan_stats.h b/compiler-rt/lib/nsan/nsan_stats.h
new file mode 100644
--- /dev/null
+++ b/compiler-rt/lib/nsan/nsan_stats.h
@@ -0,0 +1,92 @@
+//===-- nsan_stats.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of NumericalStabilitySanitizer.
+//
+// NSan statistics.
+// This class counts the number of checks per code location, and is used to
+// output statistics (typically when using
+// `disable_warnings=1,enable_check_stats=1,enable_warning_stats=1`).
+//===----------------------------------------------------------------------===//
+
+#ifndef NSAN_STATS_H
+#define NSAN_STATS_H
+
+#include "sanitizer_common/sanitizer_addrhashmap.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+#include "sanitizer_common/sanitizer_mutex.h"
+
+namespace __nsan {
+
+enum class CheckTypeT {
+  kUnknown = 0,
+  kRet,
+  kArg,
+  kLoad,
+  kStore,
+  kInsert,
+  kUser, // User initiated.
+  kFcmp,
+  kMaxCheckType,
+};
+
+class Stats {
+public:
+  Stats();
+  ~Stats();
+
+  // Signal that we checked the instruction at the given address.
+  void addCheck(CheckTypeT CheckType, __sanitizer::uptr PC,
+                __sanitizer::uptr BP, double RelErr);
+  // Signal that we warned for the instruction at the given address.
+  void addWarning(CheckTypeT CheckType, __sanitizer::uptr PC,
+                  __sanitizer::uptr BP, double RelErr);
+
+  // Signal that we detected a floating-point load where the shadow type was
+  // invalid.
+  void addInvalidLoadTrackingEvent(__sanitizer::uptr PC, __sanitizer::uptr BP);
+  // Signal that we detected a floating-point load where the shadow type was
+  // unknown but the value was nonzero.
+  void addUnknownLoadTrackingEvent(__sanitizer::uptr PC, __sanitizer::uptr BP);
+
+  void print() const;
+
+private:
+  using IndexMap = __sanitizer::AddrHashMap<__sanitizer::uptr, 11>;
+
+  struct CheckAndWarningsValue {
+    CheckTypeT CheckTy;
+    __sanitizer::u32 StackId = 0;
+    __sanitizer::u64 NumChecks = 0;
+    __sanitizer::u64 NumWarnings = 0;
+    // This is a bitcasted double. Doubles have the nice property that the bit
+    // patterns of non-negative values are ordered like the values themselves,
+    // so they can be compared as ints.
+    double MaxRelativeError = 0;
+  };
+  // Maps key(CheckType, StackId) to indices in CheckAndWarnings.
+  IndexMap CheckAndWarningsMap;
+  __sanitizer::InternalMmapVectorNoCtor<CheckAndWarningsValue> CheckAndWarnings;
+  mutable __sanitizer::BlockingMutex CheckAndWarningsMutex;
+
+  struct LoadTrackingValue {
+    CheckTypeT CheckTy;
+    __sanitizer::u32 StackId = 0;
+    __sanitizer::u64 NumInvalid = 0;
+    __sanitizer::u64 NumUnknown = 0;
+  };
+  // Maps key(CheckTypeT::kLoad, StackId) to indices in TrackedLoads.
+  IndexMap LoadTrackingMap;
+  __sanitizer::InternalMmapVectorNoCtor<LoadTrackingValue> TrackedLoads;
+  mutable __sanitizer::BlockingMutex TrackedLoadsMutex;
+};
+
+extern Stats *nsan_stats;
+void initializeStats();
+
+} // namespace __nsan
+
+#endif // NSAN_STATS_H
diff --git a/compiler-rt/lib/nsan/nsan_stats.cc b/compiler-rt/lib/nsan/nsan_stats.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/lib/nsan/nsan_stats.cc
@@ -0,0 +1,161 @@
+//===-- nsan_stats.cc -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of NumericalStabilitySanitizer.
+//
+// NumericalStabilitySanitizer statistics.
+//===----------------------------------------------------------------------===//
+
+#include "nsan/nsan_stats.h"
+
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_stackdepot.h"
+#include "sanitizer_common/sanitizer_stacktrace.h"
+#include "sanitizer_common/sanitizer_placement_new.h"
+#include "sanitizer_common/sanitizer_symbolizer.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+namespace __nsan {
+
+using namespace __sanitizer;
+
+Stats::Stats() {
+  CheckAndWarnings.Initialize(0);
+  TrackedLoads.Initialize(0);
+}
+
+Stats::~Stats() { Printf("deleting nsan stats\n"); }
+
+static uptr key(CheckTypeT CheckType, u32 StackId) {
+  return static_cast<uptr>(CheckType) +
+         StackId * static_cast<uptr>(CheckTypeT::kMaxCheckType);
+}
+
+template <typename MapT, typename VectorT, typename Fn>
+void UpdateEntry(CheckTypeT CheckTy, uptr PC, uptr BP, MapT *Map,
+                 VectorT *Vector, BlockingMutex *Mutex, Fn F) {
+  BufferedStackTrace Stack;
+  Stack.Unwind(PC, BP, nullptr, false);
+  u32 StackId = StackDepotPut(Stack);
+  typename MapT::Handle Handle(Map, key(CheckTy, StackId));
+  BlockingMutexLock Lock(Mutex);
+  if (Handle.created()) {
+    typename VectorT::value_type Entry;
+    Entry.StackId = StackId;
+    Entry.CheckTy = CheckTy;
+    F(Entry);
+    // Remember the index of the new entry so that later lookups through the
+    // map find it.
+    *Handle = Vector->size();
+    Vector->push_back(Entry);
+  } else {
+    auto &Entry = (*Vector)[*Handle];
+    F(Entry);
+  }
+}
+
+void Stats::addCheck(CheckTypeT CheckTy, uptr PC, uptr BP, double RelErr) {
+  UpdateEntry(CheckTy, PC, BP, &CheckAndWarningsMap, &CheckAndWarnings,
+              &CheckAndWarningsMutex, [RelErr](CheckAndWarningsValue &Entry) {
+                ++Entry.NumChecks;
+                if (RelErr > Entry.MaxRelativeError) {
+                  Entry.MaxRelativeError = RelErr;
+                }
+              });
+}
+
+void Stats::addWarning(CheckTypeT CheckTy, uptr PC, uptr BP, double RelErr) {
+  UpdateEntry(CheckTy, PC, BP, &CheckAndWarningsMap, &CheckAndWarnings,
+              &CheckAndWarningsMutex, [RelErr](CheckAndWarningsValue &Entry) {
+                ++Entry.NumWarnings;
+                if (RelErr > Entry.MaxRelativeError) {
+                  Entry.MaxRelativeError = RelErr;
+                }
+              });
+}
+
+void Stats::addInvalidLoadTrackingEvent(uptr PC, uptr BP) {
+  UpdateEntry(CheckTypeT::kLoad, PC, BP, &LoadTrackingMap, &TrackedLoads,
+              &TrackedLoadsMutex,
+              [](LoadTrackingValue &Entry) { ++Entry.NumInvalid; });
+}
+
+void Stats::addUnknownLoadTrackingEvent(uptr PC, uptr BP) {
+  UpdateEntry(CheckTypeT::kLoad, PC, BP, &LoadTrackingMap, &TrackedLoads,
+              &TrackedLoadsMutex,
+              [](LoadTrackingValue &Entry) { ++Entry.NumUnknown; });
+}
+
+static const char *CheckTypeDisplay(CheckTypeT CheckType) {
+  switch (CheckType) {
+  case CheckTypeT::kUnknown:
+    return "unknown";
+  case CheckTypeT::kRet:
+    return "return";
+  case CheckTypeT::kArg:
+    return "argument";
+  case CheckTypeT::kLoad:
+    return "load";
+  case CheckTypeT::kStore:
+    return "store";
+  case CheckTypeT::kInsert:
+    return "vector insert";
+  case CheckTypeT::kUser:
+    return "user-initiated";
+  case CheckTypeT::kFcmp:
+    return "fcmp";
+  case CheckTypeT::kMaxCheckType:
+    return "[max]";
+  }
+  assert(false && "unknown CheckType case");
+  return "";
+}
+
+void Stats::print() const {
+  {
+    BlockingMutexLock Lock(&CheckAndWarningsMutex);
+    for (const auto &Entry : CheckAndWarnings) {
+      Printf("warned %llu times out of %llu %s checks ", Entry.NumWarnings,
+             Entry.NumChecks, CheckTypeDisplay(Entry.CheckTy));
+      if (Entry.NumWarnings > 0) {
+        char RelErrBuf[64];
+        snprintf(RelErrBuf, sizeof(RelErrBuf) - 1, "%f",
+                 Entry.MaxRelativeError * 100.0);
+        Printf("(max relative error: %s%%) ", RelErrBuf);
+      }
+      Printf("at:\n");
+      StackDepotGet(Entry.StackId).Print();
+    }
+  }
+
+  {
+    BlockingMutexLock Lock(&TrackedLoadsMutex);
+    u64 TotalInvalidLoadTracking = 0;
+    u64 TotalUnknownLoadTracking = 0;
+    for (const auto &Entry : TrackedLoads) {
+      TotalInvalidLoadTracking += Entry.NumInvalid;
+      TotalUnknownLoadTracking += Entry.NumUnknown;
+      Printf("invalid/unknown type for %llu/%llu loads at:\n",
+             Entry.NumInvalid, Entry.NumUnknown);
+      StackDepotGet(Entry.StackId).Print();
+    }
+    Printf(
+        "There were %llu/%llu floating-point loads where the shadow type was "
+        "invalid/unknown.\n",
+        TotalInvalidLoadTracking, TotalUnknownLoadTracking);
+  }
+}
+
+ALIGNED(64) static char StatsPlaceholder[sizeof(Stats)];
+Stats *nsan_stats = nullptr;
+
+void initializeStats() { nsan_stats = new (StatsPlaceholder) Stats(); }
+
+} // namespace __nsan
diff --git a/compiler-rt/lib/nsan/nsan_suppressions.h b/compiler-rt/lib/nsan/nsan_suppressions.h
new file mode 100644
--- /dev/null
+++ b/compiler-rt/lib/nsan/nsan_suppressions.h
@@ -0,0 +1,31 @@
+//===-- nsan_suppressions.h -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines nsan suppression rules.
+//===----------------------------------------------------------------------===//
+
+#ifndef NSAN_SUPPRESSIONS_H
+#define NSAN_SUPPRESSIONS_H
+
+#include "sanitizer_common/sanitizer_suppressions.h"
+
+namespace __nsan {
+
+extern const char *const kSuppressionNone;
+extern const char *const kSuppressionFcmp;
+extern const char *const kSuppressionConsistency;
+
+void InitializeSuppressions();
+
+__sanitizer::Suppression *
+GetSuppressionForStack(const __sanitizer::StackTrace *Stack,
+                       const char *SupprType);
+
+} // namespace __nsan
+
+#endif
diff --git a/compiler-rt/lib/nsan/nsan_suppressions.cc b/compiler-rt/lib/nsan/nsan_suppressions.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/lib/nsan/nsan_suppressions.cc
@@ -0,0 +1,76 @@
+//===-- nsan_suppressions.cc ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "nsan_suppressions.h"
+
+#include "sanitizer_common/sanitizer_placement_new.h"
+#include "sanitizer_common/sanitizer_stacktrace.h"
+#include "sanitizer_common/sanitizer_symbolizer.h"
+
+#include "nsan_flags.h"
+
+// Can be overridden in the frontend.
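+// A suppressions file uses the common sanitizer format, one `type:pattern`
+// rule per line, matched against the module, function, and file names of the
+// warning's stack frames. For example (hypothetical names):
+//   fcmp:LegacyCompare*
+//   consistency:third_party/unstable_lib/*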
+SANITIZER_WEAK_DEFAULT_IMPL +const char *__nsan_default_suppressions() { return 0; } + +namespace __nsan { + +const char* const kSuppressionNone = "none"; +const char* const kSuppressionFcmp = "fcmp"; +const char* const kSuppressionConsistency = "consistency"; + +using namespace __sanitizer; + +ALIGNED(64) static char SuppressionPlaceholder[sizeof(SuppressionContext)]; +static SuppressionContext *SuppressionCtx = nullptr; +static const char *kSuppressionTypes[] = {kSuppressionFcmp, + kSuppressionConsistency}; + +void InitializeSuppressions() { + CHECK_EQ(nullptr, SuppressionCtx); + SuppressionCtx = new (SuppressionPlaceholder) + SuppressionContext(kSuppressionTypes, ARRAY_SIZE(kSuppressionTypes)); + SuppressionCtx->ParseFromFile(flags().suppressions); + SuppressionCtx->Parse(__nsan_default_suppressions()); +} + +static Suppression *GetSuppressionForAddr(uptr Addr, const char *SupprType) { + Suppression *S = nullptr; + + // Suppress by module name. + SuppressionContext *Suppressions = SuppressionCtx; + if (const char *ModuleName = + Symbolizer::GetOrInit()->GetModuleNameForPc(Addr)) { + if (Suppressions->Match(ModuleName, SupprType, &S)) + return S; + } + + // Suppress by file or function name. + SymbolizedStack *Frames = Symbolizer::GetOrInit()->SymbolizePC(Addr); + for (SymbolizedStack *Cur = Frames; Cur; Cur = Cur->next) { + if (Suppressions->Match(Cur->info.function, SupprType, &S) || + Suppressions->Match(Cur->info.file, SupprType, &S)) { + break; + } + } + Frames->ClearAll(); + return S; +} + +Suppression *GetSuppressionForStack(const StackTrace *Stack, + const char *SupprType) { + for (uptr I = 0, E = Stack->size; I < E; I++) { + Suppression *S = GetSuppressionForAddr( + StackTrace::GetPreviousInstructionPc(Stack->trace[I]), SupprType); + if (S) + return S; + } + return nullptr; +} + +} // end namespace __nsan diff --git a/compiler-rt/lib/nsan/tests/CMakeLists.txt b/compiler-rt/lib/nsan/tests/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/compiler-rt/lib/nsan/tests/CMakeLists.txt @@ -0,0 +1,50 @@ +include(CompilerRTCompile) + +set(NSAN_UNITTEST_CFLAGS + ${COMPILER_RT_UNITTEST_CFLAGS} + ${COMPILER_RT_GTEST_CFLAGS} + -I${COMPILER_RT_SOURCE_DIR}/lib/ + -O2 + -g + -fno-omit-frame-pointer) + +file(GLOB NSAN_HEADERS ../*.h) +set(NSAN_UNITTESTS + NSanUnitTest.cpp) + +add_custom_target(NsanUnitTests) +set_target_properties(NsanUnitTests PROPERTIES FOLDER "Compiler-RT Tests") + +# set(NSAN_UNITTEST_LINK_FLAGS ${COMPILER_RT_UNITTEST_LINK_FLAGS} -ldl) +# list(APPEND NSAN_UNITTEST_LINK_FLAGS --driver-mode=g++) + +if(COMPILER_RT_DEFAULT_TARGET_ARCH IN_LIST NSAN_SUPPORTED_ARCH) + # NSan unit tests are only run on the host machine. 
+  set(arch ${COMPILER_RT_DEFAULT_TARGET_ARCH})
+
+  set(NSAN_TEST_RUNTIME RTNsanTest.${arch})
+
+  set(NSAN_TEST_RUNTIME_OBJECTS
+    $<TARGET_OBJECTS:RTNsan.${arch}>
+    $<TARGET_OBJECTS:RTInterception.${arch}>
+    $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
+    $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>)
+
+  add_library(${NSAN_TEST_RUNTIME} STATIC
+    ${NSAN_TEST_RUNTIME_OBJECTS})
+
+  set_target_properties(${NSAN_TEST_RUNTIME} PROPERTIES
+    ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    FOLDER "Compiler-RT Runtime tests")
+
+  set(NsanTestObjects)
+  generate_compiler_rt_tests(NsanTestObjects
+    NsanUnitTests "Nsan-${arch}-Test" ${arch}
+    SOURCES ${NSAN_UNITTESTS} ${COMPILER_RT_GTEST_SOURCE}
+    RUNTIME ${NSAN_TEST_RUNTIME}
+    DEPS gtest ${NSAN_HEADERS}
+    CFLAGS ${NSAN_UNITTEST_CFLAGS}
+    LINK_FLAGS ${NSAN_UNITTEST_LINK_FLAGS})
+  set_target_properties(NsanUnitTests PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif()
diff --git a/compiler-rt/lib/nsan/tests/NSanUnitTest.cpp b/compiler-rt/lib/nsan/tests/NSanUnitTest.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/lib/nsan/tests/NSanUnitTest.cpp
@@ -0,0 +1,67 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Do not attempt to use LLVM ostream etc from gtest.
+#define GTEST_NO_LLVM_SUPPORT 1
+
+#include "nsan.h"
+#include "gtest/gtest.h"
+
+#include <math.h>
+
+namespace __nsan {
+
+template <typename FT, FT (*next)(FT, FT)> void TestFT() {
+  // Basic local tests anchored at 0.0.
+  ASSERT_EQ(getULPDiff<FT>(0.0, 0.0), 0);
+  ASSERT_EQ(getULPDiff<FT>(-0.0, 0.0), 0);
+  ASSERT_EQ(getULPDiff<FT>(next(-0.0, -1.0), 0.0), 1);
+  ASSERT_EQ(getULPDiff<FT>(next(0.0, 1.0), -0.0), 1);
+  ASSERT_EQ(getULPDiff<FT>(next(-0.0, -1.0), next(0.0, 1.0)), 2);
+  // Basic local tests anchored at 2.0.
+  ASSERT_EQ(getULPDiff<FT>(next(2.0, 1.0), 2.0), 1);
+  ASSERT_EQ(getULPDiff<FT>(next(2.0, 3.0), 2.0), 1);
+  ASSERT_EQ(getULPDiff<FT>(next(2.0, 1.0), next(2.0, 3.0)), 2);
+
+  ASSERT_NE(getULPDiff<FT>(-0.01, 0.01), kMaxULPDiff);
+
+  // Basic local tests anchored at a random number.
+  const FT X = 4863.5123;
+  const FT To = 2 * X;
+  FT Y = X;
+  ASSERT_EQ(getULPDiff<FT>(X, Y), 0);
+  ASSERT_EQ(getULPDiff<FT>(-X, -Y), 0);
+  Y = next(Y, To);
+  ASSERT_EQ(getULPDiff<FT>(X, Y), 1);
+  ASSERT_EQ(getULPDiff<FT>(-X, -Y), 1);
+  Y = next(Y, To);
+  ASSERT_EQ(getULPDiff<FT>(X, Y), 2);
+  ASSERT_EQ(getULPDiff<FT>(-X, -Y), 2);
+  Y = next(Y, To);
+  ASSERT_EQ(getULPDiff<FT>(X, Y), 3);
+  ASSERT_EQ(getULPDiff<FT>(-X, -Y), 3);
+
+  // Values with larger differences.
+  static constexpr const __sanitizer::u64 MantissaSize =
+      __sanitizer::u64{1} << FTInfo<FT>::kMantissaBits;
+  ASSERT_EQ(getULPDiff<FT>(1.0, next(2.0, 1.0)), MantissaSize - 1);
+  ASSERT_EQ(getULPDiff<FT>(1.0, 2.0), MantissaSize);
+  ASSERT_EQ(getULPDiff<FT>(1.0, next(2.0, 3.0)), MantissaSize + 1);
+  ASSERT_EQ(getULPDiff<FT>(1.0, 3.0), (3 * MantissaSize) / 2);
+}
+
+TEST(NSanTest, Float) { TestFT<float, nextafterf>(); }
+
+TEST(NSanTest, Double) {
+  TestFT<double, static_cast<double (*)(double, double)>(nextafter)>();
+}
+
+TEST(NSanTest, Float128) {
+  // Very basic tests. FIXME: improve when we have nextafter<__float128>.
+  ASSERT_EQ(getULPDiff<__float128>(0.0, 0.0), 0);
+  ASSERT_EQ(getULPDiff<__float128>(-0.0, 0.0), 0);
+  ASSERT_NE(getULPDiff<__float128>(-0.01, 0.01), kMaxULPDiff);
+}
+
+} // end namespace __nsan
diff --git a/compiler-rt/test/nsan/CMakeLists.txt b/compiler-rt/test/nsan/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/CMakeLists.txt
@@ -0,0 +1,32 @@
+set(NSAN_LIT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set(NSAN_LIT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+
+set(NSAN_TESTSUITES)
+
+set(NSAN_UNITTEST_DEPS)
+set(NSAN_TEST_DEPS
+  ${SANITIZER_COMMON_LIT_TEST_DEPS}
+  nsan)
+
+configure_lit_site_cfg(
+  ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
+  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
+  )
+
+foreach(arch ${NSAN_SUPPORTED_ARCH})
+  set(NSAN_TEST_TARGET_ARCH ${arch})
+  string(TOLOWER "-${arch}" NSAN_TEST_CONFIG_SUFFIX)
+  get_test_cc_for_arch(${arch} NSAN_TEST_TARGET_CC NSAN_TEST_TARGET_CFLAGS)
+  string(TOUPPER ${arch} ARCH_UPPER_CASE)
+  set(CONFIG_NAME ${ARCH_UPPER_CASE}${OS_NAME}Config)
+
+  configure_lit_site_cfg(
+    ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
+    ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/lit.site.cfg.py)
+  list(APPEND NSAN_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME})
+endforeach()
+
+add_lit_testsuite(check-nsan "Running the NSan tests"
+  ${NSAN_TESTSUITES}
+  DEPENDS ${NSAN_TEST_DEPS})
+set_target_properties(check-nsan PROPERTIES FOLDER "Compiler-RT Misc")
diff --git a/compiler-rt/test/nsan/alloca.cc b/compiler-rt/test/nsan/alloca.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/alloca.cc
@@ -0,0 +1,22 @@
+// RUN: %clangxx_nsan -O2 -g %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include <stddef.h>
+
+#include "helpers.h"
+
+extern "C" void __nsan_dump_shadow_mem(const char *addr, size_t size_bytes,
+                                       size_t bytes_per_line, size_t reserved);
+
+int main() {
+  int size = 3 * sizeof(float);
+  // Make sure we allocate dynamically: https://godbolt.org/z/T3h998.
+  DoNotOptimize(size);
+  float *array = reinterpret_cast<float *>(__builtin_alloca(size));
+  DoNotOptimize(array);
+  array[0] = 1.0;
+  array[1] = 2.0;
+  // The third float is uninitialized.
+  __nsan_dump_shadow_mem((const char *)array, 3 * sizeof(float), 16, 0);
+  // CHECK: {{.*}} f0 f1 f2 f3 f0 f1 f2 f3 __ __ __ __ (1.00000000000000000000) (2.00000000000000000000)
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/cadna_ex1.cc b/compiler-rt/test/nsan/cadna_ex1.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/cadna_ex1.cc
@@ -0,0 +1,20 @@
+// RUN: %clangxx_nsan -O0 -g %s -o %t && NSAN_OPTIONS=halt_on_error=0 %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// http://cadna.lip6.fr/Examples_Dir/ex1.php
+// This checks that nsan can detect basic cancellations.
+
+#include <stdio.h>
+
+// Adapted from Fortran: http://cadna.lip6.fr/Examples_Dir/source/ex1.f
+__attribute__((noinline)) void Ex1(double x, double y) {
+  printf("P(%f,%f) = %f\n", x, y, 9.0*x*x*x*x - y*y*y*y + 2.0*y*y);
+  // CHECK: #0 {{.*}} in Ex1{{.*}}[[@LINE-1]]
+}
+
+int main() {
+  Ex1(10864.0, 18817.0);
+  // CHECK: #1 {{.*}} in main{{.*}}[[@LINE-1]]
+  Ex1(1.0 / 3, 2.0 / 3);
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/cadna_ex2.cc b/compiler-rt/test/nsan/cadna_ex2.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/cadna_ex2.cc
@@ -0,0 +1,52 @@
+// RUN: %clangxx_nsan -O0 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t
+
+// http://cadna.lip6.fr/Examples_Dir/ex2.php
+// This is an example where nsan fails to detect an issue.
+// Doing the computations in quad instead of double precision does not help
+// in detecting that the computation of the discriminant is unstable: both
+// double and quad precision find it to be positive.
+
+#include <math.h>
+#include <stdio.h>
+
+extern "C" void __nsan_dump_double(double value);
+
+// Adapted from Fortran: http://cadna.lip6.fr/Examples_Dir/source/ex2.f
+__attribute__((noinline)) void Solve(double a, double b, double c) {
+  if (a == 0) {
+    if (b == 0) {
+      if (c == 0) {
+        printf("Every complex value is solution.\n");
+      } else {
+        printf("There is no solution.\n");
+      }
+    } else {
+      double x1 = -c / b;
+      printf("The equation is degenerated. There is one real solution: %f\n",
+             x1);
+    }
+  } else {
+    b = b / a;
+    c = c / a;
+    double d = b * b - 4.0 * c;
+    __nsan_dump_double(d); // Print the discriminant shadow value.
+    if (d == 0.0) {
+      double x1 = -b * 0.5;
+      printf("Discriminant is zero. The double solution is %f\n", x1);
+    } else if (d > 0) {
+      double x1 = (-b - sqrt(d)) * 0.5;
+      double x2 = (-b + sqrt(d)) * 0.5;
+      printf("There are two real solutions. x1 = %f x2 = %f\n", x1, x2);
+    } else {
+      double x1 = -b * 0.5;
+      double x2 = sqrt(-d) * 0.5;
+      printf("There are two complex solutions. z1 = %f %f z2 = %f %f\n", x1, x2,
+             x1, -x2);
+    }
+  }
+}
+
+int main() {
+  Solve(0.3, -2.1, 3.675);
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/cadna_ex3.cc b/compiler-rt/test/nsan/cadna_ex3.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/cadna_ex3.cc
@@ -0,0 +1,50 @@
+// RUN: %clangxx_nsan -O0 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t
+
+// http://cadna.lip6.fr/Examples_Dir/ex3.php
+// The determinant of Hilbert's matrix (11x11) without pivoting strategy is
+// computed. After triangularization, the determinant is the product of the
+// diagonal elements.
+// Although the algorithm suffers from loss of precision, it is stable, and
+// nsan does not warn.
+
+#include <stdio.h>
+
+// Adapted from Fortran: http://cadna.lip6.fr/Examples_Dir/source/ex3.f
+int main() {
+  constexpr const int kN = 11;
+  double amat[kN][kN];
+  for (int i = 0; i < kN; ++i) {
+    for (int j = 0; j < kN; ++j) {
+      // Hilbert's matrix is defined by: a(i,j) = 1/(i+j+1),
+      // where i and j are zero-based.
+      amat[i][j] = 1.0 / (i + j + 1);
+      printf("%.3f, ", amat[i][j]);
+    }
+    printf("\n");
+  }
+  printf("\n");
+
+  double det = 1.0;
+  for (int i = 0; i < kN - 1; ++i) {
+    printf("Pivot number %2i = %f\n", i, amat[i][i]);
+    det = det * amat[i][i];
+    const double aux = 1.0 / amat[i][i];
+    for (int j = i + 1; j < kN; ++j) {
+      amat[i][j] = amat[i][j] * aux;
+    }
+
+    for (int j = i + 1; j < kN; ++j) {
+      const double aux = amat[j][i];
+      for (int k = i + 1; k < kN; ++k) {
+        amat[j][k] = amat[j][k] - aux * amat[i][k];
+      }
+    }
+  }
+
+  constexpr const int kLastElem = kN - 1;
+  const double last_pivot = amat[kLastElem][kLastElem];
+  printf("Pivot number %2i = %f\n", kLastElem, last_pivot);
+  det = det * last_pivot;
+  printf("Determinant = %.12g\n", det);
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/cadna_ex4.cc b/compiler-rt/test/nsan/cadna_ex4.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/cadna_ex4.cc
@@ -0,0 +1,37 @@
+// RUN: %clangxx_nsan -O0 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// http://cadna.lip6.fr/Examples_Dir/ex4.php
+// This example was proposed by J.-M. Muller [1]. The first 25 iterations of
+// the following recurrent sequence are computed:
+//   U(n+1) = 111 - 1130/U(n) + 3000/(U(n)*U(n-1))
+// with U(0) = 5.5 and U(1) = 61/11.
+// The exact value for the limit is 6.
+// [1] J.-M. Muller, "Arithmetique des ordinateurs", Ed. Masson, 1987.
+//
+// This checks that nsan correctly detects the instability.
+
+#include <stdio.h>
+
+// Adapted from Fortran: http://cadna.lip6.fr/Examples_Dir/source/ex4.f
+__attribute__((noinline)) // Prevent constant folding.
+void
+Ex4(double u_n_minus_1, double u_n, const int end_iter) {
+  for (int i = 3; i < end_iter; ++i) {
+    const double u_n_plus_1 =
+        111.0 - 1130.0 / u_n + 3000.0 / (u_n * u_n_minus_1);
+    u_n_minus_1 = u_n;
+    u_n = u_n_plus_1;
+    printf("U(%i) = %f\n", i, u_n);
+// CHECK: #0{{.*}}in Ex4{{.*}}cadna_ex4.cc:[[@LINE-1]]
+  }
+}
+
+int main() {
+  constexpr const double kU1 = 5.5;
+  constexpr const double kU2 = 61.0 / 11.0;
+  constexpr const int kEndIter = 25;
+  Ex4(kU1, kU2, kEndIter);
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/cadna_ex5.cc b/compiler-rt/test/nsan/cadna_ex5.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/cadna_ex5.cc
@@ -0,0 +1,95 @@
+// RUN: %clangxx_nsan -O0 -DFN=Unstable -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s --check-prefix=UNSTABLE < %t.out
+
+// RUN: %clangxx_nsan -O2 -DFN=Unstable -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s --check-prefix=UNSTABLE < %t.out
+
+// RUN: %clangxx_nsan -O0 -DFN=StableRel -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t
+
+// RUN: %clangxx_nsan -O2 -DFN=StableRel -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t
+
+// RUN: %clangxx_nsan -O0 -DFN=StableEq -mllvm -nsan-truncate-fcmp-eq=true -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t
+
+// RUN: %clangxx_nsan -O2 -DFN=StableEq -mllvm -nsan-truncate-fcmp-eq=true -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t
+
+// RUN: %clangxx_nsan -O0 -DFN=StableEq -mllvm -nsan-truncate-fcmp-eq=false -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s --check-prefix=STABLEEQ-NOTRUNCATE < %t.out
+
+// RUN: %clangxx_nsan -O2 -DFN=StableEq -mllvm -nsan-truncate-fcmp-eq=false -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s --check-prefix=STABLEEQ-NOTRUNCATE < %t.out
+
+// http://cadna.lip6.fr/Examples_Dir/ex5.php
+// This program computes a root of the polynomial
+//   f(x) = 1.47*x**3 + 1.19*x**2 - 1.83*x + 0.45
+// using Newton's method.
+// The sequence is initialized by x = 0.5.
+// The iterative algorithm `x(n+1) = x(n) - f(x(n))/f'(x(n))` is stopped by the
+// criterion |x(n)-x(n-1)| <= 1.0e-12.
+//
+// The first algorithm is inherently unstable; this checks that nsan detects
+// the issue with the unstable code and does not trigger on the stabilized
+// version.
+
+#include <math.h>
+#include <stdio.h>
+
+constexpr const double kEpsilon = 1e-12;
+constexpr const int kNMax = 100;
+
+// The unstable version.
+// Adapted from Fortran: http://cadna.lip6.fr/Examples_Dir/source/ex5.f
+__attribute__((noinline)) // Prevent constant folding.
+void Unstable(double y) {
+  double x;
+  int i;
+  for (i = 1; i < kNMax; ++i) {
+    x = y;
+    y = x - (1.47 * x * x * x + 1.19 * x * x - 1.83 * x + 0.45) /
+            (4.41 * x * x + 2.38 * x - 1.83);
+    if (fabs(x - y) < kEpsilon) break;
+// UNSTABLE: #0{{.*}}in Unstable{{.*}}cadna_ex5.cc:[[@LINE-1]]
+  }
+
+  printf("x(%i) = %g\n", i - 1, x);
+  printf("x(%i) = %g\n", i, y);
+}
+
+// The stabilized version, where the termination criterion is an equality
+// comparison. The equality is considered unstable or not by nsan depending on
+// the value of --nsan-truncate-fcmp-eq.
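+// (Presumably, with -nsan-truncate-fcmp-eq=true the shadow operands of an
+// equality comparison are first rounded back to the application type before
+// being compared, so `x == y` answers consistently in both domains; with
+// =false the comparison runs at full shadow precision and may diverge, which
+// is what the STABLEEQ-NOTRUNCATE RUN lines above exercise.)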
+// Adapted from Fortran: http://cadna.lip6.fr/Examples_Dir/source/ex5_cad_opt.f
+__attribute__((noinline)) // Prevent constant folding.
+void StableEq(double y) {
+  double x;
+  int i;
+  for (i = 1; i < kNMax; ++i) {
+    x = y;
+    y = ((4.2*x + 3.5)*x + 1.5)/(6.3*x + 6.1);
+    if (x == y) break;
+// STABLEEQ-NOTRUNCATE: #0{{.*}}in StableEq{{.*}}cadna_ex5.cc:[[@LINE-1]]
+  }
+
+  printf("x(%i) = %g\n", i - 1, x);
+  printf("x(%i) = %g\n", i, y);
+}
+
+// The stabilized version, where the termination criterion is a relative
+// comparison. This is a more stable fix of `Unstable`.
+__attribute__((noinline)) // Prevent constant folding.
+void StableRel(double y) {
+  double x;
+  int i;
+  for (i = 1; i < kNMax; ++i) {
+    x = y;
+    y = ((4.2*x + 3.5)*x + 1.5)/(6.3*x + 6.1);
+    if (fabs(x - y) < kEpsilon) break;
+  }
+
+  printf("x(%i) = %g\n", i - 1, x);
+  printf("x(%i) = %g\n", i, y);
+}
+
+int main() {
+  FN(0.5);
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/cadna_ex6.cc b/compiler-rt/test/nsan/cadna_ex6.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/cadna_ex6.cc
@@ -0,0 +1,67 @@
+// RUN: %clangxx_nsan -O0 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t
+
+// RUN: %clangxx_nsan -O2 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t
+
+// http://cadna.lip6.fr/Examples_Dir/ex6.php
+// The following linear system is solved with the Gaussian elimination method
+// with partial pivoting.
+//
+// This test checks that nsan detects the instability.
+
+#include <algorithm>
+#include <math.h>
+#include <stdio.h>
+
+// Adapted from Fortran: http://cadna.lip6.fr/Examples_Dir/source/ex6.f
+int main() {
+  constexpr const int kDim = 4;
+  constexpr const int kDim1 = 5;
+
+  float xsol[kDim] = {1.0, 1.0, 1.e-8, 1.0};
+  float a[kDim][kDim1] = {
+      {21.0, 130.0, 0.0, 2.1, 153.1},
+      {13.0, 80.0, 4.74e+8, 752.0, 849.74},
+      {0.0, -0.4, 3.9816e+8, 4.2, 7.7816},
+      {0.0, 0.0, 1.7, 9.0e-9, 2.6e-8},
+  };
+
+  for (int i = 0; i < kDim - 1; ++i) {
+    float pmax = 0.0;
+    int ll = i;
+    for (int j = i; j < kDim; ++j) {
+      const float a_j_i = a[j][i];
+      if (fabsf(a_j_i) > pmax) {
+        pmax = fabsf(a_j_i);
+        ll = j;
+      }
+    }
+
+    if (ll != i) {
+      for (int j = i; j < kDim1; ++j) {
+        std::swap(a[i][j], a[ll][j]);
+      }
+    }
+
+    const float a_i_i = a[i][i];
+    for (int j = i + 1; j < kDim1; ++j) {
+      a[i][j] = a[i][j] / a_i_i;
+    }
+
+    for (int k = i + 1; k < kDim; ++k) {
+      const float a_k_i = a[k][i];
+      for (int j = i + 1; j < kDim1; ++j) {
+        a[k][j] = a[k][j] - a_k_i * a[i][j];
+      }
+    }
+  }
+
+  a[kDim - 1][kDim1 - 1] = a[kDim - 1][kDim1 - 1] / a[kDim - 1][kDim - 1];
+  for (int i = kDim - 2; i >= 0; --i) {
+    for (int j = i + 1; j < kDim; ++j) {
+      a[i][kDim1 - 1] = a[i][kDim1 - 1] - a[i][j] * a[j][kDim1 - 1];
+    }
+  }
+  for (int i = 0; i < kDim; ++i) {
+    printf("x_sol[%i] = %g (true value : %g)\n", i, a[i][kDim1 - 1], xsol[i]);
+  }
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/cadna_ex7.cc b/compiler-rt/test/nsan/cadna_ex7.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/cadna_ex7.cc
@@ -0,0 +1,110 @@
+// RUN: %clangxx_nsan -O0 -g %s -o %t && NSAN_OPTIONS=halt_on_error=0,log2_max_relative_error=0 %run %t >%t.out 2>&1
+// RUN: FileCheck --check-prefix=STOP %s < %t.out
+
+// RUN: %clangxx_nsan -O2 -g %s -o %t && NSAN_OPTIONS=halt_on_error=0,log2_max_relative_error=0 %run %t >%t.out 2>&1
+// RUN: FileCheck --check-prefix=STOP %s < %t.out
+
+// RUN: %clangxx_nsan -O0 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck --check-prefix=REL %s < %t.out
+
+// RUN: %clangxx_nsan -O2 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck --check-prefix=REL %s < %t.out
+
+// http://cadna.lip6.fr/Examples_Dir/ex7.php
+// This program solves a linear system of order 20 by using Jacobi's method.
+// The stopping criterion is
+//   || X(n+1) - X(n) || <= eps
+// where ||X|| is the maximum norm and eps=0.0001.
+//
+// This tests that nsan catches two types of errors:
+//  - The first one is that the stopping criterion is not stable w.r.t. the
+//    precision (STOP). To show this we disable relative error
+//    checking and only let the fcmp checker detect the unstable branching.
+//  - The second one is that the computations are unstable anyway from the
+//    first iteration (REL).
+
+#include <math.h>
+#include <stdio.h>
+
+// Adapted from Fortran: http://cadna.lip6.fr/Examples_Dir/source/ex7.f
+
+float random1() {
+  static int nrand = 23;
+  nrand = (nrand * 5363 + 143) % 1387;
+  return 2.0 * nrand / 1387.0 - 1.0;
+}
+
+int main() {
+  constexpr const float kEpsilon = 1e-4;
+  constexpr const int kNDims = 20;
+  constexpr const int kNIters = 1000;
+
+  float a[kNDims][kNDims];
+  float b[kNDims];
+  float x[kNDims];
+  float y[kNDims];
+  const float xsol[kNDims] = {
+      1.7,    -4746.89, 50.23, -245.32,  4778.29,  -75.73, 3495.43,
+      4.35,   452.98,   -2.76, 8239.24,  3.46,     1000.0, -5.0,
+      3642.4, 735.36,   1.7,   -2349.17, -8247.52, 9843.57,
+  };
+
+  for (int i = 0; i < kNDims; ++i) {
+    for (int j = 0; j < kNDims; ++j) {
+      a[i][j] = random1();
+    }
+    a[i][i] = a[i][i] + 4.9213648f;
+  }
+
+  for (int i = 0; i < kNDims; ++i) {
+    float aux = 0.0f;
+    for (int j = 0; j < kNDims; ++j) {
+      aux = aux + a[i][j]*xsol[j];
+    }
+    b[i] = aux;
+    y[i] = 10.0f;
+  }
+
+  int iter = 0;
+  for (iter = 0; iter < kNIters; ++iter) {
+    float anorm = 0.0f;
+    for (int j = 0; j < kNDims; ++j) {
+      x[j] = y[j];
+    }
+    for (int j = 0; j < kNDims; ++j) {
+      float aux = b[j];
+      for (int k = 0; k < kNDims; ++k) {
+        if (k != j) {
+          aux = aux - a[j][k]*x[k];
+        }
+      }
+// REL: WARNING: NumericalStabilitySanitizer: inconsistent shadow
+// Note: We are not checking the line because nsan detects the issue at the
+// `y[j]=` store location in debug mode, and at the `abs()` location in release
+// mode because the store is optimized out.
+      y[j] = aux / a[j][j];
+
+// STOP: WARNING: NumericalStabilitySanitizer: floating-point comparison results depend on precision
+// STOP: #0{{.*}}in main{{.*}}cadna_ex7.cc:[[@LINE+1]]
+      if (fabsf(x[j]-y[j]) > anorm) {
+        anorm = fabsf(x[j]-y[j]);
+      }
+    }
+    printf("iter = %i\n", iter);
+// STOP: WARNING: NumericalStabilitySanitizer: floating-point comparison results depend on precision
+// STOP: #0{{.*}}in main{{.*}}cadna_ex7.cc:[[@LINE+1]]
+    if (anorm < kEpsilon) break;
+  }
+
+  printf("niter = %i\n", iter);
+  for (int i = 0; i < kNDims; ++i) {
+    float aux = -b[i];
+    for (int j = 0; j < kNDims; ++j) {
+      aux = aux + a[i][j]*y[j];
+    }
+    printf("x_sol(%2i) = %15.7f (true value : %15.7f), residue(%2i) = %15.7f\n",
+           i, y[i], xsol[i], i, aux);
+  }
+
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/cancellation_fn_ptr.cc b/compiler-rt/test/nsan/cancellation_fn_ptr.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/cancellation_fn_ptr.cc
@@ -0,0 +1,65 @@
+// RUN: %clangxx_nsan -O0 -g -DFN=Cube %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// RUN: %clangxx_nsan -O1 -g -DFN=Cube %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// RUN: %clangxx_nsan -O2 -g -DFN=Cube %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// RUN: %clangxx_nsan -O0 -g -DFN=Square %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// RUN: %clangxx_nsan -O2 -g -DFN=Square %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// RUN: %clangxx_nsan -O0 -g -DFN=Inverse %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// RUN: %clangxx_nsan -O2 -g -DFN=Inverse %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// Computes the derivative of x -> fn(x) using a finite difference
+// approximation:
+//   f'(a) = (f(a + da) - f(a)) / da
+// https://en.wikipedia.org/wiki/Numerical_differentiation#Finite_differences
+// Numerical differentiation is a well-known case of numerical instability.
+// It typically leads to cancellation errors and division issues as `da`
+// approaches zero.
+// This is similar to `cancellation_libm.cc`, but this variant uses a function
+// pointer to a user-defined function instead of a libm function.
+
+#include <math.h>
+#include <stdio.h>
+#define xstr(s) str(s)
+#define str(s) #s
+
+static float Square(float x) {
+  return x * x;
+}
+
+static float Cube(float x) {
+  return x * x * x;
+}
+
+static float Inverse(float x) {
+  return 1.0f / x;
+}
+
+__attribute__((noinline)) // To check call stack reporting.
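+// (Rough numbers, for intuition: with a = 0.1f and da = 2^-20, fn(a + da) and
+// fn(a) agree in roughly their 20 leading mantissa bits, so the float
+// subtraction keeps only about 4 of float's 24 significand bits, while the
+// double-precision shadow keeps about 33 of its 53; the two results then
+// diverge by far more than the admissible relative error.)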
+float ComputeDerivative(float (*fn)(float), float a, float da) {
+  return (fn(a + da) - fn(a)) / da;
+  // CHECK: WARNING: NumericalStabilitySanitizer: inconsistent shadow results while checking return
+  // CHECK: float {{ *}}precision (native):
+  // CHECK: double{{ *}}precision (shadow):
+  // CHECK: {{#0 .*in ComputeDerivative}}
+}
+
+int main() {
+  for (int i = 7; i < 31; ++i) {
+    float step = 1.0f / (1ull << i);
+    printf("%s derivative: %.8f\n", xstr(FN), ComputeDerivative(&FN, 0.1f, step));
+  }
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/cancellation_libm.cc b/compiler-rt/test/nsan/cancellation_libm.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/cancellation_libm.cc
@@ -0,0 +1,51 @@
+// RUN: %clangxx_nsan -O0 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// RUN: %clangxx_nsan -O1 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// RUN: %clangxx_nsan -O2 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// NOTE: -fno-math-errno allows clang to emit an intrinsic.
+
+// RUN: %clangxx_nsan -O0 -g %s -o %t -fno-math-errno && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// RUN: %clangxx_nsan -O1 -g %s -o %t -fno-math-errno && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// RUN: %clangxx_nsan -O2 -g0 %s -o %t -fno-math-errno && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// Computes the derivative of x -> expf(x) using a finite difference
+// approximation:
+//   f'(a) = (f(a + da) - f(a)) / da
+// https://en.wikipedia.org/wiki/Numerical_differentiation#Finite_differences
+// Numerical differentiation is a well-known case of numerical instability.
+// It typically leads to cancellation errors and division issues as `da`
+// approaches zero.
+
+#include <math.h>
+#include <stdio.h>
+
+// Note that expf is not instrumented, so we cannot detect the numerical
+// discrepancy if we do not recognize intrinsics.
+__attribute__((noinline)) // To check call stack reporting.
+float ComputeDerivative(float a, float da) {
+  return (expf(a + da) - expf(a)) / da;
+  // CHECK: WARNING: NumericalStabilitySanitizer: inconsistent shadow results while checking return
+  // CHECK: float {{ *}}precision (native):
+  // CHECK: double{{ *}}precision (shadow):
+  // CHECK: {{#0 .*in ComputeDerivative}}
+}
+
+int main() {
+  for (int i = 1; i < 31; ++i) {
+    const float step = 1.0f / (1ull << i);
+    printf("derivative (step %f):\n", step);
+    printf("  %.8f\n", ComputeDerivative(0.1f, step));
+  }
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/cancellation_ok.cc b/compiler-rt/test/nsan/cancellation_ok.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/cancellation_ok.cc
@@ -0,0 +1,53 @@
+// RUN: %clangxx_nsan -O0 -g -DIMPL=Naive -mllvm -nsan-instrument-fcmp=0 %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t
+// RUN: %clangxx_nsan -O2 -g -DIMPL=Naive -mllvm -nsan-instrument-fcmp=0 %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t
+// RUN: %clangxx_nsan -O0 -g -DIMPL=Better1 -mllvm -nsan-instrument-fcmp=0 %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t
+// RUN: %clangxx_nsan -O2 -g -DIMPL=Better1 -mllvm -nsan-instrument-fcmp=0 %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t
+// RUN: %clangxx_nsan -O0 -g -DIMPL=Better2 -mllvm -nsan-instrument-fcmp=0 %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t
+// RUN: %clangxx_nsan -O2 -g -DIMPL=Better2 -mllvm -nsan-instrument-fcmp=0 %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t
+
+// This tests a few cancellations from the implementations of the function
+// presented in https://people.eecs.berkeley.edu/~wkahan/JAVAhurt.pdf, page 27.
+// All three functions have varying degrees of cancellation, none of which
+// lead to catastrophic errors.
+
+#include <math.h>
+#include <stdio.h>
+
+// This never loses more than 1/2 of the digits.
+__attribute__((noinline)) static double Naive(const double X) {
+  double Y, Z;
+  Y = X - 1.0;
+  Z = exp(Y);
+  if (Z != 1.0)
+    Z = Y / (Z - 1.0);
+  return Z;
+}
+
+__attribute__((noinline)) static double Better1(const double X) {
+  long double Y, Z;
+  Y = X - 1.0;
+  Z = exp(Y);
+  if (Z != 1.0)
+    Z = Y / (Z - 1.0);
+  return Z;
+}
+
+// This is precise to a few ulps.
+__attribute__((noinline)) static double Better2(const double X) {
+  double Y, Z;
+  Y = X - 1.0;
+  Z = exp(Y);
+  if (Z != 1.0)
+    Z = log(Z) / (Z - 1.0);
+  return Z;
+}
+
+int main() {
+  for (int i = 7; i < 31; ++i) {
+    const double x = 1.0 + 1.0 / (1ull << i);
+    printf("value at %.16f:\n", x);
+    printf("  %.16f\n", IMPL(x));
+  }
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/compare.cc b/compiler-rt/test/nsan/compare.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/compare.cc
@@ -0,0 +1,28 @@
+// RUN: %clangxx_nsan -O2 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// This test checks that we warn when a floating-point comparison takes
+// different values in the application and shadow domain.
+
+#include <math.h>
+#include <stdio.h>
+
+// 0.6/0.2 is slightly below 3, so the comparison will fail after a certain
+// threshold that depends on the precision of the computation.
+__attribute__((noinline)) // To check call stack reporting.
+bool DoCmp(double a, double b, double c, double threshold) {
+  return c - a / b < threshold;
+  // CHECK: WARNING: NumericalStabilitySanitizer: floating-point comparison results depend on precision
+  // CHECK: double {{ *}}precision dec (native): {{.*}}<{{.*}}
+  // CHECK: __float128{{ *}}precision dec (shadow): {{.*}}<{{.*}}
+  // CHECK: {{#0 .*in DoCmp}}
+}
+
+int main() {
+  double threshold = 1.0;
+  for (int i = 0; i < 60; ++i) {
+    threshold /= 2;
+    printf("value at threshold %.20f: %i\n", threshold,
+           DoCmp(0.6, 0.2, 3.0, threshold));
+  }
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/compute_pi.cc b/compiler-rt/test/nsan/compute_pi.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/compute_pi.cc
@@ -0,0 +1,45 @@
+// RUN: %clangxx_nsan -O0 -mllvm -nsan-shadow-type-mapping=dqq -g -DRECURRENCE=Good %s -o %t && NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=10 %run %t
+
+// RUN: %clangxx_nsan -O1 -mllvm -nsan-shadow-type-mapping=dqq -g -DRECURRENCE=Good %s -o %t && NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=10 %run %t
+
+// RUN: %clangxx_nsan -O2 -mllvm -nsan-shadow-type-mapping=dqq -g0 -DRECURRENCE=Good %s -o %t && NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=10 %run %t
+
+// RUN: %clangxx_nsan -O0 -mllvm -nsan-shadow-type-mapping=dqq -g -DRECURRENCE=Bad %s -o %t && NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=10 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// RUN: %clangxx_nsan -O1 -mllvm -nsan-shadow-type-mapping=dqq -g -DRECURRENCE=Bad %s -o %t && NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=10 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// RUN: %clangxx_nsan -O2 -mllvm -nsan-shadow-type-mapping=dqq -g0 -DRECURRENCE=Bad %s -o %t && NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=10 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// This is the Archimedes algorithm for computing pi, starting from a hexagon
+// and doubling the number of edges at every iteration.
+// https://en.wikipedia.org/wiki/Floating-point_arithmetic#Minimizing_the_effect_of_accuracy_problems
+
+#include <math.h>
+#include <stdio.h>
+
+__attribute__((noinline)) // To check call stack reporting.
+double Bad(double ti) {
+  return (sqrt(ti * ti + 1) - 1) / ti;
+  // CHECK: WARNING: NumericalStabilitySanitizer: inconsistent shadow results
+  // CHECK: double {{ *}}precision (native):
+  // CHECK: __float128 {{ *}}precision (shadow):
+  // CHECK: {{#0 .*in Bad}}
+}
+
+// This is a better equivalent that does not have the unstable cancellation.
+__attribute__((noinline)) // For consistency.
+double Good(double ti) {
+  return ti / (sqrt(ti * ti + 1) + 1);
+}
+
+int main() {
+  double ti = 1 / sqrt(3); // t0.
+  for (int i = 0; i < 60; ++i) {
+    printf("%2i pi= %.16f\n", i, 6.0 * (1ull << i) * ti);
+    ti = RECURRENCE(ti);
+  }
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/helpers.h b/compiler-rt/test/nsan/helpers.h
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/helpers.h
@@ -0,0 +1,17 @@
+// Prevents the compiler from optimizing everything away.
+template <typename T>
+void DoNotOptimize(const T &var) {
+  asm volatile("" : "+m"(const_cast<T &>(var)));
+}
+
+// Writes a single double with an inconsistent shadow value to *data.
+void CreateInconsistency(double *data) {
+  double num = 0.6;
+  double denom = 0.2;
+  // Prevent the compiler from constant-folding this.
+  DoNotOptimize(num);
+  DoNotOptimize(denom);
+  // Both values are very close to 0.0, but the shadow value is closer.
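+  // Concretely (approximate values): in double, 0.6/0.2 rounds to
+  // 2.9999999999999996, so num/denom - 3.0 is about -4.44e-16 and the stored
+  // result is about -2.25e15, while the extended-precision shadow difference
+  // is about -2.78e-16, giving a shadow result of about -3.6e15.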
+ *data = 1.0 / (num/denom - 3.0); +} diff --git a/compiler-rt/test/nsan/infinity.cc b/compiler-rt/test/nsan/infinity.cc new file mode 100644 --- /dev/null +++ b/compiler-rt/test/nsan/infinity.cc @@ -0,0 +1,25 @@ +// This test case verifies that we handle infinity correctly. + +// RUN: %clangxx_nsan -O2 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t >%t.out 2>&1 + + +#include +#include + +#include "helpers.h" + +__attribute__((noinline)) // To check call stack reporting. +void StoreInf(double* a) { + DoNotOptimize(a); + double inf = std::numeric_limits::infinity(); + DoNotOptimize(inf); + *a = inf; +} + +int main() { + double d; + StoreInf(&d); + DoNotOptimize(d); + printf("%.16f\n", d); + return 0; +} diff --git a/compiler-rt/test/nsan/intercept_libc_str.cc b/compiler-rt/test/nsan/intercept_libc_str.cc new file mode 100644 --- /dev/null +++ b/compiler-rt/test/nsan/intercept_libc_str.cc @@ -0,0 +1,149 @@ +// RUN: %clangxx_nsan -O2 -g -DFN=StrFry %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1 +// RUN: FileCheck --check-prefix=STRFRY %s < %t.out + +// RUN: %clangxx_nsan -O2 -g -DFN=StrSep %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1 +// RUN: FileCheck --check-prefix=STRSEP %s < %t.out + +// RUN: %clangxx_nsan -O2 -g -DFN=StrTok %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1 +// RUN: FileCheck --check-prefix=STRTOK %s < %t.out + +// RUN: %clangxx_nsan -O2 -g -DFN=StrDup %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1 +// RUN: FileCheck --check-prefix=STRDUP %s < %t.out + +// RUN: %clangxx_nsan -O2 -g -DFN=StrNDup %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1 +// RUN: FileCheck --check-prefix=STRNDUP %s < %t.out + +// RUN: %clangxx_nsan -O2 -g -DFN=StpCpy %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1 +// RUN: FileCheck --check-prefix=STPCPY %s < %t.out + +// RUN: %clangxx_nsan -O2 -g -DFN=StrCpy %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1 +// RUN: FileCheck --check-prefix=STRCPY %s < %t.out + +// RUN: %clangxx_nsan -O2 -g -DFN=StrNCpy %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1 +// RUN: FileCheck --check-prefix=STRNCPY %s < %t.out + +// RUN: %clangxx_nsan -O2 -g -DFN=StrCat %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1 +// RUN: FileCheck --check-prefix=STRCAT %s < %t.out + +// RUN: %clangxx_nsan -O2 -g -DFN=StrNCat %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1 +// RUN: FileCheck --check-prefix=STRNCAT %s < %t.out + +// This test case checks libc string operations interception. + +#include +#include +#include + +#include "helpers.h" + +extern "C" void __nsan_dump_shadow_mem(const char *addr, size_t size_bytes, size_t bytes_per_line, size_t reserved); + +void StrFry(char* const s) { + strfry(s); + __nsan_dump_shadow_mem(s, sizeof(float), sizeof(float), 0); +// strfry just destroys the whole area. +// STRFRY: StrFry +// STRFRY-NEXT: f0 f1 f2 f3 +// STRFRY-NEXT: __ __ __ f3 +} + +void StrSep(char* const s) { + char* sc = s; + strsep(&sc, "\x40"); + __nsan_dump_shadow_mem(s, sizeof(float), sizeof(float), 0); +// strsep destroys the element that was replaced with a null character. 
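+// ("\x40" is '@', which matches the byte at offset 1 of the buffer built in
+// main below, so strsep writes '\0' there and only that shadow byte is
+// invalidated.)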
+// STRSEP: StrSep
+// STRSEP-NEXT: f0 f1 f2 f3
+// STRSEP-NEXT: f0 __ f2 f3
+}
+
+void StrTok(char* const s) {
+  strtok(s, "\x40");
+  __nsan_dump_shadow_mem(s, sizeof(float), sizeof(float), 0);
+// strtok just destroys the whole area except the terminator.
+// STRTOK: StrTok
+// STRTOK-NEXT: f0 f1 f2 f3
+// STRTOK-NEXT: __ __ __ f3
+}
+
+void StrDup(char* const s) {
+  char* const dup = strdup(s);
+  __nsan_dump_shadow_mem(dup, 4, 4, 0);
+  free(dup);
+// STRDUP: StrDup
+// STRDUP-NEXT: f0 f1 f2 f3
+// STRDUP-NEXT: f0 f1 f2 __
+}
+
+void StrNDup(char* const s) {
+  char* const dup = strndup(s, 2);
+  __nsan_dump_shadow_mem(dup, 3, 3, 0);
+  free(dup);
+// STRNDUP: StrNDup
+// STRNDUP-NEXT: f0 f1 f2 f3
+// STRNDUP-NEXT: f0 f1 __
+}
+
+void StpCpy(char* const s) {
+  char buffer[] = "abcdef\0";
+  stpcpy(buffer, s);
+  __nsan_dump_shadow_mem(buffer, sizeof(buffer), sizeof(buffer), 0);
+// STPCPY: StpCpy
+// STPCPY-NEXT: f0 f1 f2 f3
+// STPCPY-NEXT: f0 f1 f2 __
+}
+
+void StrCpy(char* const s) {
+  char buffer[] = "abcdef\0";
+  strcpy(buffer, s);
+  __nsan_dump_shadow_mem(buffer, sizeof(buffer), sizeof(buffer), 0);
+// STRCPY: StrCpy
+// STRCPY-NEXT: f0 f1 f2 f3
+// STRCPY-NEXT: f0 f1 f2 __
+}
+
+void StrNCpy(char* const s) {
+  char buffer[] = "abcdef\0";
+  strncpy(buffer, s, 2);
+  __nsan_dump_shadow_mem(buffer, sizeof(buffer), sizeof(buffer), 0);
+// STRNCPY: StrNCpy
+// STRNCPY-NEXT: f0 f1 f2 f3
+// STRNCPY-NEXT: f0 f1 __
+}
+
+void StrCat(char* const s) {
+  char buffer[] = "abcd\0  ";
+  strcat(buffer, s);
+  __nsan_dump_shadow_mem(buffer, sizeof(buffer), sizeof(buffer), 0);
+// STRCAT: StrCat
+// STRCAT-NEXT: f0 f1 f2 f3
+// STRCAT-NEXT: __ __ __ __ f0 f1 f2 __
+}
+
+void StrNCat(char* const s) {
+  char buffer[] = "abcd\0 ";
+  strncat(buffer, s, 2);
+  __nsan_dump_shadow_mem(buffer, sizeof(buffer), sizeof(buffer), 0);
+// STRNCAT: StrNCat
+// STRNCAT-NEXT: f0 f1 f2 f3
+// STRNCAT-NEXT: __ __ __ __ f0 f1 __
+}
+
+int main() {
+  // This has binary representation 0x00804020, which in memory (little-endian)
+  // is {0x20,0x40,0x80,0x00}.
+  float f = 1.17779472238e-38f;
+  DoNotOptimize(f);
+  char buffer[sizeof(float)];
+  memcpy(buffer, &f, sizeof(float));
+  printf("{0x%x, 0x%x, 0x%x, 0x%x}\n",
+         (unsigned char)buffer[0], (unsigned char)buffer[1],
+         (unsigned char)buffer[2], (unsigned char)buffer[3]);
+#define str(s) #s
+#define xstr(s) str(s)
+  puts(xstr(FN));
+  __nsan_dump_shadow_mem(buffer, sizeof(float), sizeof(float), 0);
+  FN(buffer);
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/intercept_libc_wstr.cc b/compiler-rt/test/nsan/intercept_libc_wstr.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/intercept_libc_wstr.cc
@@ -0,0 +1,80 @@
+// RUN: %clangxx_nsan -O2 -g -DFN=WcsDup %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1
+// RUN: FileCheck --check-prefix=WCSDUP %s < %t.out
+
+// RUN: %clangxx_nsan -O2 -g -DFN=WcpCpy %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1
+// RUN: FileCheck --check-prefix=WCPCPY %s < %t.out
+
+// RUN: %clangxx_nsan -O2 -g -DFN=WcsCpy %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1
+// RUN: FileCheck --check-prefix=WCSCPY %s < %t.out
+
+// RUN: %clangxx_nsan -O2 -g -DFN=WcsCat %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1
+// RUN: FileCheck --check-prefix=WCSCAT %s < %t.out
+
+// This test case checks libc wide string operations interception.
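+// (In the shadow dumps below, "dN" means the N-th byte of a double-typed
+// shadow, "fN" the N-th byte of a float-typed shadow, and "__" an invalid or
+// unknown shadow byte; since wchar_t is 4 bytes here, one double spans two
+// wide characters.)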
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cwchar>
+
+#include "helpers.h"
+
+extern "C" void __nsan_dump_shadow_mem(const char *addr, size_t size_bytes,
+                                       size_t bytes_per_line, size_t reserved);
+
+void WcsDup(wchar_t *const s) {
+  wchar_t *const dup = wcsdup(s);
+  __nsan_dump_shadow_mem(reinterpret_cast<const char *>(dup), 8, 8, 0);
+  free(dup);
+  // WCSDUP: WcsDup
+  // WCSDUP-NEXT: d0 d1 d2 d3 d4 d5 d6 d7
+  // WCSDUP-NEXT: d0 d1 d2 d3 __ __ __ __
+}
+
+void WcpCpy(wchar_t *const s) {
+  wchar_t buffer[] = L"abc\0";
+  wcpcpy(buffer, s);
+  __nsan_dump_shadow_mem(reinterpret_cast<const char *>(buffer), sizeof(buffer),
+                         sizeof(buffer), 0);
+  // WCPCPY: WcpCpy
+  // WCPCPY-NEXT: d0 d1 d2 d3 d4 d5 d6 d7
+  // WCPCPY-NEXT: d0 d1 d2 d3 __ __ __ __
+}
+
+void WcsCpy(wchar_t *const s) {
+  wchar_t buffer[] = L"abc\0";
+  wcscpy(buffer, s);
+  __nsan_dump_shadow_mem(reinterpret_cast<const char *>(buffer), sizeof(buffer),
+                         sizeof(buffer), 0);
+  // WCSCPY: WcsCpy
+  // WCSCPY-NEXT: d0 d1 d2 d3 d4 d5 d6 d7
+  // WCSCPY-NEXT: d0 d1 d2 d3 __ __ __ __
+}
+
+void WcsCat(wchar_t *const s) {
+  wchar_t buffer[] = L"a\0 ";
+  wcscat(buffer, s);
+  __nsan_dump_shadow_mem(reinterpret_cast<const char *>(buffer), sizeof(buffer),
+                         sizeof(buffer), 0);
+  // WCSCAT: WcsCat
+  // WCSCAT-NEXT: d0 d1 d2 d3 d4 d5 d6 d7
+  // WCSCAT-NEXT: __ __ __ __ d0 d1 d2 d3 __ __ __ __
+}
+
+int main() {
+  // This has binary representation 0x0000000080402010, which in memory
+  // (little-endian) is {0x10,0x20,0x40,0x80,0x00,0x00,0x00,0x00}.
+  double f = 1.0630742122880717462525516679E-314;
+  DoNotOptimize(f);
+  wchar_t buffer[sizeof(double) / sizeof(wchar_t)];
+  memcpy(buffer, &f, sizeof(double));
+  static_assert(sizeof(wchar_t) == 4, "not implemented");
+  printf("{0x%x, 0x%x}\n", buffer[0], buffer[1]);
+#define str(s) #s
+#define xstr(s) str(s)
+  puts(xstr(FN));
+  __nsan_dump_shadow_mem(reinterpret_cast<const char *>(buffer), sizeof(double),
+                         sizeof(double), 0);
+  FN(buffer);
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/interface_dump_shadow_mem.cc b/compiler-rt/test/nsan/interface_dump_shadow_mem.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/interface_dump_shadow_mem.cc
@@ -0,0 +1,62 @@
+// RUN: %clangxx_nsan -O2 -g %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// RUN: %clangxx_nsan -fno-builtin -O2 -g -mllvm -nsan-shadow-type-mapping=dqq %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+// RUN: %clangxx_nsan -fno-builtin -O2 -g -mllvm -nsan-shadow-type-mapping=dlq %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// This test checks that the sanitizer interface function
+// `__nsan_dump_shadow_mem` works correctly.
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+extern "C" void __nsan_dump_shadow_mem(const char *addr, size_t size_bytes,
+                                       size_t bytes_per_line, size_t reserved);
+
+int main() {
+  char buffer[64];
+  int pos = 0;
+  // One aligned float.
+  const float f = 42.0;
+  memcpy(&(buffer[pos]), &f, sizeof(f));
+  pos += sizeof(f);
+  // One 4-byte aligned double.
+  const double d = 35.0;
+  memcpy(&(buffer[pos]), &d, sizeof(d));
+  pos += sizeof(d);
+  // Three uninitialized bytes.
+  pos += 3;
+  // One char byte.
+  buffer[pos] = 'a';
+  pos += 1;
+  // One long double.
+  const long double l = 0.0000000001;
+  memcpy(&(buffer[pos]), &l, sizeof(l));
+  pos += sizeof(l);
+  // One more double, but erase bytes in the middle.
+  const double d2 = 53.0;
+  memcpy(&(buffer[pos]), &d2, sizeof(d2));
+  pos += sizeof(d2);
+  uint32_t i = 5;
+  memcpy(&(buffer[pos - 5]), &i, sizeof(i));
+  // And finally two consecutive floats.
+  const float f2 = 43.0;
+  memcpy(&(buffer[pos]), &f2, sizeof(f2));
+  pos += sizeof(f2);
+  const float f3 = 44.0;
+  memcpy(&(buffer[pos]), &f3, sizeof(f3));
+
+  __nsan_dump_shadow_mem(buffer, sizeof(buffer), 8, 0);
+// CHECK: 0x{{[a-f0-9]*}}: f0 f1 f2 f3 d0 d1 d2 d3 (42.00000000000000000000)
+// CHECK-NEXT: 0x{{[a-f0-9]*}}: d4 d5 d6 d7 __ __ __ __ (35.00000000000000000000)
+// CHECK-NEXT: 0x{{[a-f0-9]*}}: l0 l1 l2 l3 l4 l5 l6 l7
+// CHECK-NEXT: 0x{{[a-f0-9]*}}: l8 l9 la lb lc ld le lf (0.00000000010000000000)
+// CHECK-NEXT: 0x{{[a-f0-9]*}}: d0 d1 d2 f0 f1 f2 f3 d7
+// CHECK-NEXT: 0x{{[a-f0-9]*}}: f0 f1 f2 f3 f0 f1 f2 f3 (43.00000000000000000000) (44.00000000000000000000)
+// CHECK-NEXT: 0x{{[a-f0-9]*}}: __ __ __ __ __ __ __ __
+// CHECK-NEXT: 0x{{[a-f0-9]*}}: __ __ __ __ __ __ __ __
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/jmmuller.cc b/compiler-rt/test/nsan/jmmuller.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/jmmuller.cc
@@ -0,0 +1,35 @@
+// RUN: %clangxx_nsan -O0 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t
+
+// RUN: %clangxx_nsan -O1 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t
+
+// RUN: %clangxx_nsan -O2 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t
+
+// This tests J-M Müller's Kahan Challenge:
+// http://arith22.gforge.inria.fr/slides/06-gustafson.pdf
+//
+// The problem is to evaluate `H` at 15, 16, 17, and 9999. The correct
+// answer is (1,1,1,1).
+// Note that in this case, even though the shadow computation in quad mode is
+// also wrong, the inconsistency check shows that there is an issue.
+
+#include <cmath>
+#include <cstdio>
+
+double E(double z) {
+  return z == 0.0 ? 1.0 : (exp(z) - 1.0) / z;
+}
+
+double Q(double x) {
+  return fabs(x - sqrt(x * x + 1)) - 1 / (x + sqrt(x * x + 1));
+}
+
+__attribute__((noinline)) // Do not constant-fold.
+double H(double x) { return E(Q(x * x)); }
+
+int main() {
+  constexpr const double kX[] = {15.0, 16.0, 17.0, 9999.0};
+  printf("(H(%f), H(%f), H(%f), H(%f)) = (%.8f, %.8f, %.8f, %.8f)\n",
+         kX[0], kX[1], kX[2], kX[3],
+         H(kX[0]), H(kX[1]), H(kX[2]), H(kX[3]));
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/lit.cfg.py b/compiler-rt/test/nsan/lit.cfg.py
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/lit.cfg.py
@@ -0,0 +1,45 @@
+# -*- Python -*-
+
+import os
+
+# Setup config name.
+config.name = 'NSan' + config.name_suffix
+
+# Setup source root.
+config.test_source_root = os.path.dirname(__file__)
+
+# Test suffixes.
+config.suffixes = ['.c', '.cc', '.test']
+
+# C & CXX flags.
+c_flags = ([config.target_cflags])
+
+# Android doesn't want -lrt.
+if not config.android:
+  c_flags += ["-lrt"]
+
+cxx_flags = (c_flags + config.cxx_mode_flags + ["-std=c++17"])
+
+nsan_flags = ["-fsanitize=numerical", "-g",
+              "-mno-omit-leaf-frame-pointer",
+              "-fno-omit-frame-pointer"]
+
+def build_invocation(compile_flags):
+  return " " + " ".join([config.clang] + compile_flags) + " "
+
+# Add substitutions.
+config.substitutions.append(("%clang ", build_invocation(c_flags)))
+config.substitutions.append(("%clang_nsan ", build_invocation(c_flags + nsan_flags)))
+config.substitutions.append(("%clangxx_nsan ", build_invocation(cxx_flags + nsan_flags)))
+
+# Platform-specific default NSAN_OPTIONS for lit tests.
+default_nsan_options = ''
+
+config.environment['NSAN_OPTIONS'] = default_nsan_options
+default_nsan_options += ':'
+config.substitutions.append(('%env_nsan_options=',
+                             'env NSAN_OPTIONS=' + default_nsan_options))
+
+# NSan tests are currently supported on Linux only.
+if config.host_os not in ['Linux']:
+  config.unsupported = True
diff --git a/compiler-rt/test/nsan/lit.site.cfg.py.in b/compiler-rt/test/nsan/lit.site.cfg.py.in
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/lit.site.cfg.py.in
@@ -0,0 +1,11 @@
+@LIT_SITE_CFG_IN_HEADER@
+
+config.name_suffix = "@NSAN_TEST_CONFIG_SUFFIX@"
+config.target_arch = "@NSAN_TEST_TARGET_ARCH@"
+config.target_cflags = "@NSAN_TEST_TARGET_CFLAGS@"
+
+# Load common config for all compiler-rt lit tests.
+lit_config.load_config(config, "@COMPILER_RT_BINARY_DIR@/test/lit.common.configured")
+
+# Load tool-specific config that would do the real work.
+lit_config.load_config(config, "@NSAN_LIT_SOURCE_DIR@/lit.cfg.py")
diff --git a/compiler-rt/test/nsan/memcpy.cc b/compiler-rt/test/nsan/memcpy.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/memcpy.cc
@@ -0,0 +1,83 @@
+// This test case verifies that we can track shadow memory values across
+// explicit or implicit calls to memcpy.
+
+// RUN: %clangxx_nsan -O2 -g -DIMPL=OpEq %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// RUN: %clangxx_nsan -O2 -g -DIMPL=Memcpy %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// RUN: %clangxx_nsan -O2 -g -DIMPL=MemcpyInline %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include <cstddef>
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "helpers.h"
+
+class OpEq {
+ public:
+  double* data() const { return data_.get(); }
+
+  OpEq() = default;
+  OpEq(const OpEq& other) {
+    *data_ = *other.data_;
+  }
+
+ private:
+  std::unique_ptr<double> data_ = std::make_unique<double>();
+};
+
+class Memcpy {
+ public:
+  double* data() const { return data_.get(); }
+
+  Memcpy() = default;
+  Memcpy(const Memcpy& other) {
+    auto size = sizeof(double);
+    DoNotOptimize(size);  // Prevent the compiler from optimizing this to a load-store.
+    memcpy(data_.get(), other.data_.get(), size);
+  }
+
+ private:
+  std::unique_ptr<double> data_ = std::make_unique<double>();
+};
+
+class MemcpyInline {
+ public:
+  double* data() const { return data_.get(); }
+
+  MemcpyInline() = default;
+  MemcpyInline(const MemcpyInline& other) {
+    __builtin_memcpy(data_.get(), other.data_.get(), sizeof(double));
+  }
+
+ private:
+  std::unique_ptr<double> data_ = std::make_unique<double>();
+};
+
+class Vector : public std::vector<double> {
+ public:
+  Vector() : std::vector<double>(1) {}
+};
+
+int main() {
+  using Impl = IMPL;
+  Impl src;
+  CreateInconsistency(src.data());
+  DoNotOptimize(src);
+  // We first verify that an incorrect value has been generated in the original
+  // data location.
+  printf("%.16f\n", *src.data());
+  // CHECK: #0{{.*}}in main{{.*}}memcpy.cc:[[@LINE-1]]
+  Impl dst(src);
+  DoNotOptimize(dst);
+  // If we correctly carried the shadow value across the copy, this warns
+  // again.
+ printf("%.16f\n", *dst.data()); + // CHECK: #0{{.*}}in main{{.*}}memcpy.cc:[[@LINE-1]] + return 0; +} diff --git a/compiler-rt/test/nsan/memset_nonzero.cc b/compiler-rt/test/nsan/memset_nonzero.cc new file mode 100644 --- /dev/null +++ b/compiler-rt/test/nsan/memset_nonzero.cc @@ -0,0 +1,23 @@ +// RUN: %clangxx_nsan -O0 -mllvm -nsan-shadow-type-mapping=dqq -g %s -o %t && NSAN_OPTIONS=halt_on_error=1,enable_loadtracking_stats=1,print_stats_on_exit=1 %run %t >%t.out 2>&1 +// RUN: FileCheck %s < %t.out + +#include "helpers.h" + +#include +#include + +// This tests tracking of loads where the application value has been set to +// a non-zero value in a untyped way (e.g. memset). +// nsan resumes by re-extending the original value, and logs the event to stats. +// Also see `memset_zero.cc`. + +int main() { + double* d = new double(2.0); + printf("%.16f\n", *d); + DoNotOptimize(d); + memset(d, 0x55, sizeof(double)); + DoNotOptimize(d); + printf("%.16f\n", *d); +// CHECK: There were 0/1 floating-point loads where the shadow type was invalid/unknown. + return 0; +} diff --git a/compiler-rt/test/nsan/memset_zero.cc b/compiler-rt/test/nsan/memset_zero.cc new file mode 100644 --- /dev/null +++ b/compiler-rt/test/nsan/memset_zero.cc @@ -0,0 +1,24 @@ +// RUN: %clangxx_nsan -O0 -mllvm -nsan-shadow-type-mapping=dqq -g %s -o %t && NSAN_OPTIONS=halt_on_error=1,enable_loadtracking_stats=1,print_stats_on_exit=1 %run %t >%t.out 2>&1 +// RUN: FileCheck %s < %t.out + +#include "helpers.h" + +#include +#include + +// This tests tracking of loads where the application value has been set to zero +// in a untyped way (e.g. memset). +// nsan resumes by re-extending the original value, without logging. +// Also see `memset_nonzero.cc`. Zero is special because application typically +// initialize large buffers to zero in an untyped way. + +int main() { + double* d = new double(2.0); + printf("%.16f\n", *d); + DoNotOptimize(d); + memset(d, 0, sizeof(double)); + DoNotOptimize(d); + printf("%.16f\n", *d); +// CHECK: There were 0/0 floating-point loads where the shadow type was invalid/unknown. + return 0; +} diff --git a/compiler-rt/test/nsan/rumps_royal_pain.cc b/compiler-rt/test/nsan/rumps_royal_pain.cc new file mode 100644 --- /dev/null +++ b/compiler-rt/test/nsan/rumps_royal_pain.cc @@ -0,0 +1,37 @@ +// RUN: %clangxx_nsan -O0 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1 +// RUN: FileCheck %s < %t.out + +// RUN: %clangxx_nsan -O1 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1 +// RUN: FileCheck %s < %t.out + +// RUN: %clangxx_nsan -O2 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1 +// RUN: FileCheck %s < %t.out + +// This tests Rump’s Royal Pain: +// http://arith22.gforge.inria.fr/slides/06-gustafson.pdf +// +// The problem is to evaluate `RumpsRoyalPain(77617, 33096)`. The exact value is +// –0.82739605994682136. Note that in this case, even though the shadow +// computation in quad mode is nowhere near the correct value, the inconsistency +// check shows that there is an issue. + +#include +#include + +__attribute__((noinline)) // Do not constant-fold. 
+double
+RumpsRoyalPain(double x, double y) {
+  return 333.75 * pow(y, 6) +
+         pow(x, 2) *
+             (11 * pow(x, 2) * pow(y, 2) - pow(y, 6) - 121 * pow(y, 4) - 2) +
+         5.5 * pow(y, 8) + x / (2 * y);
+  // CHECK: WARNING: NumericalStabilitySanitizer: inconsistent shadow results while checking return
+  // CHECK: {{#0 .*in RumpsRoyalPain}}
+}
+
+int main() {
+  constexpr const double kX = 77617;
+  constexpr const double kY = 33096;
+  printf("RumpsRoyalPain(%f, %f)=%.8f\n", kX, kY, RumpsRoyalPain(kX, kY));
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/simd.cc b/compiler-rt/test/nsan/simd.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/simd.cc
@@ -0,0 +1,25 @@
+// RUN: %clangxx_nsan -O2 -g %s -o %t && NSAN_OPTIONS=halt_on_error=0,resume_after_warning=false %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// This tests vector (simd) sanitization.
+
+#include <cstdio>
+#include <emmintrin.h>
+
+#include "helpers.h"
+
+int main() {
+  double in;
+  CreateInconsistency(&in);
+  __m128d v = _mm_set1_pd(in);
+  DoNotOptimize(in);
+  double v2[2];
+  _mm_storeu_pd(v2, v);
+  // CHECK:{{.*}}inconsistent shadow results while checking store to address
+  // CHECK: #0{{.*}}in main{{.*}}[[@LINE-2]]
+  DoNotOptimize(v2);
+  printf("%f\n", v2[0]);
+  // CHECK:{{.*}}inconsistent shadow results while checking call argument #1
+  // CHECK: #0{{.*}}in main{{.*}}[[@LINE-2]]
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/smooth_surprise.cc b/compiler-rt/test/nsan/smooth_surprise.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/smooth_surprise.cc
@@ -0,0 +1,40 @@
+// RUN: %clangxx_nsan -O0 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t
+
+// RUN: %clangxx_nsan -O1 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t
+
+// RUN: %clangxx_nsan -O2 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 %run %t
+
+// This tests Kahan's Smooth Surprise:
+// http://arith22.gforge.inria.fr/slides/06-gustafson.pdf
+//   log(|3(1-x)+1|)/80 + x^2 + 1
+//
+// This implementation using floats consistently gives the wrong answer, and
+// this cannot be caught by nsan, because the issue here is not the numerical
+// instability of the computations (`SmoothSurprise` is stable), but the density
+// of the floats.
+
+#include <cmath>
+#include <cstdio>
+#include <limits>
+
+double SmoothSurprise(double x) {
+  return log(fabs(3 * (1 - x) + 1)) / 80.0 + x * x + 1;
+}
+
+int main() {
+  double x_min = 0.0;
+  double y_min = std::numeric_limits<double>::max();
+  constexpr const double kStart = 0.8;
+  constexpr const double kEnd = 2.0;
+  constexpr const int kNumSteps = 500000; // Half a million.
+  for (int i = 0; i < kNumSteps; ++i) {
+    const double x = kStart + (i * (kEnd - kStart)) / kNumSteps;
+    const double y = SmoothSurprise(x);
+    if (y < y_min) {
+      x_min = x;
+      y_min = y;
+    }
+  }
+  printf("Minimum at x=%.8f (f(x)=%.8f)\n", x_min, y_min);
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/stable_sort.cc b/compiler-rt/test/nsan/stable_sort.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/stable_sort.cc
@@ -0,0 +1,52 @@
+// RUN: %clangxx_nsan -fno-builtin -O2 -g %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// This tests a particularly hard case of memory tracking. stable_sort does
+// conditional swaps of pairs of elements with mixed types (int/double).
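+// The difficulty is (roughly) that the sort moves each pair's bytes with
+// untyped loads and stores, so the double's shadow has to follow the raw
+// bytes rather than a typed floating-point store; the shadow dumps below
+// verify that the shadow values end up swapped along with the application
+// values.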
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdio>
+#include <utility>
+#include <vector>
+
+extern "C" void __nsan_dump_shadow_mem(const char *addr, size_t size_bytes,
+                                       size_t bytes_per_line, size_t reserved);
+
+__attribute__((noinline)) void Run(std::vector<int> &indices,
+                                   std::vector<double> &values) {
+  const auto num_entries = indices.size();
+  std::vector<std::pair<int, double>> entries;
+  entries.reserve(num_entries);
+  for (size_t i = 0; i < num_entries; ++i) {
+    entries.emplace_back(indices[i], values[i]);
+  }
+  __nsan_dump_shadow_mem((const char *)&entries[0].second,
+                         sizeof(double), sizeof(double), 0);
+  __nsan_dump_shadow_mem((const char *)&entries[1].second,
+                         sizeof(double), sizeof(double), 0);
+  // CHECK: {{.*}}: d0 d1 d2 d3 d4 d5 d6 d7 (1.02800000000000002487)
+  // CHECK-NEXT: {{.*}}: d0 d1 d2 d3 d4 d5 d6 d7 (7.95099999999999962341)
+  std::stable_sort(
+      entries.begin(), entries.end(),
+      [](const std::pair<int, double> &a, const std::pair<int, double> &b) {
+        return a.first < b.first;
+      });
+  __nsan_dump_shadow_mem((const char *)&entries[0].second,
+                         sizeof(double), sizeof(double), 0);
+  __nsan_dump_shadow_mem((const char *)&entries[1].second,
+                         sizeof(double), sizeof(double), 0);
+  // We make sure that the shadow values have been swapped correctly.
+  // CHECK-NEXT: {{.*}}: d0 d1 d2 d3 d4 d5 d6 d7 (7.95099999999999962341)
+  // CHECK-NEXT: {{.*}}: d0 d1 d2 d3 d4 d5 d6 d7 (1.02800000000000002487)
+}
+
+int main() {
+  std::vector<int> indices;
+  std::vector<double> values;
+  indices.push_back(75);
+  values.push_back(1.028);
+  indices.push_back(74);
+  values.push_back(7.951);
+  Run(indices, values);
+}
diff --git a/compiler-rt/test/nsan/stack.cc b/compiler-rt/test/nsan/stack.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/stack.cc
@@ -0,0 +1,18 @@
+// RUN: %clangxx_nsan -O2 -g %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include <cstddef>
+
+#include "helpers.h"
+
+extern "C" void __nsan_dump_shadow_mem(const char *addr, size_t size_bytes,
+                                       size_t bytes_per_line, size_t reserved);
+
+int main() {
+  float array[2];
+  DoNotOptimize(array);
+  array[0] = 1.0;
+  array[1] = 2.0;
+  __nsan_dump_shadow_mem((const char*)array, sizeof(array), 16, 0);
+  // CHECK: {{.*}} f0 f1 f2 f3 f0 f1 f2 f3 (1.00000000000000000000) (2.00000000000000000000)
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/stats.cc b/compiler-rt/test/nsan/stats.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/stats.cc
@@ -0,0 +1,31 @@
+// RUN: %clangxx_nsan -O0 -mllvm -nsan-shadow-type-mapping=dqq -g %s -o %t && NSAN_OPTIONS=halt_on_error=0,disable_warnings=1,enable_check_stats=1,enable_warning_stats=1,print_stats_on_exit=1 %run %t >%t.out 2>&1
+// Checked separately because the order is not deterministic.
+// RUN: FileCheck %s --check-prefix=WARNING < %t.out
+// RUN: FileCheck %s --check-prefix=NOWARNING < %t.out
+
+// This tests the "stats" mode of nsan.
+// In this test:
+//  - we do not stop the application on error (halt_on_error=0),
+//  - we disable real-time printing of warnings (disable_warnings=1),
+//  - we enable stats collection (enable_{check,warning}_stats=1),
+//  - we print stats when exiting the application (print_stats_on_exit=1).
+// We then check that the application correctly collected stats about the
+// checks that were done and where those checks resulted in warnings.
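+//
+// For reference, the stats lines matched below look roughly like:
+//   warned 1 times out of 1024 argument checks (max relative error: 42.0%) at
+//   #0 0x123456 in main .../stats.cc:NN
+// (the counts, percentage, and addresses are placeholders).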
+ +#include "helpers.h" + +#include + +int main() { + double d; + CreateInconsistency(&d); + DoNotOptimize(d); + printf("%.16f\n", d); + // WARNING: warned 1 times out of {{[0-9]*}} argument checks (max relative error: + // {{.*}}%) at WARNING-NEXT:#0{{.*}} in main{{.*}}stats.cc:[[@LINE-2]] + d = 42; + printf("%.16f\n", d); + // NOWARNING: warned 0 times out of {{[0-9]*}} argument checks at + // NOWARNING-NEXT:#0{{.*}} in main{{.*}}stats.cc:[[@LINE-2]] + return 0; +} diff --git a/compiler-rt/test/nsan/sums.cc b/compiler-rt/test/nsan/sums.cc new file mode 100644 --- /dev/null +++ b/compiler-rt/test/nsan/sums.cc @@ -0,0 +1,81 @@ +// RUN: %clangxx_nsan -O0 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=NaiveSum -DFLT=float %s -o %t && NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 not %run %t >%t.out 2>&1 +// RUN: FileCheck %s --check-prefixes=NAIVE,NAIVE-FLOAT < %t.out + +// RUN: %clangxx_nsan -O1 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=NaiveSum -DFLT=float %s -o %t && NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 not %run %t >%t.out 2>&1 +// RUN: FileCheck %s --check-prefixes=NAIVE,NAIVE-FLOAT < %t.out + +// RUN: %clangxx_nsan -O2 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=NaiveSum -DFLT=float %s -o %t && NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 not %run %t >%t.out 2>&1 +// RUN: FileCheck %s --check-prefixes=NAIVE,NAIVE-FLOAT < %t.out +// RUN: %clangxx_nsan -O2 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=NaiveSum -DFLT=double %s -o %t && NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=49 not %run %t >%t.out 2>&1 +// RUN: FileCheck %s --check-prefixes=NAIVE,NAIVE-DOUBLE-QUAD < %t.out +// RUN: %clangxx_nsan -O2 -mllvm -nsan-shadow-type-mapping=dlq -g -DSUM=NaiveSum -DFLT=double %s -o %t && NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=49 not %run %t >%t.out 2>&1 +// RUN: FileCheck %s --check-prefixes=NAIVE,NAIVE-DOUBLE-LONG < %t.out + +// RUN: %clangxx_nsan -O0 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=KahanSum -DFLT=float %s -o %t && NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 %run %t +// RUN: %clangxx_nsan -O3 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=KahanSum -DFLT=float %s -o %t && NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 %run %t +// RUN: %clangxx_nsan -O3 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=KahanSum -DFLT=double %s -o %t && NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=49 %run %t +// RUN: %clangxx_nsan -O3 -mllvm -nsan-shadow-type-mapping=dlq -g -DSUM=KahanSum -DFLT=double %s -o %t && NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=49 %run %t + +#include +#include +#include +#include + +// A naive, unstable summation. +template +__attribute__((noinline)) // To check call stack reporting. +T NaiveSum(const std::vector& values) { + T sum = 0; + for (T v : values) { + sum += v; + } + return sum; + // NAIVE: WARNING: NumericalStabilitySanitizer: inconsistent shadow results while checking return + // NAIVE-FLOAT: float{{ *}}precision (native): + // NAIVE-FLOAT: double{{ *}}precision (shadow): + // NAIVE-DOUBLE-QUAD: double {{ *}}precision (native): + // NAIVE-DOUBLE-QUAD: __float128{{ *}}precision (shadow): + // NAIVE-DOUBLE-LONG: double{{ *}}precision (native): + // NAIVE-DOUBLE-LONG: long double{{ *}}precision (shadow): + // NAIVE: {{#0 .*in .* NaiveSum}} +} + +// Kahan's summation is a numerically stable sum. +// https://en.wikipedia.org/wiki/Kahan_summation_algorithm +template +__attribute__((noinline)) // For consistency. 
+T KahanSum(const std::vector<T>& values) {
+  T sum = 0;
+  T c = 0;
+  for (T v : values) {
+    T y = v - c;
+    T t = sum + y;
+    c = (t - sum) - y;
+    sum = t;
+  }
+  return sum;
+}
+
+int main() {
+  std::vector<FLT> values;
+  constexpr const int kNumValues = 1000000;
+  values.reserve(kNumValues);
+  // Using a seed to avoid flakiness.
+  constexpr uint32_t kSeed = 0x123456;
+  std::mt19937 gen(kSeed);
+  std::uniform_real_distribution<FLT> dis(0.0f, 1000.0f);
+  for (int i = 0; i < kNumValues; ++i) {
+    values.push_back(dis(gen));
+  }
+
+  const auto t1 = std::chrono::high_resolution_clock::now();
+  const auto sum = SUM(values);
+  const auto t2 = std::chrono::high_resolution_clock::now();
+  printf("sum: %.8f\n", sum);
+  std::cout << "runtime: "
+            << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1)
+                       .count() /
+                   1000.0
+            << "ms\n";
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/suppressions.cc b/compiler-rt/test/nsan/suppressions.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/suppressions.cc
@@ -0,0 +1,26 @@
+// RUN: %clangxx_nsan -O2 -g -DIMPL=OpEq %s -o %t
+
+// RUN: rm -f %t.supp
+// RUN: touch %t.supp
+// RUN: NSAN_OPTIONS="halt_on_error=0,resume_after_warning=false,suppressions='%t.supp'" %run %t 2>&1 | FileCheck %s --check-prefixes=NOSUPP
+
+// RUN: echo "consistency:*main*" > %t.supp
+// RUN: NSAN_OPTIONS="halt_on_error=0,resume_after_warning=false,suppressions='%t.supp'" %run %t 2>&1 | FileCheck %s --check-prefixes=SUPP
+
+// This tests sanitizer suppressions, i.e. warning silencing.
+
+#include "helpers.h"
+
+#include <cstdio>
+
+int main() {
+  double d;
+  CreateInconsistency(&d);
+  // NOSUPP: #1{{.*}}[[@LINE-1]]
+  // SUPP-NOT: #1{{.*}}[[@LINE-2]]
+  DoNotOptimize(d);
+  printf("%.16f\n", d);
+  // NOSUPP: #0{{.*}}[[@LINE-1]]
+  // SUPP-NOT: #0{{.*}}[[@LINE-2]]
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/swap.cc b/compiler-rt/test/nsan/swap.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/swap.cc
@@ -0,0 +1,44 @@
+// RUN: %clangxx_nsan -fno-builtin -mllvm -nsan-check-loads -O2 -g2 -UNDEBUG %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// This verifies that shadow memory is tracked correctly across typed and
+// bitcasted swaps.
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+
+extern "C" void __nsan_dump_shadow_mem(const char *addr, size_t size_bytes,
+                                       size_t bytes_per_line, size_t reserved);
+
+__attribute__((noinline)) // prevent optimization
+void SwapFT(double *a, double* b) {
+  // LLVM typically optimizes this to an untyped swap (through i64) anyway.
+  std::swap(*a, *b);
+}
+
+__attribute__((noinline)) // prevent optimization
+void SwapBitcasted(uint64_t *a, uint64_t* b) {
+  std::swap(*a, *b);
+}
+
+int main() {
+  double a = 1.0, b = 2.0;
+  __nsan_dump_shadow_mem((const char*)&a, sizeof(a), sizeof(a), 0);
+  __nsan_dump_shadow_mem((const char*)&b, sizeof(b), sizeof(b), 0);
+  SwapFT(&a, &b);
+  __nsan_dump_shadow_mem((const char*)&a, sizeof(a), sizeof(a), 0);
+  __nsan_dump_shadow_mem((const char*)&b, sizeof(b), sizeof(b), 0);
+  assert(a == 2.0 && b == 1.0);
+  // This breaks strict aliasing but is OK on X86.
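+  // (The bitcasted swap moves the doubles' bytes through integer loads and
+  // stores, so the shadow has to be transferred based on the memory
+  // operations rather than the pointee types for the dumps below to match.)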
+  SwapBitcasted(reinterpret_cast<uint64_t *>(&a), reinterpret_cast<uint64_t *>(&b));
+  __nsan_dump_shadow_mem((const char*)&a, sizeof(a), sizeof(a), 0);
+  __nsan_dump_shadow_mem((const char*)&b, sizeof(b), sizeof(b), 0);
+  assert(a == 1.0 && b == 2.0);
+// CHECK: 0x{{[a-f0-9]*}}: d0 d1 d2 d3 d4 d5 d6 d7 (1.0{{.*}}
+// CHECK-NEXT: 0x{{[a-f0-9]*}}: d0 d1 d2 d3 d4 d5 d6 d7 (2.0{{.*}}
+// CHECK-NEXT: 0x{{[a-f0-9]*}}: d0 d1 d2 d3 d4 d5 d6 d7 (2.0{{.*}}
+// CHECK-NEXT: 0x{{[a-f0-9]*}}: d0 d1 d2 d3 d4 d5 d6 d7 (1.0{{.*}}
+// CHECK-NEXT: 0x{{[a-f0-9]*}}: d0 d1 d2 d3 d4 d5 d6 d7 (1.0{{.*}}
+// CHECK-NEXT: 0x{{[a-f0-9]*}}: d0 d1 d2 d3 d4 d5 d6 d7 (2.0{{.*}}
+}
diff --git a/compiler-rt/test/nsan/type_punning.cc b/compiler-rt/test/nsan/type_punning.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/type_punning.cc
@@ -0,0 +1,26 @@
+// RUN: %clangxx_nsan -O0 -mllvm -nsan-shadow-type-mapping=dqq -g %s -o %t && NSAN_OPTIONS=halt_on_error=1,enable_loadtracking_stats=1,print_stats_on_exit=1 %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include "helpers.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+
+// This tests tracking of loads where the application value has been tampered
+// with through type punning.
+// nsan resumes by re-extending the original value, and logs the failed
+// tracking to stats.
+
+int main() {
+  auto d = std::make_unique<double>(2.0);
+  printf("%.16f\n", *d);
+  DoNotOptimize(d);
+  reinterpret_cast<char *>(d.get())[7] = 0;
+  DoNotOptimize(d);
+  printf("%.16f\n", *d);
+  // CHECK: There were 1/0 floating-point loads where the shadow type was invalid/unknown.
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/uninstrumented_write.cc b/compiler-rt/test/nsan/uninstrumented_write.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/uninstrumented_write.cc
@@ -0,0 +1,22 @@
+// RUN: %clangxx_nsan -O2 -g %s -o %t && NSAN_OPTIONS=halt_on_error=0 %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// This tests load checking. Inconsistencies on load can happen when
+// uninstrumented code writes to memory.
+
+#include "helpers.h"
+
+#include <cstdio>
+#include <memory>
+
+int main() {
+  auto d = std::make_unique<double>(2.0);
+  printf("%.16f\n", *d);
+  DoNotOptimize(d);
+  // Sneakily change the sign bit.
+  asm volatile("xorb $0x80, 7(%0)" : : "r"(d.get()));
+  printf("%.16f\n", *d);
+  // CHECK: WARNING: NumericalStabilitySanitizer: inconsistent shadow results while checking call argument #1
+  // CHECK: {{#0 .*in main}}
+  return 0;
+}
diff --git a/compiler-rt/test/nsan/vector_push_back.cc b/compiler-rt/test/nsan/vector_push_back.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/vector_push_back.cc
@@ -0,0 +1,17 @@
+// RUN: %clangxx_nsan -fno-builtin -O2 -g0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// This test verifies that dynamic memory is correctly tracked.
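+// The shadow for the vector's heap buffer is only created when push_back
+// stores the double into the newly allocated memory, which is why the dump
+// below should show a fully typed d0-d7 shadow rather than __ bytes.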
+
+#include <cstddef>
+#include <vector>
+
+extern "C" void __nsan_dump_shadow_mem(const char *addr, size_t size_bytes,
+                                       size_t bytes_per_line, size_t reserved);
+
+int main() {
+  std::vector<double> values;
+  values.push_back(1.028);
+  __nsan_dump_shadow_mem((const char*)values.data(), 8, 8, 0);
+  // CHECK: 0x{{[a-f0-9]*}}: d0 d1 d2 d3 d4 d5 d6 d7 (1.02800000000000002487)
+}
diff --git a/compiler-rt/test/nsan/verificarlo_case4.cc b/compiler-rt/test/nsan/verificarlo_case4.cc
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/nsan/verificarlo_case4.cc
@@ -0,0 +1,28 @@
+// RUN: %clangxx_nsan -O2 -g %s -o %t && NSAN_OPTIONS=halt_on_error=1 not %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// Case Study #4 from the Verificarlo paper: The loop alternates between
+// accumulating extremely large and extremely small values, leading to large
+// loss of precision.
+
+#include <cstdio>
+
+using FloatT = double;
+
+__attribute__((noinline)) FloatT Case4(FloatT c, int iterations) {
+  for (int i = 0; i < iterations; ++i) {
+    if (i % 2 == 0)
+      c = c + 1.e6;
+    else
+      c = c - 1.e-6;
+  }
+  return c;
+  // CHECK: #0 {{.*}} in Case4{{.*}}[[@LINE-1]]
+}
+
+int main() {
+  for (int iterations = 1; iterations <= 100000000; iterations *= 10) {
+    printf("%10i iterations: %f\n", iterations, Case4(-5.e13, iterations));
+  }
+  return 0;
+}
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -658,6 +658,7 @@
   ATTR_KIND_NO_CALLBACK = 71,
   ATTR_KIND_HOT = 72,
   ATTR_KIND_NO_PROFILE = 73,
+  ATTR_KIND_SANITIZE_NUMERICAL_STABILITY = 74,
 };
 
 enum ComdatSelectionKindCodes {
diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td
--- a/llvm/include/llvm/IR/Attributes.td
+++ b/llvm/include/llvm/IR/Attributes.td
@@ -230,6 +230,9 @@
 /// MemTagSanitizer is on.
 def SanitizeMemTag : EnumAttr<"sanitize_memtag">;
 
+/// NumericalStabilitySanitizer is on.
+def SanitizeNumericalStability : EnumAttr<"sanitize_numericalstability">;
+
 /// Speculative Load Hardening is enabled.
 ///
 /// Note that this uses the default compatibility (always compatible during
@@ -285,6 +288,7 @@
 def : CompatRule<"isEqual">;
 def : CompatRule<"isEqual">;
 def : CompatRule<"isEqual">;
+def : CompatRule<"isEqual<SanitizeNumericalStabilityAttr>">;
 def : CompatRule<"isEqual">;
 def : CompatRule<"isEqual">;
 def : CompatRule<"isEqual">;
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -432,6 +432,7 @@
 void initializeTargetPassConfigPass(PassRegistry&);
 void initializeTargetTransformInfoWrapperPassPass(PassRegistry&);
 void initializeThreadSanitizerLegacyPassPass(PassRegistry&);
+void initializeNumericalStabilitySanitizerLegacyPassPass(PassRegistry&);
 void initializeTwoAddressInstructionPassPass(PassRegistry&);
 void initializeTypeBasedAAWrapperPassPass(PassRegistry&);
 void initializeTypePromotionPass(PassRegistry&);
diff --git a/llvm/include/llvm/Transforms/Instrumentation/NumericalStabilitySanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/NumericalStabilitySanitizer.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Instrumentation/NumericalStabilitySanitizer.h
@@ -0,0 +1,40 @@
+//===- NumericalStabilitySanitizer.h - NSan Pass ---------------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the numerical stability sanitizer (nsan) pass.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_NUMERICALSTABILITYSANITIZER_H
+#define LLVM_TRANSFORMS_INSTRUMENTATION_NUMERICALSTABILITYSANITIZER_H
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+/// Inserts NumericalStabilitySanitizer instrumentation.
+FunctionPass *createNumericalStabilitySanitizerLegacyPassPass();
+
+/// A function pass for nsan instrumentation.
+///
+/// Instruments functions to duplicate floating point computations in a
+/// higher-precision type.
+/// This function pass inserts calls to runtime library functions. If the
+/// functions aren't declared yet, the pass inserts the declarations.
+struct NumericalStabilitySanitizerPass
+    : public PassInfoMixin<NumericalStabilitySanitizerPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+  static bool isRequired() { return true; }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_INSTRUMENTATION_NUMERICALSTABILITYSANITIZER_H
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -691,6 +691,7 @@
   KEYWORD(sanitize_address);
   KEYWORD(sanitize_hwaddress);
   KEYWORD(sanitize_memtag);
+  KEYWORD(sanitize_numericalstability);
   KEYWORD(sanitize_thread);
   KEYWORD(sanitize_memory);
   KEYWORD(speculative_load_hardening);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -1395,6 +1395,8 @@
   case lltok::kw_sanitize_hwaddress:
     B.addAttribute(Attribute::SanitizeHWAddress); break;
   case lltok::kw_sanitize_memtag:
    B.addAttribute(Attribute::SanitizeMemTag); break;
+  case lltok::kw_sanitize_numericalstability:
+    B.addAttribute(Attribute::SanitizeNumericalStability); break;
   case lltok::kw_sanitize_thread:
     B.addAttribute(Attribute::SanitizeThread); break;
   case lltok::kw_sanitize_memory:
@@ -1791,6 +1793,7 @@
   case lltok::kw_sanitize_address:
   case lltok::kw_sanitize_hwaddress:
   case lltok::kw_sanitize_memtag:
+  case lltok::kw_sanitize_numericalstability:
   case lltok::kw_sanitize_memory:
   case lltok::kw_sanitize_thread:
   case lltok::kw_speculative_load_hardening:
@@ -1900,6 +1903,7 @@
   case lltok::kw_sanitize_address:
   case lltok::kw_sanitize_hwaddress:
   case lltok::kw_sanitize_memtag:
+  case lltok::kw_sanitize_numericalstability:
   case lltok::kw_sanitize_memory:
   case lltok::kw_sanitize_thread:
   case lltok::kw_speculative_load_hardening:
diff --git a/llvm/lib/AsmParser/LLToken.h b/llvm/lib/AsmParser/LLToken.h
--- a/llvm/lib/AsmParser/LLToken.h
+++ b/llvm/lib/AsmParser/LLToken.h
@@ -182,6 +182,7 @@
   kw_sanitize_address,
   kw_sanitize_hwaddress,
   kw_sanitize_memtag,
+  kw_sanitize_numericalstability,
   kw_builtin,
   kw_byval,
   kw_inalloca,
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -1546,6 +1546,8 @@
     return Attribute::MustProgress;
   case bitc::ATTR_KIND_HOT:
     return Attribute::Hot;
+  case bitc::ATTR_KIND_SANITIZE_NUMERICAL_STABILITY:
+    return Attribute::SanitizeNumericalStability;
   }
 }
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -754,6 +754,8 @@
     return bitc::ATTR_KIND_BYREF;
   case Attribute::MustProgress:
     return bitc::ATTR_KIND_MUSTPROGRESS;
+  case Attribute::SanitizeNumericalStability:
+    return bitc::ATTR_KIND_SANITIZE_NUMERICAL_STABILITY;
   case Attribute::EndAttrKinds:
     llvm_unreachable("Can not encode end-attribute kinds marker.");
   case Attribute::None:
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -337,6 +337,8 @@
     return "sanitize_hwaddress";
   if (hasAttribute(Attribute::SanitizeMemTag))
     return "sanitize_memtag";
+  if (hasAttribute(Attribute::SanitizeNumericalStability))
+    return "sanitize_numericalstability";
   if (hasAttribute(Attribute::AlwaysInline))
     return "alwaysinline";
   if (hasAttribute(Attribute::ArgMemOnly))
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -1629,6 +1629,7 @@
   case Attribute::SanitizeMemTag:
   case Attribute::SanitizeThread:
   case Attribute::SanitizeMemory:
+  case Attribute::SanitizeNumericalStability:
   case Attribute::MinSize:
   case Attribute::NoDuplicate:
   case Attribute::Builtin:
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -131,6 +131,7 @@
 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
 #include "llvm/Transforms/Instrumentation/MemProfiler.h"
 #include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
+#include "llvm/Transforms/Instrumentation/NumericalStabilitySanitizer.h"
 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
 #include "llvm/Transforms/Instrumentation/PoisonChecking.h"
 #include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -114,6 +114,7 @@
 MODULE_PASS("dfsan", DataFlowSanitizerPass())
 MODULE_PASS("asan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/false, false, true, false))
 MODULE_PASS("msan-module", MemorySanitizerPass({}))
+MODULE_PASS("nsan-module", NumericalStabilitySanitizerPass())
 MODULE_PASS("tsan-module", ThreadSanitizerPass())
 MODULE_PASS("kasan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/true, false, true, false))
 MODULE_PASS("sancov-module", ModuleSanitizerCoveragePass())
@@ -328,6 +329,7 @@
 FUNCTION_PASS("kasan", AddressSanitizerPass(true, false, false))
 FUNCTION_PASS("msan", MemorySanitizerPass({}))
 FUNCTION_PASS("kmsan", MemorySanitizerPass({0, false, /*Kernel=*/true}))
+FUNCTION_PASS("nsan", NumericalStabilitySanitizerPass())
 FUNCTION_PASS("tsan", ThreadSanitizerPass())
 FUNCTION_PASS("memprof", MemProfilerPass())
 #undef FUNCTION_PASS
diff --git a/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp
--- a/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp
@@ -65,6 +65,7 @@
       .Case("sanitize_address", Attribute::SanitizeAddress)
       .Case("sanitize_hwaddress", Attribute::SanitizeHWAddress)
       .Case("sanitize_memory", Attribute::SanitizeMemory)
+      .Case("sanitize_numericalstability", Attribute::SanitizeNumericalStability)
      .Case("sanitize_thread", Attribute::SanitizeThread)
      .Case("sanitize_memtag", Attribute::SanitizeMemTag)
      .Case("speculative_load_hardening", Attribute::SpeculativeLoadHardening)
diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -7,6 +7,7 @@
   GCOVProfiling.cpp
   MemProfiler.cpp
   MemorySanitizer.cpp
+  NumericalStabilitySanitizer.cpp
   IndirectCallPromotion.cpp
   Instrumentation.cpp
   InstrOrderFile.cpp
diff --git a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
--- a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -108,6 +108,7 @@
   initializeMemorySanitizerLegacyPassPass(Registry);
   initializeHWAddressSanitizerLegacyPassPass(Registry);
   initializeThreadSanitizerLegacyPassPass(Registry);
+  initializeNumericalStabilitySanitizerLegacyPassPass(Registry);
   initializeModuleSanitizerCoverageLegacyPassPass(Registry);
   initializeDataFlowSanitizerLegacyPassPass(Registry);
 }
diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
@@ -0,0 +1,2270 @@
+//===-- NumericalStabilitySanitizer.cpp -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of NumericalStabilitySanitizer.
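+//
+// In short, the pass duplicates every floating-point computation in a
+// higher-precision shadow type and emits calls into the nsan runtime to
+// compare the native and shadow results (a rough summary; see the pass
+// comment in NumericalStabilitySanitizer.h and the tests above for details).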
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/NumericalStabilitySanitizer.h"
+
+#include <cstdint>
+#include <unordered_map>
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/EscapeEnumerator.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nsan"
+
+STATISTIC(NumInstrumentedFTLoads,
+          "Number of instrumented floating-point loads");
+
+STATISTIC(NumInstrumentedFTCalls,
+          "Number of instrumented floating-point calls");
+STATISTIC(NumInstrumentedFTRets,
+          "Number of instrumented floating-point returns");
+STATISTIC(NumInstrumentedFTStores,
+          "Number of instrumented floating-point stores");
+STATISTIC(NumInstrumentedNonFTStores,
+          "Number of instrumented non floating-point stores");
+STATISTIC(
+    NumInstrumentedNonFTMemcpyStores,
+    "Number of instrumented non floating-point stores with memcpy semantics");
+STATISTIC(NumInstrumentedFCmp, "Number of instrumented fcmps");
+
+// Using smaller shadow types can help improve speed. For example, `dlq`
+// ranges from 3x slower to 5x faster than `dqq` in opt mode, and is 2-6x
+// faster in dbg mode.
+static cl::opt<std::string> ClShadowMapping(
+    "nsan-shadow-type-mapping", cl::init("dqq"),
+    cl::desc("One shadow type id for each of `float`, `double`, `long double`. "
+             "`d`,`l`,`q`,`e` mean double, x86_fp80, fp128 (quad) and "
+             "ppc_fp128 (extended double) respectively. The default is to "
+             "shadow `float` as `double`, and `double` and `x86_fp80` as "
+             "`fp128`"),
+    cl::Hidden);
+
+static cl::opt<bool>
+    ClInstrumentFCmp("nsan-instrument-fcmp", cl::init(true),
+                     cl::desc("Instrument floating-point comparisons"),
+                     cl::Hidden);
+
+static cl::opt<bool> ClTruncateFCmpEq(
+    "nsan-truncate-fcmp-eq", cl::init(true),
+    cl::desc(
+        "This flag controls the behaviour of fcmp equality comparisons: "
+        "for equality comparisons such as `x == 0.0f`, we can perform the "
+        "shadow check in the shadow domain (`(x_shadow == 0.0) == (x == 0.0f)`) "
+        "or in the app domain (`(trunc(x_shadow) == 0.0f) == (x == 0.0f)`). "
+        "This helps catch the case when `x_shadow` is accurate enough (and "
+        "therefore close enough to zero) that `trunc(x_shadow)` is zero even "
+        "though both `x` and `x_shadow` are not."),
+    cl::Hidden);
+
+// When there is external, uninstrumented code writing to memory, the shadow
+// memory can get out of sync with the application memory. Enabling this flag
+// emits consistency checks for loads to catch this situation.
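+// (A typical example would be a precompiled, uninstrumented math library
+// writing doubles into a buffer owned by instrumented code; see
+// compiler-rt/test/nsan/uninstrumented_write.cc above.)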
+// When everything is instrumented, this is not strictly necessary because any
+// load should have a corresponding store, but it can help debug cases when the
+// framework did a bad job at tracking shadow memory modifications by failing on
+// load rather than store.
+// FIXME: provide a way to resume computations from the FT value when the load
+// is inconsistent. This ensures that further computations are not polluted.
+static cl::opt<bool> ClCheckLoads("nsan-check-loads", cl::init(false),
+                                  cl::desc("Check floating-point loads"),
+                                  cl::Hidden);
+
+static const char *const kNsanModuleCtorName = "nsan.module_ctor";
+static const char *const kNsanInitName = "__nsan_init";
+
+// The following values must be kept in sync with the runtime.
+static constexpr const int kShadowScale = 2;
+static constexpr const int kMaxVectorWidth = 8;
+static constexpr const int kMaxNumArgs = 128;
+static constexpr const int kMaxShadowTypeSizeBytes = 16; // fp128
+
+namespace {
+
+// Defines the characteristics (type id, type, and floating-point semantics) of
+// each possible shadow type.
+class ShadowTypeConfig {
+public:
+  static std::unique_ptr<ShadowTypeConfig> fromNsanTypeId(char TypeId);
+
+  // The floating-point semantics of the shadow type.
+  virtual const fltSemantics &semantics() const = 0;
+
+  // The LLVM Type corresponding to the shadow type.
+  virtual Type *getType(LLVMContext &Context) const = 0;
+
+  // The nsan type id of the shadow type (`d`, `l`, `q`, ...).
+  virtual char getNsanTypeId() const = 0;
+
+  virtual ~ShadowTypeConfig() {}
+};
+
+template <char NsanTypeId>
+class ShadowTypeConfigImpl : public ShadowTypeConfig {
+public:
+  char getNsanTypeId() const override { return NsanTypeId; }
+  static constexpr const char kNsanTypeId = NsanTypeId;
+};
+
+// `double` (`d`) shadow type.
+class F64ShadowConfig : public ShadowTypeConfigImpl<'d'> {
+  const fltSemantics &semantics() const override {
+    return APFloat::IEEEdouble();
+  }
+  Type *getType(LLVMContext &Context) const override {
+    return Type::getDoubleTy(Context);
+  }
+};
+
+// `x86_fp80` (`l`) shadow type: X86 long double.
+class F80ShadowConfig : public ShadowTypeConfigImpl<'l'> {
+  const fltSemantics &semantics() const override {
+    return APFloat::x87DoubleExtended();
+  }
+  Type *getType(LLVMContext &Context) const override {
+    return Type::getX86_FP80Ty(Context);
+  }
+};
+
+// `fp128` (`q`) shadow type.
+class F128ShadowConfig : public ShadowTypeConfigImpl<'q'> {
+  const fltSemantics &semantics() const override { return APFloat::IEEEquad(); }
+  Type *getType(LLVMContext &Context) const override {
+    return Type::getFP128Ty(Context);
+  }
+};
+
+// `ppc_fp128` (`e`) shadow type: IBM extended double with 106 bits of mantissa.
+class PPC128ShadowConfig : public ShadowTypeConfigImpl<'e'> {
+  const fltSemantics &semantics() const override {
+    return APFloat::PPCDoubleDouble();
+  }
+  Type *getType(LLVMContext &Context) const override {
+    return Type::getPPC_FP128Ty(Context);
+  }
+};
+
+// Creates a ShadowTypeConfig given its type id.
+std::unique_ptr<ShadowTypeConfig>
+ShadowTypeConfig::fromNsanTypeId(const char TypeId) {
+  switch (TypeId) {
+  case F64ShadowConfig::kNsanTypeId:
+    return std::make_unique<F64ShadowConfig>();
+  case F80ShadowConfig::kNsanTypeId:
+    return std::make_unique<F80ShadowConfig>();
+  case F128ShadowConfig::kNsanTypeId:
+    return std::make_unique<F128ShadowConfig>();
+  case PPC128ShadowConfig::kNsanTypeId:
+    return std::make_unique<PPC128ShadowConfig>();
+  }
+  errs() << "nsan: invalid shadow type id '" << TypeId << "'\n";
+  return nullptr;
+}
+
+// An enum corresponding to shadow value types. Used as indices in arrays, so
+// not an `enum class`.
+enum FTValueType { kFloat, kDouble, kLongDouble, kNumValueTypes };
+
+static FTValueType semanticsToFTValueType(const fltSemantics &Sem) {
+  if (&Sem == &APFloat::IEEEsingle()) {
+    return kFloat;
+  } else if (&Sem == &APFloat::IEEEdouble()) {
+    return kDouble;
+  } else if (&Sem == &APFloat::x87DoubleExtended()) {
+    return kLongDouble;
+  }
+  llvm_unreachable("semantics are not one of the handled types");
+}
+
+// If `FT` corresponds to a primitive FTValueType, return it.
+static Optional<FTValueType> ftValueTypeFromType(Type *FT) {
+  if (FT->isFloatTy())
+    return kFloat;
+  if (FT->isDoubleTy())
+    return kDouble;
+  if (FT->isX86_FP80Ty())
+    return kLongDouble;
+  return {};
+}
+
+// Returns the LLVM type for an FTValueType.
+static Type *typeFromFTValueType(FTValueType VT, LLVMContext &Context) {
+  switch (VT) {
+  case kFloat:
+    return Type::getFloatTy(Context);
+  case kDouble:
+    return Type::getDoubleTy(Context);
+  case kLongDouble:
+    return Type::getX86_FP80Ty(Context);
+  case kNumValueTypes:
+    return nullptr;
+  }
+}
+
+// Returns the type name for an FTValueType.
+static const char *typeNameFromFTValueType(FTValueType VT) {
+  switch (VT) {
+  case kFloat:
+    return "float";
+  case kDouble:
+    return "double";
+  case kLongDouble:
+    return "longdouble";
+  case kNumValueTypes:
+    return nullptr;
+  }
+}
+
+// A specific mapping configuration of application type to shadow type for nsan
+// (see the -nsan-shadow-type-mapping flag).
+class MappingConfig {
+public:
+  bool initialize(LLVMContext *C) {
+    if (ClShadowMapping.size() != 3) {
+      errs() << "Invalid nsan mapping: " << ClShadowMapping << "\n";
+      return false;
+    }
+    Context = C;
+    unsigned ShadowTypeSizeBits[kNumValueTypes];
+    for (int VT = 0; VT < kNumValueTypes; ++VT) {
+      auto Config = ShadowTypeConfig::fromNsanTypeId(ClShadowMapping[VT]);
+      if (Config == nullptr)
+        return false;
+      const unsigned AppTypeSize =
+          typeFromFTValueType(static_cast<FTValueType>(VT), *C)
+              ->getScalarSizeInBits();
+      const unsigned ShadowTypeSize =
+          Config->getType(*C)->getScalarSizeInBits();
+      // Check that the shadow type size is at most kShadowScale times the
+      // application type size, so that shadow memory computations are valid.
+      if (ShadowTypeSize > kShadowScale * AppTypeSize) {
+        errs() << "Invalid nsan mapping f" << AppTypeSize << "->f"
+               << ShadowTypeSize << ": The shadow type size should be at most "
+               << kShadowScale << " times the application type size\n";
+        return false;
+      }
+      ShadowTypeSizeBits[VT] = ShadowTypeSize;
+      Configs[VT] = std::move(Config);
+    }
+
+    // Check that the mapping is monotonic. This is required because if one
+    // does an fpextend of `float->long double` in application code, nsan is
+    // going to do an fpextend of `shadow(float) -> shadow(long double)` in
+    // shadow code. This will fail in `qql` mode, since nsan would be
+    // fpextending `f128 -> x86_fp80`, which is invalid.
+    // FIXME: Relax this.
+    if (ShadowTypeSizeBits[kFloat] > ShadowTypeSizeBits[kDouble] ||
+        ShadowTypeSizeBits[kDouble] > ShadowTypeSizeBits[kLongDouble]) {
+      errs() << "Invalid nsan mapping: { float->f"
+             << ShadowTypeSizeBits[kFloat] << "; double->f"
+             << ShadowTypeSizeBits[kDouble] << "; long double->f"
+             << ShadowTypeSizeBits[kLongDouble] << " }\n";
+      return false;
+    }
+    return true;
+  }
+
+  const ShadowTypeConfig &byValueType(FTValueType VT) const {
+    assert(VT < FTValueType::kNumValueTypes && "invalid value type");
+    return *Configs[VT];
+  }
+
+  const ShadowTypeConfig &bySemantics(const fltSemantics &Sem) const {
+    return byValueType(semanticsToFTValueType(Sem));
+  }
+
+  // Returns the extended shadow type for a given application type.
+  Type *getExtendedFPType(Type *FT) const {
+    if (const auto VT = ftValueTypeFromType(FT))
+      return Configs[*VT]->getType(*Context);
+    if (FT->isVectorTy()) {
+      auto *VecTy = cast<VectorType>(FT);
+      Type *ExtendedScalar = getExtendedFPType(VecTy->getElementType());
+      return ExtendedScalar
+                 ? VectorType::get(ExtendedScalar, VecTy->getElementCount())
+                 : nullptr;
+    }
+    return nullptr;
+  }
+
+private:
+  LLVMContext *Context = nullptr;
+  std::unique_ptr<ShadowTypeConfig> Configs[FTValueType::kNumValueTypes];
+};
+
+// The memory extents of a type specify how many elements of a given
+// FTValueType need to be stored when storing this type.
+struct MemoryExtents {
+  FTValueType ValueType;
+  uint64_t NumElts;
+};
+static MemoryExtents getMemoryExtentsOrDie(Type *FT) {
+  if (const auto VT = ftValueTypeFromType(FT))
+    return {*VT, 1};
+  if (FT->isVectorTy()) {
+    auto *VecTy = cast<VectorType>(FT);
+    const auto ScalarExtents = getMemoryExtentsOrDie(VecTy->getElementType());
+    return {ScalarExtents.ValueType,
+            ScalarExtents.NumElts * VecTy->getElementCount().getFixedValue()};
+  }
+  llvm_unreachable("invalid value type");
+}
+
+// The location of a check. Passed as parameters to runtime checking
+// functions.
+class CheckLoc {
+public:
+  // Creates a location that references an application memory location.
+  static CheckLoc makeStore(Value *Address) {
+    CheckLoc Result(kStore);
+    Result.Address = Address;
+    return Result;
+  }
+  static CheckLoc makeLoad(Value *Address) {
+    CheckLoc Result(kLoad);
+    Result.Address = Address;
+    return Result;
+  }
+
+  // Creates a location that references an argument, given by id.
+  static CheckLoc makeArg(int ArgId) {
+    CheckLoc Result(kArg);
+    Result.ArgId = ArgId;
+    return Result;
+  }
+
+  // Creates a location that references the return value of a function.
+  static CheckLoc makeRet() { return CheckLoc(kRet); }
+
+  // Creates a location that references a vector insert.
+  static CheckLoc makeInsert() { return CheckLoc(kInsert); }
+
+  // Returns the CheckType of location this refers to, as an integer-typed
+  // LLVM IR value.
+  Value *getType(LLVMContext &C) const {
+    return ConstantInt::get(Type::getInt32Ty(C), static_cast<int>(CheckTy));
+  }
+
+  // Returns a CheckType-specific value representing details of the location
+  // (e.g. application address for loads or stores), as an `IntptrTy`-typed
+  // LLVM IR value.
+  Value *getValue(Type *IntptrTy, IRBuilder<> &Builder) const {
+    switch (CheckTy) {
+    case kUnknown:
+      llvm_unreachable("unknown type");
+    case kRet:
+    case kInsert:
+      return ConstantInt::get(IntptrTy, 0);
+    case kArg:
+      return ConstantInt::get(IntptrTy, ArgId);
+    case kLoad:
+    case kStore:
+      return Builder.CreatePtrToInt(Address, IntptrTy);
+    }
+  }
+
+private:
+  // Must be kept in sync with the runtime.
+  enum CheckType {
+    kUnknown = 0,
+    kRet,
+    kArg,
+    kLoad,
+    kStore,
+    kInsert,
+  };
+  explicit CheckLoc(CheckType CheckTy) : CheckTy(CheckTy) {}
+
+  const CheckType CheckTy;
+  Value *Address = nullptr;
+  int ArgId = -1;
+};
+
+// A map of LLVM IR values to shadow LLVM IR values.
+class ValueToShadowMap {
+public:
+  explicit ValueToShadowMap(MappingConfig *Config) : Config(Config) {}
+
+  // Sets the shadow value for a value. Asserts that the value does not
+  // already have a shadow.
+  void setShadow(Value *V, Value *Shadow) {
+    assert(V);
+    assert(Shadow);
+    const bool Inserted = Map.emplace(V, Shadow).second;
+#ifdef LLVM_ENABLE_DUMP
+    if (!Inserted) {
+      if (const auto *const I = dyn_cast<Instruction>(V))
+        I->getParent()->getParent()->dump();
+      errs() << "duplicate shadow (" << V << "): ";
+      V->dump();
+    }
+#endif
+    assert(Inserted && "duplicate shadow");
+    (void)Inserted;
+  }
+
+  // Returns true if the value already has a shadow (including if the value
+  // is a constant). If true, calling getShadow() is valid.
+  bool hasShadow(Value *V) const {
+    return isa<Constant>(V) || (Map.find(V) != Map.end());
+  }
+
+  // Returns the shadow value for a given value. Asserts that the value has
+  // a shadow value. Lazily creates shadows for constant values.
+  Value *getShadow(Value *V) const {
+    assert(V);
+    if (Constant *C = dyn_cast<Constant>(V))
+      return getShadowConstant(C);
+    const auto ShadowValIt = Map.find(V);
+    assert(ShadowValIt != Map.end() && "shadow val does not exist");
+    assert(ShadowValIt->second && "shadow val is null");
+    return ShadowValIt->second;
+  }
+
+  bool empty() const { return Map.empty(); }
+
+private:
+  // Extends a constant application value to its shadow counterpart.
+  APFloat extendConstantFP(APFloat CV) const {
+    bool LosesInfo = false;
+    CV.convert(Config->bySemantics(CV.getSemantics()).semantics(),
+               APFloatBase::rmTowardZero, &LosesInfo);
+    return CV;
+  }
+
+  // Returns the shadow constant for the given application constant.
+  Constant *getShadowConstant(Constant *C) const {
+    if (UndefValue *U = dyn_cast<UndefValue>(C)) {
+      return UndefValue::get(Config->getExtendedFPType(U->getType()));
+    }
+    if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
+      // Floating-point constants.
+      return ConstantFP::get(Config->getExtendedFPType(CFP->getType()),
+                             extendConstantFP(CFP->getValueAPF()));
+    }
+    // Vector, array, or aggregate constants.
+    if (C->getType()->isVectorTy()) {
+      SmallVector<Constant *, 8> Elements;
+      for (int I = 0, E = cast<VectorType>(C->getType())
+                              ->getElementCount()
+                              .getFixedValue();
+           I < E; ++I)
+        Elements.push_back(getShadowConstant(C->getAggregateElement(I)));
+      return ConstantVector::get(Elements);
+    }
+    llvm_unreachable("unimplemented");
+  }
+
+  MappingConfig *const Config;
+  std::unordered_map<Value *, Value *> Map;
+};
+
+/// Instantiating NumericalStabilitySanitizer inserts the nsan runtime library
+/// API function declarations into the module if they don't exist already.
+/// Instantiating ensures the __nsan_init function is in the list of global
+/// constructors for the module.
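+///
+/// Broadly, instrumentation proceeds in two phases (see sanitizeFunction):
+/// extended-precision shadow values are first created for every FT value
+/// defined by the function (createShadowArguments, maybeCreateShadowValue),
+/// then shadows are propagated to memory and consistency checks are emitted
+/// where values escape (propagateShadowValues): on returns, calls, stores,
+/// and fcmps.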
+class NumericalStabilitySanitizer {
+public:
+  bool sanitizeFunction(Function &F, const TargetLibraryInfo &TLI);
+
+private:
+  void initialize(Module &M);
+  bool instrumentMemIntrinsic(MemIntrinsic *MI);
+  void maybeAddSuffixForNsanInterface(CallBase *CI);
+  bool addrPointsToConstantData(Value *Addr);
+  void maybeCreateShadowValue(Instruction &Root, const TargetLibraryInfo &TLI,
+                              ValueToShadowMap &Map);
+  Value *createShadowValueWithOperandsAvailable(Instruction &Inst,
+                                                const TargetLibraryInfo &TLI,
+                                                const ValueToShadowMap &Map);
+  PHINode *maybeCreateShadowPhi(PHINode &Phi, const TargetLibraryInfo &TLI);
+  void createShadowArguments(Function &F, const TargetLibraryInfo &TLI,
+                             ValueToShadowMap &Map);
+
+  void populateShadowStack(CallBase &CI, const TargetLibraryInfo &TLI,
+                           const ValueToShadowMap &Map);
+
+  void propagateShadowValues(Instruction &Inst, const TargetLibraryInfo &TLI,
+                             const ValueToShadowMap &Map);
+  Value *emitCheck(Value *V, Value *ShadowV, IRBuilder<> &Builder,
+                   CheckLoc Loc);
+  Value *emitCheckInternal(Value *V, Value *ShadowV, IRBuilder<> &Builder,
+                           CheckLoc Loc);
+  void emitFCmpCheck(FCmpInst &FCmp, const ValueToShadowMap &Map);
+  Value *getCalleeAddress(CallBase &Call, IRBuilder<> &Builder) const;
+
+  // Value creation handlers.
+  Value *handleLoad(LoadInst &Load, Type *VT, Type *ExtendedVT);
+  Value *handleTrunc(FPTruncInst &Trunc, Type *VT, Type *ExtendedVT,
+                     const ValueToShadowMap &Map);
+  Value *handleExt(FPExtInst &Ext, Type *VT, Type *ExtendedVT,
+                   const ValueToShadowMap &Map);
+  Value *handleCallBase(CallBase &Call, Type *VT, Type *ExtendedVT,
+                        const TargetLibraryInfo &TLI,
+                        const ValueToShadowMap &Map, IRBuilder<> &Builder);
+  Value *maybeHandleKnownCallBase(CallBase &Call, Type *VT, Type *ExtendedVT,
+                                  const TargetLibraryInfo &TLI,
+                                  const ValueToShadowMap &Map,
+                                  IRBuilder<> &Builder);
+
+  // Value propagation handlers.
+  void propagateFTStore(StoreInst &Store, Type *VT, Type *ExtendedVT,
+                        const ValueToShadowMap &Map);
+  void propagateNonFTStore(StoreInst &Store, Type *VT,
+                           const ValueToShadowMap &Map);
+
+  MappingConfig Config;
+  LLVMContext *Context = nullptr;
+  IntegerType *IntptrTy = nullptr;
+  FunctionCallee NsanGetShadowPtrForStore[FTValueType::kNumValueTypes];
+  FunctionCallee NsanGetShadowPtrForLoad[FTValueType::kNumValueTypes];
+  FunctionCallee NsanCheckValue[FTValueType::kNumValueTypes];
+  FunctionCallee NsanFCmpFail[FTValueType::kNumValueTypes];
+  FunctionCallee NsanCopyValues;
+  FunctionCallee NsanSetValueUnknown;
+  FunctionCallee NsanGetRawShadowTypePtr;
+  FunctionCallee NsanGetRawShadowPtr;
+  GlobalValue *NsanShadowRetTag;
+  GlobalValue *NsanShadowRetPtr;
+  GlobalValue *NsanShadowArgsTag;
+  GlobalValue *NsanShadowArgsPtr;
+};
+
+struct NumericalStabilitySanitizerLegacyPass : FunctionPass {
+  NumericalStabilitySanitizerLegacyPass() : FunctionPass(ID) {}
+  StringRef getPassName() const override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnFunction(Function &F) override;
+  bool doInitialization(Module &M) override;
+  static char ID;
+
+private:
+  Optional<NumericalStabilitySanitizer> Nsan;
+};
+
+void insertModuleCtor(Module &M) {
+  getOrCreateSanitizerCtorAndInitFunctions(
+      M, kNsanModuleCtorName, kNsanInitName, /*InitArgTypes=*/{},
+      /*InitArgs=*/{},
+      // This callback is invoked when the functions are created the first
+      // time. Hook them into the global ctors list in that case:
+      [&](Function *Ctor, FunctionCallee) { appendToGlobalCtors(M, Ctor, 0); });
+}
+
+} // end anonymous namespace
+
+PreservedAnalyses
+NumericalStabilitySanitizerPass::run(Function &F,
+                                     FunctionAnalysisManager &FAM) {
+  NumericalStabilitySanitizer Nsan;
+  if (Nsan.sanitizeFunction(F, FAM.getResult<TargetLibraryAnalysis>(F)))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+PreservedAnalyses
+NumericalStabilitySanitizerPass::run(Module &M, ModuleAnalysisManager &MAM) {
+  insertModuleCtor(M);
+  return PreservedAnalyses::none();
+}
+
+char NumericalStabilitySanitizerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(NumericalStabilitySanitizerLegacyPass, "nsan",
+                      "NumericalStabilitySanitizer: detects numerical errors.",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(NumericalStabilitySanitizerLegacyPass, "nsan",
+                    "NumericalStabilitySanitizer: detects numerical errors.",
+                    false, false)
+
+StringRef NumericalStabilitySanitizerLegacyPass::getPassName() const {
+  return "NumericalStabilitySanitizerLegacyPass";
+}
+
+void NumericalStabilitySanitizerLegacyPass::getAnalysisUsage(
+    AnalysisUsage &AU) const {
+  AU.addRequired<TargetLibraryInfoWrapperPass>();
+}
+
+bool NumericalStabilitySanitizerLegacyPass::doInitialization(Module &M) {
+  insertModuleCtor(M);
+  Nsan.emplace();
+  return true;
+}
+
+bool NumericalStabilitySanitizerLegacyPass::runOnFunction(Function &F) {
+  auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+  Nsan->sanitizeFunction(F, TLI);
+  return true;
+}
+
+FunctionPass *llvm::createNumericalStabilitySanitizerLegacyPassPass() {
+  return new NumericalStabilitySanitizerLegacyPass();
+}
+
+static GlobalValue *createThreadLocalGV(const char *Name, Module &M,
+                                        Type *Ty) {
+  return dyn_cast<GlobalValue>(M.getOrInsertGlobal(Name, Ty, [&M, Ty, Name] {
+    return new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage,
+                              nullptr, Name, nullptr,
+                              GlobalVariable::InitialExecTLSModel);
+  }));
+}
+
+void NumericalStabilitySanitizer::initialize(Module &M) {
+  const DataLayout &DL = M.getDataLayout();
+  Context = &M.getContext();
+  IntptrTy = DL.getIntPtrType(*Context);
+  Type *Int8PtrTy = Type::getInt8PtrTy(*Context);
+  Type *Int32Ty = Type::getInt32Ty(*Context);
+  Type *Int1Ty = Type::getInt1Ty(*Context);
+  Type *VoidTy = Type::getVoidTy(*Context);
+
+  AttributeList Attr;
+  Attr = Attr.addAttribute(*Context, AttributeList::FunctionIndex,
+                           Attribute::NoUnwind);
+  // Initialize the runtime values (functions and global variables).
+  for (int I = 0; I < kNumValueTypes; ++I) {
+    const FTValueType VT = static_cast<FTValueType>(I);
+    const char *const VTName = typeNameFromFTValueType(VT);
+    Type *const VTTy = typeFromFTValueType(VT, *Context);
+
+    // Load/store.
+    const std::string GetterPrefix =
+        std::string("__nsan_get_shadow_ptr_for_") + VTName;
+    NsanGetShadowPtrForStore[VT] = M.getOrInsertFunction(
+        GetterPrefix + "_store", Attr, Int8PtrTy, Int8PtrTy, IntptrTy);
+    NsanGetShadowPtrForLoad[VT] = M.getOrInsertFunction(
+        GetterPrefix + "_load", Attr, Int8PtrTy, Int8PtrTy, IntptrTy);
+
+    // Check.
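+    // For instance, when floats are shadowed by doubles (type id `d`), this
+    // declares `__nsan_internal_check_float_d`, whose i32 return value is a
+    // ContinuationType (see emitCheckInternal below).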
+    const auto &ShadowConfig = Config.byValueType(VT);
+    Type *ShadowTy = ShadowConfig.getType(*Context);
+    NsanCheckValue[VT] =
+        M.getOrInsertFunction(std::string("__nsan_internal_check_") + VTName +
+                                  "_" + ShadowConfig.getNsanTypeId(),
+                              Attr, Int32Ty, VTTy, ShadowTy, Int32Ty,
+                              IntptrTy);
+    NsanFCmpFail[VT] = M.getOrInsertFunction(
+        std::string("__nsan_fcmp_fail_") + VTName + "_" +
+            ShadowConfig.getNsanTypeId(),
+        Attr, VoidTy, VTTy, VTTy, ShadowTy, ShadowTy, Int32Ty, Int1Ty,
+        Int1Ty);
+  }
+
+  NsanCopyValues = M.getOrInsertFunction("__nsan_copy_values", Attr, VoidTy,
+                                         Int8PtrTy, Int8PtrTy, IntptrTy);
+  NsanSetValueUnknown = M.getOrInsertFunction("__nsan_set_value_unknown",
+                                              Attr, VoidTy, Int8PtrTy,
+                                              IntptrTy);
+
+  // FIXME: Add attributes nofree, nosync, readnone, readonly.
+  NsanGetRawShadowTypePtr = M.getOrInsertFunction(
+      "__nsan_internal_get_raw_shadow_type_ptr", Attr, Int8PtrTy, Int8PtrTy);
+  NsanGetRawShadowPtr = M.getOrInsertFunction(
+      "__nsan_internal_get_raw_shadow_ptr", Attr, Int8PtrTy, Int8PtrTy);
+
+  NsanShadowRetTag = createThreadLocalGV("__nsan_shadow_ret_tag", M, IntptrTy);
+  NsanShadowRetPtr = createThreadLocalGV(
+      "__nsan_shadow_ret_ptr", M,
+      ArrayType::get(Type::getInt8Ty(*Context),
+                     kMaxVectorWidth * kMaxShadowTypeSizeBytes));
+
+  NsanShadowArgsTag =
+      createThreadLocalGV("__nsan_shadow_args_tag", M, IntptrTy);
+  NsanShadowArgsPtr = createThreadLocalGV(
+      "__nsan_shadow_args_ptr", M,
+      ArrayType::get(Type::getInt8Ty(*Context),
+                     kMaxVectorWidth * kMaxNumArgs * kMaxShadowTypeSizeBytes));
+}
+
+// Returns true if the given LLVM Value points to constant data (typically, a
+// global variable reference).
+bool NumericalStabilitySanitizer::addrPointsToConstantData(Value *Addr) {
+  // If this is a GEP, just analyze its pointer operand.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr))
+    Addr = GEP->getPointerOperand();
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+    return GV->isConstant();
+  }
+  return false;
+}
+
+// This instruments the function entry to create shadow arguments.
+// Pseudocode:
+//   if (this_fn_ptr == __nsan_shadow_args_tag) {
+//     s(arg0) = LOAD(__nsan_shadow_args);
+//     s(arg1) = LOAD(__nsan_shadow_args + sizeof(arg0));
+//     ...
+//     __nsan_shadow_args_tag = 0;
+//   } else {
+//     s(arg0) = fext(arg0);
+//     s(arg1) = fext(arg1);
+//     ...
+//   }
+void NumericalStabilitySanitizer::createShadowArguments(
+    Function &F, const TargetLibraryInfo &TLI, ValueToShadowMap &Map) {
+  assert(!F.getIntrinsicID() && "found a definition of an intrinsic");
+
+  // Do not bother if there are no FP args.
+  if (all_of(F.args(), [this](const Argument &Arg) {
+        return Config.getExtendedFPType(Arg.getType()) == nullptr;
+      }))
+    return;
+
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHI());
+  // The function has shadow args if the shadow args tag matches the function
+  // address.
+  Value *HasShadowArgs = Builder.CreateICmpEQ(
+      Builder.CreateLoad(IntptrTy, NsanShadowArgsTag, /*isVolatile=*/false),
+      Builder.CreatePtrToInt(&F, IntptrTy));
+
+  unsigned ShadowArgsOffsetBytes = 0;
+  for (Argument &Arg : F.args()) {
+    Type *const VT = Arg.getType();
+    Type *const ExtendedVT = Config.getExtendedFPType(VT);
+    if (ExtendedVT == nullptr)
+      continue; // Not an FT value.
+    Value *Shadow = Builder.CreateSelect(
+        HasShadowArgs,
+        Builder.CreateAlignedLoad(
+            Builder.CreatePointerCast(
+                Builder.CreateConstGEP2_64(NsanShadowArgsPtr, 0,
+                                           ShadowArgsOffsetBytes),
+                ExtendedVT->getPointerTo()),
+            Align(1), /*isVolatile=*/false),
+        Builder.CreateCast(Instruction::FPExt, &Arg, ExtendedVT));
+    Map.setShadow(&Arg, Shadow);
+    TypeSize SlotSize = DL.getTypeStoreSize(ExtendedVT);
+    assert(!SlotSize.isScalable() && "unsupported");
+    ShadowArgsOffsetBytes += SlotSize.getFixedSize();
+  }
+  Builder.CreateStore(ConstantInt::get(IntptrTy, 0), NsanShadowArgsTag);
+}
+
+// Returns true if the instrumentation should emit code to check arguments
+// before a function call.
+static bool shouldCheckArgs(CallBase &CI, const TargetLibraryInfo &TLI) {
+  Function *Fn = CI.getCalledFunction();
+  if (Fn == nullptr)
+    return true; // Always check args of indirect calls.
+
+  // Never check nsan functions, the user called them for a reason.
+  if (Fn->getName().startswith("__nsan_"))
+    return false;
+
+  const auto ID = Fn->getIntrinsicID();
+  LibFunc LFunc = LibFunc::NumLibFuncs;
+  // Always check args of unknown functions.
+  if (ID == Intrinsic::ID() && !TLI.getLibFunc(*Fn, LFunc))
+    return true;
+
+  // Do not check args of an `fabs` call that is used for a comparison.
+  // This is typically used for `fabs(a-b) < tolerance`, where what matters
+  // is the result of the comparison, which is already caught by the fcmp
+  // checks.
+  if (ID == Intrinsic::fabs || LFunc == LibFunc_fabsf ||
+      LFunc == LibFunc_fabs || LFunc == LibFunc_fabsl)
+    for (const auto &U : CI.users())
+      if (isa<CmpInst>(U))
+        return false;
+
+  return true; // Default is check.
+}
+
+// Populates the shadow call stack (which contains shadow values for every
+// floating-point parameter to the function).
+void NumericalStabilitySanitizer::populateShadowStack(
+    CallBase &CI, const TargetLibraryInfo &TLI, const ValueToShadowMap &Map) {
+  // Do not create a shadow stack for inline asm.
+  if (CI.isInlineAsm())
+    return;
+
+  // Do not bother if there are no FP args.
+  if (all_of(CI.operands(), [this](const Value *Arg) {
+        return Config.getExtendedFPType(Arg->getType()) == nullptr;
+      }))
+    return;
+
+  IRBuilder<> Builder(&CI);
+  SmallVector<Value *, 8> ArgShadows;
+  const bool ShouldCheckArgs = shouldCheckArgs(CI, TLI);
+  int ArgId = -1;
+  for (Value *Arg : CI.operands()) {
+    ++ArgId;
+    if (Config.getExtendedFPType(Arg->getType()) == nullptr)
+      continue; // Not an FT value.
+    Value *ArgShadow = Map.getShadow(Arg);
+    ArgShadows.push_back(ShouldCheckArgs
+                             ? emitCheck(Arg, ArgShadow, Builder,
+                                         CheckLoc::makeArg(ArgId))
+                             : ArgShadow);
+  }
+
+  // Do not create shadow stacks for intrinsics/known lib funcs.
+  if (Function *Fn = CI.getCalledFunction()) {
+    LibFunc LFunc;
+    if (Fn->getIntrinsicID() || TLI.getLibFunc(*Fn, LFunc))
+      return;
+  }
+
+  const DataLayout &DL =
+      CI.getParent()->getParent()->getParent()->getDataLayout();
+  // Set the shadow stack tag.
+  Builder.CreateStore(getCalleeAddress(CI, Builder), NsanShadowArgsTag);
+  unsigned ShadowArgsOffsetBytes = 0;
+
+  unsigned ShadowArgId = 0;
+  for (const Value *Arg : CI.operands()) {
+    Type *const VT = Arg->getType();
+    Type *const ExtendedVT = Config.getExtendedFPType(VT);
+    if (ExtendedVT == nullptr)
+      continue; // Not an FT value.
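+    // Store the (checked) shadow for this argument into its slot in
+    // __nsan_shadow_args_ptr; createShadowArguments() in the callee reads it
+    // back.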
+    Builder.CreateAlignedStore(
+        ArgShadows[ShadowArgId++],
+        Builder.CreatePointerCast(
+            Builder.CreateConstGEP2_64(NsanShadowArgsPtr, 0,
+                                       ShadowArgsOffsetBytes),
+            ExtendedVT->getPointerTo()),
+        Align(1), /*isVolatile=*/false);
+    TypeSize SlotSize = DL.getTypeStoreSize(ExtendedVT);
+    assert(!SlotSize.isScalable() && "unsupported");
+    ShadowArgsOffsetBytes += SlotSize.getFixedSize();
+  }
+}
+
+// Internal part of emitCheck(). Returns a value that indicates whether
+// computation should continue with the shadow or resume by re-fextending the
+// value.
+enum ContinuationType { // Keep in sync with runtime.
+  kContinueWithShadow = 0,
+  kResumeFromValue = 1,
+};
+Value *NumericalStabilitySanitizer::emitCheckInternal(Value *V,
+                                                      Value *ShadowV,
+                                                      IRBuilder<> &Builder,
+                                                      CheckLoc Loc) {
+  // Do not emit checks for constant values, this is redundant.
+  if (isa<Constant>(V))
+    return ConstantInt::get(Builder.getInt32Ty(), kContinueWithShadow);
+
+  Type *const Ty = V->getType();
+  if (const auto VT = ftValueTypeFromType(Ty))
+    return Builder.CreateCall(
+        NsanCheckValue[*VT],
+        {V, ShadowV, Loc.getType(*Context), Loc.getValue(IntptrTy, Builder)});
+
+  if (Ty->isVectorTy()) {
+    auto *VecTy = cast<VectorType>(Ty);
+    Value *CheckResult = nullptr;
+    for (int I = 0, E = VecTy->getElementCount().getFixedValue(); I < E;
+         ++I) {
+      // We resume if any element resumes. Another option would be to create
+      // a vector shuffle with the array of ContinueWithShadow, but that is
+      // too complex.
+      Value *ComponentCheckResult = emitCheckInternal(
+          Builder.CreateExtractElement(V, I),
+          Builder.CreateExtractElement(ShadowV, I), Builder, Loc);
+      CheckResult = CheckResult
+                        ? Builder.CreateOr(CheckResult, ComponentCheckResult)
+                        : ComponentCheckResult;
+    }
+    return CheckResult;
+  }
+  if (Ty->isArrayTy()) {
+    Value *CheckResult = nullptr;
+    for (int I = 0, E = Ty->getArrayNumElements(); I < E; ++I) {
+      Value *ComponentCheckResult = emitCheckInternal(
+          Builder.CreateExtractValue(V, I),
+          Builder.CreateExtractValue(ShadowV, I), Builder, Loc);
+      CheckResult = CheckResult
+                        ? Builder.CreateOr(CheckResult, ComponentCheckResult)
+                        : ComponentCheckResult;
+    }
+    return CheckResult;
+  }
+  if (Ty->isStructTy()) {
+    Value *CheckResult = nullptr;
+    for (int I = 0, E = Ty->getStructNumElements(); I < E; ++I) {
+      if (Config.getExtendedFPType(Ty->getStructElementType(I)) == nullptr)
+        continue; // Only check FT values.
+      Value *ComponentCheckResult = emitCheckInternal(
+          Builder.CreateExtractValue(V, I),
+          Builder.CreateExtractValue(ShadowV, I), Builder, Loc);
+      CheckResult = CheckResult
+                        ? Builder.CreateOr(CheckResult, ComponentCheckResult)
+                        : ComponentCheckResult;
+    }
+    assert(CheckResult && "struct with no FT element");
+    return CheckResult;
+  }
+
+  llvm_unreachable("not implemented");
+}
+
+// Inserts a runtime check of V against its shadow value ShadowV.
+// We check values whenever they escape: on return, call, stores, and
+// insertvalue.
+// Returns the shadow value that should be used to continue the computations,
+// depending on the answer from the runtime.
+// FIXME: Should we check on select ? phi ?
+Value *NumericalStabilitySanitizer::emitCheck(Value *V, Value *ShadowV,
+                                              IRBuilder<> &Builder,
+                                              CheckLoc Loc) {
+  // Do not emit checks for constant values, this is redundant.
+  if (isa<Constant>(V))
+    return ShadowV;
+
+  Value *CheckResult = emitCheckInternal(V, ShadowV, Builder, Loc);
+  return Builder.CreateSelect(
+      Builder.CreateICmpEQ(CheckResult,
+                           ConstantInt::get(Builder.getInt32Ty(),
+                                            kResumeFromValue)),
+      Builder.CreateCast(Instruction::FPExt, V,
+                         Config.getExtendedFPType(V->getType())),
+      ShadowV);
+}
+
+static Instruction *getNextInstructionOrDie(Instruction &Inst) {
+  assert(Inst.getNextNode() && "instruction is a terminator");
+  return Inst.getNextNode();
+}
+
+// Inserts a check that fcmps on shadow values are consistent with fcmps on
+// base values.
+void NumericalStabilitySanitizer::emitFCmpCheck(FCmpInst &FCmp,
+                                                const ValueToShadowMap &Map) {
+  if (!ClInstrumentFCmp)
+    return;
+  Value *LHS = FCmp.getOperand(0);
+  if (Config.getExtendedFPType(LHS->getType()) == nullptr)
+    return;
+  Value *RHS = FCmp.getOperand(1);
+
+  // Split the basic block. On mismatch, we'll jump to the new basic block
+  // with a call to the runtime for error reporting.
+  BasicBlock *FCmpBB = FCmp.getParent();
+  BasicBlock *NextBB = FCmpBB->splitBasicBlock(getNextInstructionOrDie(FCmp));
+  // Remove the newly created terminator unconditional branch.
+  FCmpBB->getInstList().erase(FCmpBB->back());
+  BasicBlock *FailBB =
+      BasicBlock::Create(*Context, "", FCmpBB->getParent(), NextBB);
+
+  // Create the shadow fcmp and comparison between the fcmps.
+  IRBuilder<> FCmpBuilder(FCmpBB);
+  FCmpBuilder.SetCurrentDebugLocation(FCmp.getDebugLoc());
+  Value *ShadowLHS = Map.getShadow(LHS);
+  Value *ShadowRHS = Map.getShadow(RHS);
+  // See comment on ClTruncateFCmpEq.
+  if (FCmp.isEquality() && ClTruncateFCmpEq) {
+    Type *Ty = ShadowLHS->getType();
+    ShadowLHS = FCmpBuilder.CreateCast(
+        Instruction::FPExt,
+        FCmpBuilder.CreateCast(Instruction::FPTrunc, ShadowLHS,
+                               LHS->getType()),
+        Ty);
+    ShadowRHS = FCmpBuilder.CreateCast(
+        Instruction::FPExt,
+        FCmpBuilder.CreateCast(Instruction::FPTrunc, ShadowRHS,
+                               RHS->getType()),
+        Ty);
+  }
+  Value *ShadowFCmp =
+      FCmpBuilder.CreateFCmp(FCmp.getPredicate(), ShadowLHS, ShadowRHS);
+  Value *OriginalAndShadowFcmpMatch =
+      FCmpBuilder.CreateICmpEQ(&FCmp, ShadowFCmp);
+
+  if (OriginalAndShadowFcmpMatch->getType()->isVectorTy()) {
+    // If we have a vector type, `OriginalAndShadowFcmpMatch` is a vector of
+    // i1, where an element is true if the corresponding elements in original
+    // and shadow are the same. We want all elements to be 1.
+    OriginalAndShadowFcmpMatch =
+        FCmpBuilder.CreateAndReduce(OriginalAndShadowFcmpMatch);
+  }
+
+  FCmpBuilder.CreateCondBr(OriginalAndShadowFcmpMatch, NextBB, FailBB);
+
+  // Fill in FailBB.
+  IRBuilder<> FailBuilder(FailBB);
+  FailBuilder.SetCurrentDebugLocation(FCmp.getDebugLoc());
+
+  const auto EmitFailCall = [this, &FCmp, &FCmpBuilder,
+                             &FailBuilder](Value *L, Value *R, Value *ShadowL,
+                                           Value *ShadowR, Value *Result,
+                                           Value *ShadowResult) {
+    Type *FT = L->getType();
+    FunctionCallee *Callee = nullptr;
+    if (FT->isFloatTy()) {
+      Callee = &(NsanFCmpFail[kFloat]);
+    } else if (FT->isDoubleTy()) {
+      Callee = &(NsanFCmpFail[kDouble]);
+    } else if (FT->isX86_FP80Ty()) {
+      // FIXME: make NsanFCmpFailLongDouble work.
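+      // In the meantime, truncate both operands to double and report through
+      // the double variant of the callback.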
+      Callee = &(NsanFCmpFail[kDouble]);
+      L = FailBuilder.CreateCast(Instruction::FPTrunc, L,
+                                 Type::getDoubleTy(*Context));
+      R = FailBuilder.CreateCast(Instruction::FPTrunc, R,
+                                 Type::getDoubleTy(*Context));
+    } else {
+      llvm_unreachable("not implemented");
+    }
+    FailBuilder.CreateCall(*Callee,
+                           {L, R, ShadowL, ShadowR,
+                            ConstantInt::get(FCmpBuilder.getInt32Ty(),
+                                             FCmp.getPredicate()),
+                            Result, ShadowResult});
+  };
+  if (LHS->getType()->isVectorTy()) {
+    for (int I = 0, E = cast<VectorType>(LHS->getType())
+                            ->getElementCount()
+                            .getFixedValue();
+         I < E; ++I) {
+      EmitFailCall(FailBuilder.CreateExtractElement(LHS, I),
+                   FailBuilder.CreateExtractElement(RHS, I),
+                   FailBuilder.CreateExtractElement(ShadowLHS, I),
+                   FailBuilder.CreateExtractElement(ShadowRHS, I),
+                   FailBuilder.CreateExtractElement(&FCmp, I),
+                   FailBuilder.CreateExtractElement(ShadowFCmp, I));
+    }
+  } else {
+    EmitFailCall(LHS, RHS, ShadowLHS, ShadowRHS, &FCmp, ShadowFCmp);
+  }
+  FailBuilder.CreateBr(NextBB);
+
+  ++NumInstrumentedFCmp;
+}
+
+// Creates a shadow phi value for any phi that defines a value of FT type.
+PHINode *NumericalStabilitySanitizer::maybeCreateShadowPhi(
+    PHINode &Phi, const TargetLibraryInfo &TLI) {
+  Type *const VT = Phi.getType();
+  Type *const ExtendedVT = Config.getExtendedFPType(VT);
+  if (ExtendedVT == nullptr)
+    return nullptr; // Not an FT value.
+  // The phi operands are shadow values and are not available when the phi is
+  // created. They will be populated in a final phase, once all shadow values
+  // have been created.
+  PHINode *Shadow = PHINode::Create(ExtendedVT, Phi.getNumIncomingValues());
+  Shadow->insertAfter(&Phi);
+  return Shadow;
+}
+
+Value *NumericalStabilitySanitizer::handleLoad(LoadInst &Load, Type *VT,
+                                               Type *ExtendedVT) {
+  IRBuilder<> Builder(getNextInstructionOrDie(Load));
+  Builder.SetCurrentDebugLocation(Load.getDebugLoc());
+  if (addrPointsToConstantData(Load.getPointerOperand())) {
+    // No need to look into the shadow memory, the value is a constant. Just
+    // convert from FT to 2FT.
+    return Builder.CreateCast(Instruction::FPExt, &Load, ExtendedVT);
+  }
+
+  //   if (%shadowptr == null)
+  //     %shadow = fpext %v
+  //   else
+  //     %shadow = load (ptrcast %shadowptr)
+  // Considered options here:
+  //  - Have `NsanGetShadowPtrForLoad` return a fixed address
+  //    &__nsan_unknown_value_shadow_address that is valid to load from, and
+  //    use a select. This has the advantage that the generated IR is
+  //    simpler.
+  //  - Have `NsanGetShadowPtrForLoad` return nullptr. Because `select` does
+  //    not short-circuit, dereferencing the returned pointer is no longer an
+  //    option; we have to split and create a separate basic block. This has
+  //    the advantage of being easier to debug because it crashes if we ever
+  //    mess up.
+
+  const auto Extents = getMemoryExtentsOrDie(VT);
+  Value *ShadowPtr =
+      Builder.CreateCall(NsanGetShadowPtrForLoad[Extents.ValueType],
+                         {Builder.CreatePointerCast(Load.getPointerOperand(),
+                                                    Builder.getInt8PtrTy()),
+                          ConstantInt::get(IntptrTy, Extents.NumElts)});
+  ++NumInstrumentedFTLoads;
+
+#if 0
+  // Emit a select.
+  return Builder.CreateSelect(
+      Builder.CreateICmpEq(ShadowPtr, NsanUnknownValueShadowAddress),
+      Builder.CreateCast(Instruction::FPExt, &Load, ExtendedVT),
+      Builder.CreateAlignedLoad(
+          Builder.CreatePointerCast(ShadowPtr, ExtendedVT->getPointerTo()),
+          Align(1), Load.isVolatile()));
+#else
+  // Split the basic block.
+  BasicBlock *LoadBB = Load.getParent();
+  BasicBlock *NextBB = LoadBB->splitBasicBlock(Builder.GetInsertPoint());
+  // Create the two options for creating the shadow value.
+  BasicBlock *ShadowLoadBB =
+      BasicBlock::Create(*Context, "", LoadBB->getParent(), NextBB);
+  BasicBlock *FExtBB =
+      BasicBlock::Create(*Context, "", LoadBB->getParent(), NextBB);
+
+  // Replace the newly created terminator unconditional branch by a
+  // conditional branch to one of the options.
+  {
+    LoadBB->getInstList().erase(LoadBB->back());
+    IRBuilder<> LoadBBBuilder(LoadBB); // The old builder has been invalidated.
+    LoadBBBuilder.SetCurrentDebugLocation(Load.getDebugLoc());
+    LoadBBBuilder.CreateCondBr(LoadBBBuilder.CreateIsNull(ShadowPtr), FExtBB,
+                               ShadowLoadBB);
+  }
+
+  // Fill in ShadowLoadBB.
+  IRBuilder<> ShadowLoadBBBuilder(ShadowLoadBB);
+  ShadowLoadBBBuilder.SetCurrentDebugLocation(Load.getDebugLoc());
+  Value *ShadowLoad = ShadowLoadBBBuilder.CreateAlignedLoad(
+      ShadowLoadBBBuilder.CreatePointerCast(ShadowPtr,
+                                            ExtendedVT->getPointerTo()),
+      Align(1), Load.isVolatile());
+  if (ClCheckLoads) {
+    ShadowLoad = emitCheck(&Load, ShadowLoad, ShadowLoadBBBuilder,
+                           CheckLoc::makeLoad(Load.getPointerOperand()));
+  }
+  ShadowLoadBBBuilder.CreateBr(NextBB);
+
+  // Fill in FExtBB.
+  IRBuilder<> FExtBBBuilder(FExtBB);
+  FExtBBBuilder.SetCurrentDebugLocation(Load.getDebugLoc());
+  Value *const FExt =
+      FExtBBBuilder.CreateCast(Instruction::FPExt, &Load, ExtendedVT);
+  FExtBBBuilder.CreateBr(NextBB);
+
+  // The shadow value can come from either of the options.
+  IRBuilder<> NextBBBuilder(&*NextBB->begin());
+  NextBBBuilder.SetCurrentDebugLocation(Load.getDebugLoc());
+  PHINode *ShadowPhi = NextBBBuilder.CreatePHI(ExtendedVT, 2);
+  ShadowPhi->addIncoming(ShadowLoad, ShadowLoadBB);
+  ShadowPhi->addIncoming(FExt, FExtBB);
+  return ShadowPhi;
+#endif
+}
+
+Value *NumericalStabilitySanitizer::handleTrunc(FPTruncInst &Trunc, Type *VT,
+                                                Type *ExtendedVT,
+                                                const ValueToShadowMap &Map) {
+  Value *const OrigSource = Trunc.getOperand(0);
+  Type *const OrigSourceTy = OrigSource->getType();
+  Type *const ExtendedSourceTy = Config.getExtendedFPType(OrigSourceTy);
+
+  // When truncating:
+  //  - (A) If the source has a shadow, we truncate from the shadow, else we
+  //    truncate from the original source.
+  //  - (B) If the shadow of the source is larger than the shadow of the
+  //    dest, we still need a truncate. Else, the shadow of the source is the
+  //    same type as the shadow of the dest (because mappings are
+  //    non-decreasing), so we don't need to emit a truncate.
+  // Examples,
+  //  with a mapping of {f32->f64;f64->f80;f80->f128}
+  //    fptrunc double %1 to float    ->  fptrunc x86_fp80 s(%1) to double
+  //    fptrunc x86_fp80 %1 to float  ->  fptrunc fp128 s(%1) to double
+  //    fptrunc fp128 %1 to float     ->  fptrunc fp128 %1 to double
+  //    fptrunc x86_fp80 %1 to double ->  fptrunc fp128 s(%1) to x86_fp80
+  //    fptrunc fp128 %1 to double    ->  fptrunc fp128 %1 to x86_fp80
+  //    fptrunc fp128 %1 to x86_fp80  ->  fp128 %1
+  //  with a mapping of {f32->f64;f64->f128;f80->f128}
+  //    fptrunc double %1 to float    ->  fptrunc fp128 s(%1) to double
+  //    fptrunc x86_fp80 %1 to float  ->  fptrunc fp128 s(%1) to double
+  //    fptrunc fp128 %1 to float     ->  fptrunc fp128 %1 to double
+  //    fptrunc x86_fp80 %1 to double ->  fp128 s(%1)
+  //    fptrunc fp128 %1 to double    ->  fp128 %1
+  //    fptrunc fp128 %1 to x86_fp80  ->  fp128 %1
+  //  with a mapping of {f32->f32;f64->f32;f80->f64}
+  //    fptrunc double %1 to float    ->  float s(%1)
+  //    fptrunc x86_fp80 %1 to float  ->  fptrunc double s(%1) to float
+  //    fptrunc fp128 %1 to float     ->  fptrunc fp128 %1 to float
+  //    fptrunc x86_fp80 %1 to double ->  fptrunc double s(%1) to float
+  //    fptrunc fp128 %1 to double    ->  fptrunc fp128 %1 to float
+  //    fptrunc fp128 %1 to x86_fp80  ->  fptrunc fp128 %1 to double
+
+  // See (A) above.
+  Value *const Source =
+      ExtendedSourceTy ? Map.getShadow(OrigSource) : OrigSource;
+  Type *const SourceTy = ExtendedSourceTy ? ExtendedSourceTy : OrigSourceTy;
+  // See (B) above.
+  if (SourceTy == ExtendedVT)
+    return Source;
+
+  Instruction *const Shadow =
+      CastInst::Create(Instruction::FPTrunc, Source, ExtendedVT);
+  Shadow->insertAfter(&Trunc);
+  return Shadow;
+}
+
+Value *NumericalStabilitySanitizer::handleExt(FPExtInst &Ext, Type *VT,
+                                              Type *ExtendedVT,
+                                              const ValueToShadowMap &Map) {
+  Value *const OrigSource = Ext.getOperand(0);
+  Type *const OrigSourceTy = OrigSource->getType();
+  Type *const ExtendedSourceTy = Config.getExtendedFPType(OrigSourceTy);
+  // When extending:
+  //  - (A) If the source has a shadow, we extend from the shadow, else we
+  //    extend from the original source.
+  //  - (B) If the shadow of the dest is larger than the shadow of the
+  //    source, we still need an extend. Else, the shadow of the source is
+  //    the same type as the shadow of the dest (because mappings are
+  //    non-decreasing), so we don't need to emit an extend.
+  // Examples,
+  //  with a mapping of {f32->f64;f64->f80;f80->f128}
+  //    fpext half %1 to float      ->  fpext half %1 to double
+  //    fpext half %1 to double     ->  fpext half %1 to x86_fp80
+  //    fpext half %1 to x86_fp80   ->  fpext half %1 to fp128
+  //    fpext float %1 to double    ->  fpext double s(%1) to x86_fp80
+  //    fpext float %1 to x86_fp80  ->  fpext double s(%1) to fp128
+  //    fpext double %1 to x86_fp80 ->  fpext x86_fp80 s(%1) to fp128
+  //  with a mapping of {f32->f64;f64->f128;f80->f128}
+  //    fpext half %1 to float      ->  fpext half %1 to double
+  //    fpext half %1 to double     ->  fpext half %1 to fp128
+  //    fpext half %1 to x86_fp80   ->  fpext half %1 to fp128
+  //    fpext float %1 to double    ->  fpext double s(%1) to fp128
+  //    fpext float %1 to x86_fp80  ->  fpext double s(%1) to fp128
+  //    fpext double %1 to x86_fp80 ->  fp128 s(%1)
+  //  with a mapping of {f32->f32;f64->f32;f80->f64}
+  //    fpext half %1 to float      ->  fpext half %1 to float
+  //    fpext half %1 to double     ->  fpext half %1 to float
+  //    fpext half %1 to x86_fp80   ->  fpext half %1 to double
+  //    fpext float %1 to double    ->  float s(%1)
+  //    fpext float %1 to x86_fp80  ->  fpext float s(%1) to double
+  //    fpext double %1 to x86_fp80 ->  fpext float s(%1) to double
+
+  // See (A) above.
+  Value *const Source =
+      ExtendedSourceTy ? Map.getShadow(OrigSource) : OrigSource;
+  Type *const SourceTy = ExtendedSourceTy ? ExtendedSourceTy : OrigSourceTy;
+  // See (B) above.
+  if (SourceTy == ExtendedVT)
+    return Source;
+
+  Instruction *const Shadow =
+      CastInst::Create(Instruction::FPExt, Source, ExtendedVT);
+  Shadow->insertAfter(&Ext);
+  return Shadow;
+}
+
+// Returns a value with the address of the callee.
+Value *
+NumericalStabilitySanitizer::getCalleeAddress(CallBase &Call,
+                                              IRBuilder<> &Builder) const {
+  if (Function *Fn = Call.getCalledFunction()) {
+    // We're calling a statically known function.
+    return Builder.CreatePtrToInt(Fn, IntptrTy);
+  } else {
+    // We're calling a function through a function pointer.
+    return Builder.CreatePtrToInt(Call.getCalledOperand(), IntptrTy);
+  }
+}
+
+namespace {
+
+// FIXME: This should be tablegen-ed.
+struct KnownIntrinsic {
+  struct WidenedIntrinsic {
+    const char *NarrowName;
+    Intrinsic::ID ID; // wide id.
+    using FnTypeFactory = FunctionType *(*)(LLVMContext &);
+    FnTypeFactory MakeFnTy;
+  };
+
+  static const char *get(LibFunc LFunc);
+
+  // Given an intrinsic with an `FT` argument, try to find a wider intrinsic
+  // that applies the same operation on the shadow argument.
+  // Options are:
+  //  - pass in the ID and full function type,
+  //  - pass in the name, which includes the function type through mangling.
+  static const WidenedIntrinsic *widen(StringRef Name);
+
+private:
+  struct LFEntry {
+    LibFunc LFunc;
+    const char *IntrinsicName;
+  };
+  static const LFEntry kLibfuncIntrinsics[];
+
+  static const WidenedIntrinsic kWidenedIntrinsics[];
+};
+
+FunctionType *Make_Double_Double(LLVMContext &C) {
+  return FunctionType::get(Type::getDoubleTy(C), {Type::getDoubleTy(C)},
+                           false);
+}
+
+FunctionType *Make_X86FP80_X86FP80(LLVMContext &C) {
+  return FunctionType::get(Type::getX86_FP80Ty(C), {Type::getX86_FP80Ty(C)},
+                           false);
+}
+
+FunctionType *Make_Double_DoubleI32(LLVMContext &C) {
+  return FunctionType::get(Type::getDoubleTy(C),
+                           {Type::getDoubleTy(C), Type::getInt32Ty(C)},
+                           false);
+}
+
+FunctionType *Make_X86FP80_X86FP80I32(LLVMContext &C) {
+  return FunctionType::get(Type::getX86_FP80Ty(C),
+                           {Type::getX86_FP80Ty(C), Type::getInt32Ty(C)},
+                           false);
+}
+
+FunctionType *Make_Double_DoubleDouble(LLVMContext &C) {
+  return FunctionType::get(Type::getDoubleTy(C),
+                           {Type::getDoubleTy(C), Type::getDoubleTy(C)},
+                           false);
+}
+
+FunctionType *Make_X86FP80_X86FP80X86FP80(LLVMContext &C) {
+  return FunctionType::get(Type::getX86_FP80Ty(C),
+                           {Type::getX86_FP80Ty(C), Type::getX86_FP80Ty(C)},
+                           false);
+}
+
+FunctionType *Make_Double_DoubleDoubleDouble(LLVMContext &C) {
+  return FunctionType::get(
+      Type::getDoubleTy(C),
+      {Type::getDoubleTy(C), Type::getDoubleTy(C), Type::getDoubleTy(C)},
+      false);
+}
+
+FunctionType *Make_X86FP80_X86FP80X86FP80X86FP80(LLVMContext &C) {
+  return FunctionType::get(
+      Type::getX86_FP80Ty(C),
+      {Type::getX86_FP80Ty(C), Type::getX86_FP80Ty(C),
+       Type::getX86_FP80Ty(C)},
+      false);
+}
+
+const KnownIntrinsic::WidenedIntrinsic KnownIntrinsic::kWidenedIntrinsics[] = {
+    // FIXME: Right now we ignore vector intrinsics.
+    // This is hard because we have to model the semantics of the intrinsics,
+    // e.g. llvm.x86.sse2.min.sd means extract first element, min, insert
+    // back.
+    // Intrinsics that take any non-vector FT types:
+    // NOTE: Right now because of https://bugs.llvm.org/show_bug.cgi?id=45399
+    // for f128 we need to use Make_X86FP80_X86FP80 (go to a lower precision
+    // and come back).
+ {"llvm.sqrt.f32", Intrinsic::sqrt, Make_Double_Double}, + {"llvm.sqrt.f64", Intrinsic::sqrt, Make_X86FP80_X86FP80}, + {"llvm.sqrt.f80", Intrinsic::sqrt, Make_X86FP80_X86FP80}, + {"llvm.powi.f32", Intrinsic::powi, Make_Double_DoubleI32}, + {"llvm.powi.f64", Intrinsic::powi, Make_X86FP80_X86FP80I32}, + {"llvm.powi.f80", Intrinsic::powi, Make_X86FP80_X86FP80I32}, + {"llvm.sin.f32", Intrinsic::sin, Make_Double_Double}, + {"llvm.sin.f64", Intrinsic::sin, Make_X86FP80_X86FP80}, + {"llvm.sin.f80", Intrinsic::sin, Make_X86FP80_X86FP80}, + {"llvm.cos.f32", Intrinsic::cos, Make_Double_Double}, + {"llvm.cos.f64", Intrinsic::cos, Make_X86FP80_X86FP80}, + {"llvm.cos.f80", Intrinsic::cos, Make_X86FP80_X86FP80}, + {"llvm.pow.f32", Intrinsic::pow, Make_Double_DoubleDouble}, + {"llvm.pow.f64", Intrinsic::pow, Make_X86FP80_X86FP80X86FP80}, + {"llvm.pow.f80", Intrinsic::pow, Make_X86FP80_X86FP80X86FP80}, + {"llvm.exp.f32", Intrinsic::exp, Make_Double_Double}, + {"llvm.exp.f64", Intrinsic::exp, Make_X86FP80_X86FP80}, + {"llvm.exp.f80", Intrinsic::exp, Make_X86FP80_X86FP80}, + {"llvm.exp2.f32", Intrinsic::exp2, Make_Double_Double}, + {"llvm.exp2.f64", Intrinsic::exp2, Make_X86FP80_X86FP80}, + {"llvm.exp2.f80", Intrinsic::exp2, Make_X86FP80_X86FP80}, + {"llvm.log.f32", Intrinsic::log, Make_Double_Double}, + {"llvm.log.f64", Intrinsic::log, Make_X86FP80_X86FP80}, + {"llvm.log.f80", Intrinsic::log, Make_X86FP80_X86FP80}, + {"llvm.log10.f32", Intrinsic::log10, Make_Double_Double}, + {"llvm.log10.f64", Intrinsic::log10, Make_X86FP80_X86FP80}, + {"llvm.log10.f80", Intrinsic::log10, Make_X86FP80_X86FP80}, + {"llvm.log2.f32", Intrinsic::log2, Make_Double_Double}, + {"llvm.log2.f64", Intrinsic::log2, Make_X86FP80_X86FP80}, + {"llvm.log2.f80", Intrinsic::log2, Make_X86FP80_X86FP80}, + {"llvm.fma.f32", Intrinsic::fma, Make_Double_DoubleDoubleDouble}, + {"llvm.fma.f64", Intrinsic::fma, Make_X86FP80_X86FP80X86FP80X86FP80}, + {"llvm.fma.f80", Intrinsic::fma, Make_X86FP80_X86FP80X86FP80X86FP80}, + {"llvm.fabs.f32", Intrinsic::fabs, Make_Double_Double}, + {"llvm.fabs.f64", Intrinsic::fabs, Make_X86FP80_X86FP80}, + {"llvm.fabs.f80", Intrinsic::fabs, Make_X86FP80_X86FP80}, + {"llvm.minnum.f32", Intrinsic::minnum, Make_Double_DoubleDouble}, + {"llvm.minnum.f64", Intrinsic::minnum, Make_X86FP80_X86FP80X86FP80}, + {"llvm.minnum.f80", Intrinsic::minnum, Make_X86FP80_X86FP80X86FP80}, + {"llvm.maxnum.f32", Intrinsic::maxnum, Make_Double_DoubleDouble}, + {"llvm.maxnum.f64", Intrinsic::maxnum, Make_X86FP80_X86FP80X86FP80}, + {"llvm.maxnum.f80", Intrinsic::maxnum, Make_X86FP80_X86FP80X86FP80}, + {"llvm.minimum.f32", Intrinsic::minimum, Make_Double_DoubleDouble}, + {"llvm.minimum.f64", Intrinsic::minimum, Make_X86FP80_X86FP80X86FP80}, + {"llvm.minimum.f80", Intrinsic::minimum, Make_X86FP80_X86FP80X86FP80}, + {"llvm.maximum.f32", Intrinsic::maximum, Make_Double_DoubleDouble}, + {"llvm.maximum.f64", Intrinsic::maximum, Make_X86FP80_X86FP80X86FP80}, + {"llvm.maximum.f80", Intrinsic::maximum, Make_X86FP80_X86FP80X86FP80}, + {"llvm.copysign.f32", Intrinsic::copysign, Make_Double_DoubleDouble}, + {"llvm.copysign.f64", Intrinsic::copysign, Make_X86FP80_X86FP80X86FP80}, + {"llvm.copysign.f80", Intrinsic::copysign, Make_X86FP80_X86FP80X86FP80}, + {"llvm.floor.f32", Intrinsic::floor, Make_Double_Double}, + {"llvm.floor.f64", Intrinsic::floor, Make_X86FP80_X86FP80}, + {"llvm.floor.f80", Intrinsic::floor, Make_X86FP80_X86FP80}, + {"llvm.ceil.f32", Intrinsic::ceil, Make_Double_Double}, + {"llvm.ceil.f64", Intrinsic::ceil, Make_X86FP80_X86FP80}, + 
{"llvm.ceil.f80", Intrinsic::ceil, Make_X86FP80_X86FP80}, + {"llvm.trunc.f32", Intrinsic::trunc, Make_Double_Double}, + {"llvm.trunc.f64", Intrinsic::trunc, Make_X86FP80_X86FP80}, + {"llvm.trunc.f80", Intrinsic::trunc, Make_X86FP80_X86FP80}, + {"llvm.rint.f32", Intrinsic::rint, Make_Double_Double}, + {"llvm.rint.f64", Intrinsic::rint, Make_X86FP80_X86FP80}, + {"llvm.rint.f80", Intrinsic::rint, Make_X86FP80_X86FP80}, + {"llvm.nearbyint.f32", Intrinsic::nearbyint, Make_Double_Double}, + {"llvm.nearbyint.f64", Intrinsic::nearbyint, Make_X86FP80_X86FP80}, + {"llvm.nearbyin80f64", Intrinsic::nearbyint, Make_X86FP80_X86FP80}, + {"llvm.round.f32", Intrinsic::round, Make_Double_Double}, + {"llvm.round.f64", Intrinsic::round, Make_X86FP80_X86FP80}, + {"llvm.round.f80", Intrinsic::round, Make_X86FP80_X86FP80}, + {"llvm.lround.f32", Intrinsic::lround, Make_Double_Double}, + {"llvm.lround.f64", Intrinsic::lround, Make_X86FP80_X86FP80}, + {"llvm.lround.f80", Intrinsic::lround, Make_X86FP80_X86FP80}, + {"llvm.llround.f32", Intrinsic::llround, Make_Double_Double}, + {"llvm.llround.f64", Intrinsic::llround, Make_X86FP80_X86FP80}, + {"llvm.llround.f80", Intrinsic::llround, Make_X86FP80_X86FP80}, + {"llvm.lrint.f32", Intrinsic::lrint, Make_Double_Double}, + {"llvm.lrint.f64", Intrinsic::lrint, Make_X86FP80_X86FP80}, + {"llvm.lrint.f80", Intrinsic::lrint, Make_X86FP80_X86FP80}, + {"llvm.llrint.f32", Intrinsic::llrint, Make_Double_Double}, + {"llvm.llrint.f64", Intrinsic::llrint, Make_X86FP80_X86FP80}, + {"llvm.llrint.f80", Intrinsic::llrint, Make_X86FP80_X86FP80}, +}; + +const KnownIntrinsic::LFEntry KnownIntrinsic::kLibfuncIntrinsics[] = { + {LibFunc_sqrtf, "llvm.sqrt.f32"}, // + {LibFunc_sqrt, "llvm.sqrt.f64"}, // + {LibFunc_sqrtl, "llvm.sqrt.f80"}, // + {LibFunc_sinf, "llvm.sin.f32"}, // + {LibFunc_sin, "llvm.sin.f64"}, // + {LibFunc_sinl, "llvm.sin.f80"}, // + {LibFunc_cosf, "llvm.cos.f32"}, // + {LibFunc_cos, "llvm.cos.f64"}, // + {LibFunc_cosl, "llvm.cos.f80"}, // + {LibFunc_powf, "llvm.pow.f32"}, // + {LibFunc_pow, "llvm.pow.f64"}, // + {LibFunc_powl, "llvm.pow.f80"}, // + {LibFunc_expf, "llvm.exp.f32"}, // + {LibFunc_exp, "llvm.exp.f64"}, // + {LibFunc_expl, "llvm.exp.f80"}, // + {LibFunc_exp2f, "llvm.exp2.f32"}, // + {LibFunc_exp2, "llvm.exp2.f64"}, // + {LibFunc_exp2l, "llvm.exp2.f80"}, // + {LibFunc_logf, "llvm.log.f32"}, // + {LibFunc_log, "llvm.log.f64"}, // + {LibFunc_logl, "llvm.log.f80"}, // + {LibFunc_log10f, "llvm.log10.f32"}, // + {LibFunc_log10, "llvm.log10.f64"}, // + {LibFunc_log10l, "llvm.log10.f80"}, // + {LibFunc_log2f, "llvm.log2.f32"}, // + {LibFunc_log2, "llvm.log2.f64"}, // + {LibFunc_log2l, "llvm.log2.f80"}, // + {LibFunc_fabsf, "llvm.fabs.f32"}, // + {LibFunc_fabs, "llvm.fabs.f64"}, // + {LibFunc_fabsl, "llvm.fabs.f80"}, // + {LibFunc_copysignf, "llvm.copysign.f32"}, // + {LibFunc_copysign, "llvm.copysign.f64"}, // + {LibFunc_copysignl, "llvm.copysign.f80"}, // + {LibFunc_floorf, "llvm.floor.f32"}, // + {LibFunc_floor, "llvm.floor.f64"}, // + {LibFunc_floorl, "llvm.floor.f80"}, // + {LibFunc_fmaxf, "llvm.maxnum.f32"}, // + {LibFunc_fmax, "llvm.maxnum.f64"}, // + {LibFunc_fmaxl, "llvm.maxnum.f80"}, // + {LibFunc_fminf, "llvm.minnum.f32"}, // + {LibFunc_fmin, "llvm.minnum.f64"}, // + {LibFunc_fminl, "llvm.minnum.f80"}, // + {LibFunc_ceilf, "llvm.ceil.f32"}, // + {LibFunc_ceil, "llvm.ceil.f64"}, // + {LibFunc_ceill, "llvm.ceil.f80"}, // + {LibFunc_truncf, "llvm.trunc.f32"}, // + {LibFunc_trunc, "llvm.trunc.f64"}, // + {LibFunc_truncl, "llvm.trunc.f80"}, // + {LibFunc_rintf, 
"llvm.rint.f32"}, // + {LibFunc_rint, "llvm.rint.f64"}, // + {LibFunc_rintl, "llvm.rint.f80"}, // + {LibFunc_nearbyintf, "llvm.nearbyint.f32"}, // + {LibFunc_nearbyint, "llvm.nearbyint.f64"}, // + {LibFunc_nearbyintl, "llvm.nearbyint.f80"}, // + {LibFunc_roundf, "llvm.round.f32"}, // + {LibFunc_round, "llvm.round.f64"}, // + {LibFunc_roundl, "llvm.round.f80"}, // +}; + +const char *KnownIntrinsic::get(LibFunc LFunc) { + for (const auto &E : kLibfuncIntrinsics) { + if (E.LFunc == LFunc) + return E.IntrinsicName; + } + return nullptr; +} + +const KnownIntrinsic::WidenedIntrinsic *KnownIntrinsic::widen(StringRef Name) { + for (const auto &E : kWidenedIntrinsics) { + if (E.NarrowName == Name) + return &E; + } + return nullptr; +} + +} // namespace + +// Returns the name of the LLVM intrinsic corresponding to the given function. +static const char *getIntrinsicFromLibfunc(Function &Fn, Type *VT, + const TargetLibraryInfo &TLI) { + LibFunc LFunc; + if (!TLI.getLibFunc(Fn, LFunc)) + return nullptr; + + if (const char *Name = KnownIntrinsic::get(LFunc)) + return Name; + + errs() << "FIXME: LibFunc: " << TLI.getName(LFunc) << "\n"; + return nullptr; +} + +// Try to handle a known function call. +Value *NumericalStabilitySanitizer::maybeHandleKnownCallBase( + CallBase &Call, Type *VT, Type *ExtendedVT, const TargetLibraryInfo &TLI, + const ValueToShadowMap &Map, IRBuilder<> &Builder) { + Function *const Fn = Call.getCalledFunction(); + if (Fn == nullptr) + return nullptr; + + Intrinsic::ID WidenedId = Intrinsic::ID(); + FunctionType *WidenedFnTy = nullptr; + if (const auto ID = Fn->getIntrinsicID()) { + const auto *const Widened = KnownIntrinsic::widen(Fn->getName()); + if (Widened) { + WidenedId = Widened->ID; + WidenedFnTy = Widened->MakeFnTy(*Context); + } else { + // If we don't know how to widen the intrinsic, we have no choice but to + // call the non-wide version on a truncated shadow and extend again + // afterwards. + WidenedId = ID; + WidenedFnTy = Fn->getFunctionType(); + } + } else if (const char *Name = getIntrinsicFromLibfunc(*Fn, VT, TLI)) { + // We might have a call to a library function that we can replace with a + // wider Intrinsic. + const auto *Widened = KnownIntrinsic::widen(Name); + assert(Widened && "make sure KnownIntrinsic entries are consistent"); + WidenedId = Widened->ID; + WidenedFnTy = Widened->MakeFnTy(*Context); + } else { + // This is not a known library function or intrinsic. + return nullptr; + } + + // Check that the widened intrinsic is valid. + SmallVector Table; + getIntrinsicInfoTableEntries(WidenedId, Table); + SmallVector ArgTys; + ArrayRef TableRef = Table; + const Intrinsic::MatchIntrinsicTypesResult Res = + Intrinsic::matchIntrinsicSignature(WidenedFnTy, TableRef, ArgTys); + assert(Res == Intrinsic::MatchIntrinsicTypes_Match && + "invalid widened intrinsic"); + (void)Res; + + // For known intrinsic functions, we create a second call to the same + // intrinsic with a different type. + SmallVector Args; + // The last operand is the intrinsic itself, skip it. + for (unsigned I = 0, E = Call.getNumOperands() - 1; I < E; ++I) { + Value *Arg = Call.getOperand(I); + Type *const OrigArgTy = Arg->getType(); + Type *const IntrinsicArgTy = WidenedFnTy->getParamType(I); + if (OrigArgTy == IntrinsicArgTy) { + Args.push_back(Arg); // The arg is passed as is. 
+      continue;
+    }
+    Type *const ShadowArgTy = Config.getExtendedFPType(Arg->getType());
+    assert(ShadowArgTy &&
+           "don't know how to get the shadow value for a non-FT");
+    Value *Shadow = Map.getShadow(Arg);
+    if (ShadowArgTy == IntrinsicArgTy) {
+      // The shadow is the right type for the intrinsic.
+      assert(Shadow->getType() == ShadowArgTy);
+      Args.push_back(Shadow);
+      continue;
+    }
+    // There is no intrinsic with this level of precision, truncate the
+    // shadow.
+    Args.push_back(
+        Builder.CreateCast(Instruction::FPTrunc, Shadow, IntrinsicArgTy));
+  }
+  Value *IntrinsicCall = Builder.CreateIntrinsic(WidenedId, ArgTys, Args);
+  return WidenedFnTy->getReturnType() == ExtendedVT
+             ? IntrinsicCall
+             : Builder.CreateCast(Instruction::FPExt, IntrinsicCall,
+                                  ExtendedVT);
+}
+
+// Handle a CallBase, i.e. a function call, an inline asm sequence, or an
+// invoke.
+Value *NumericalStabilitySanitizer::handleCallBase(
+    CallBase &Call, Type *VT, Type *ExtendedVT, const TargetLibraryInfo &TLI,
+    const ValueToShadowMap &Map, IRBuilder<> &Builder) {
+  // We cannot look inside inline asm, just expand the result again.
+  if (Call.isInlineAsm()) {
+    return Builder.CreateCast(Instruction::FPExt, &Call, ExtendedVT);
+  }
+
+  // Intrinsics and library functions (e.g. sin, exp) are handled
+  // specifically, because we know their semantics and can do better than
+  // blindly calling them (e.g. compute the sine in the actual shadow
+  // domain).
+  if (Value *V =
+          maybeHandleKnownCallBase(Call, VT, ExtendedVT, TLI, Map, Builder))
+    return V;
+
+  // If the return tag matches that of the called function, read the extended
+  // return value from the shadow ret ptr. Else, just extend the return
+  // value.
+  Value *HasShadowRet = Builder.CreateICmpEQ(
+      Builder.CreateLoad(IntptrTy, NsanShadowRetTag, /*isVolatile=*/false),
+      getCalleeAddress(Call, Builder));
+  Value *ShadowRetVal = Builder.CreateLoad(
+      ExtendedVT,
+      Builder.CreatePointerCast(
+          Builder.CreateConstGEP2_64(NsanShadowRetPtr, 0, 0),
+          ExtendedVT->getPointerTo()),
+      /*isVolatile=*/false);
+  Value *Shadow = Builder.CreateSelect(
+      HasShadowRet, ShadowRetVal,
+      Builder.CreateCast(Instruction::FPExt, &Call, ExtendedVT));
+  ++NumInstrumentedFTCalls;
+  // Note that we do not need to set NsanShadowRetTag to zero, as we know
+  // that either the function is not instrumented and it will never set
+  // NsanShadowRetTag; or it is and it will always do so.
+  return Shadow;
+}
+
+// Creates a shadow value for the given FT value. At that point all operands
+// are guaranteed to be available.
+Value *NumericalStabilitySanitizer::createShadowValueWithOperandsAvailable(
+    Instruction &Inst, const TargetLibraryInfo &TLI,
+    const ValueToShadowMap &Map) {
+  Type *const VT = Inst.getType();
+  Type *const ExtendedVT = Config.getExtendedFPType(VT);
+  assert(ExtendedVT != nullptr && "trying to create a shadow for a non-FT");
+
+  if (LoadInst *Load = dyn_cast<LoadInst>(&Inst)) {
+    return handleLoad(*Load, VT, ExtendedVT);
+  }
+  if (CallInst *Call = dyn_cast<CallInst>(&Inst)) {
+    // Insert after the call.
+    BasicBlock::iterator It(Inst);
+    IRBuilder<> Builder(Call->getParent(), ++It);
+    Builder.SetCurrentDebugLocation(Call->getDebugLoc());
+    return handleCallBase(*Call, VT, ExtendedVT, TLI, Map, Builder);
+  }
+  if (InvokeInst *Invoke = dyn_cast<InvokeInst>(&Inst)) {
+    // The Invoke terminates the basic block; create a new basic block in
+    // between the successful invoke and the next block.
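+    // This way the shadow computation only happens on the normal
+    // (non-unwind) path.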
+    BasicBlock *InvokeBB = Invoke->getParent();
+    BasicBlock *NextBB = Invoke->getNormalDest();
+    BasicBlock *NewBB =
+        BasicBlock::Create(*Context, "", NextBB->getParent(), NextBB);
+    Inst.replaceSuccessorWith(NextBB, NewBB);
+
+    IRBuilder<> Builder(NewBB);
+    Builder.SetCurrentDebugLocation(Invoke->getDebugLoc());
+    Value *Shadow = handleCallBase(*Invoke, VT, ExtendedVT, TLI, Map,
+                                   Builder);
+    Builder.CreateBr(NextBB);
+    NewBB->replaceSuccessorsPhiUsesWith(InvokeBB, NewBB);
+    return Shadow;
+  }
+  if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(&Inst)) {
+    IRBuilder<> Builder(getNextInstructionOrDie(*BinOp));
+    Builder.SetCurrentDebugLocation(BinOp->getDebugLoc());
+    return Builder.CreateBinOp(BinOp->getOpcode(),
+                               Map.getShadow(BinOp->getOperand(0)),
+                               Map.getShadow(BinOp->getOperand(1)));
+  }
+  if (UnaryOperator *UnaryOp = dyn_cast<UnaryOperator>(&Inst)) {
+    IRBuilder<> Builder(getNextInstructionOrDie(*UnaryOp));
+    Builder.SetCurrentDebugLocation(UnaryOp->getDebugLoc());
+    return Builder.CreateUnOp(UnaryOp->getOpcode(),
+                              Map.getShadow(UnaryOp->getOperand(0)));
+  }
+  if (FPTruncInst *Trunc = dyn_cast<FPTruncInst>(&Inst)) {
+    return handleTrunc(*Trunc, VT, ExtendedVT, Map);
+  }
+  if (FPExtInst *Ext = dyn_cast<FPExtInst>(&Inst)) {
+    return handleExt(*Ext, VT, ExtendedVT, Map);
+  }
+  if (isa<UIToFPInst>(&Inst) || isa<SIToFPInst>(&Inst)) {
+    CastInst *Cast = dyn_cast<CastInst>(&Inst);
+    IRBuilder<> Builder(getNextInstructionOrDie(*Cast));
+    Builder.SetCurrentDebugLocation(Cast->getDebugLoc());
+    return Builder.CreateCast(Cast->getOpcode(), Cast->getOperand(0),
+                              ExtendedVT);
+  }
+
+  if (SelectInst *S = dyn_cast<SelectInst>(&Inst)) {
+    IRBuilder<> Builder(getNextInstructionOrDie(*S));
+    Builder.SetCurrentDebugLocation(S->getDebugLoc());
+    return Builder.CreateSelect(S->getCondition(),
+                                Map.getShadow(S->getTrueValue()),
+                                Map.getShadow(S->getFalseValue()));
+  }
+
+  if (ExtractElementInst *Extract = dyn_cast<ExtractElementInst>(&Inst)) {
+    IRBuilder<> Builder(getNextInstructionOrDie(*Extract));
+    Builder.SetCurrentDebugLocation(Extract->getDebugLoc());
+    return Builder.CreateExtractElement(
+        Map.getShadow(Extract->getVectorOperand()),
+        Extract->getIndexOperand());
+  }
+
+  if (InsertElementInst *Insert = dyn_cast<InsertElementInst>(&Inst)) {
+    IRBuilder<> Builder(getNextInstructionOrDie(*Insert));
+    Builder.SetCurrentDebugLocation(Insert->getDebugLoc());
+    return Builder.CreateInsertElement(Map.getShadow(Insert->getOperand(0)),
+                                       Map.getShadow(Insert->getOperand(1)),
+                                       Insert->getOperand(2));
+  }
+
+  if (ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(&Inst)) {
+    IRBuilder<> Builder(getNextInstructionOrDie(*Shuffle));
+    Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
+    return Builder.CreateShuffleVector(Map.getShadow(Shuffle->getOperand(0)),
+                                       Map.getShadow(Shuffle->getOperand(1)),
+                                       Shuffle->getShuffleMask());
+  }
+
+  if (ExtractValueInst *Extract = dyn_cast<ExtractValueInst>(&Inst)) {
+    IRBuilder<> Builder(getNextInstructionOrDie(*Extract));
+    Builder.SetCurrentDebugLocation(Extract->getDebugLoc());
+    // FIXME: We could make aggregate objects first-class citizens. For now
+    // we just extend the extracted value.
+    return Builder.CreateCast(Instruction::FPExt, Extract, ExtendedVT);
+  }
+
+  if (BitCastInst *BC = dyn_cast<BitCastInst>(&Inst)) {
+    IRBuilder<> Builder(getNextInstructionOrDie(*BC));
+    Builder.SetCurrentDebugLocation(BC->getDebugLoc());
+    return Builder.CreateCast(Instruction::FPExt, BC, ExtendedVT);
+  }
+
+  errs() << "FIXME: implement " << Inst.getOpcodeName() << "\n";
+  llvm_unreachable("not implemented");
+}
+
+// Creates a shadow value for an instruction that defines a value of FT type.
+// Shadow values for FT operands that do not already have one are created
+// recursively. The DFS is guaranteed to not loop as phis and arguments
+// already have shadows.
+void NumericalStabilitySanitizer::maybeCreateShadowValue(
+    Instruction &Root, const TargetLibraryInfo &TLI, ValueToShadowMap &Map) {
+  Type *const VT = Root.getType();
+  Type *const ExtendedVT = Config.getExtendedFPType(VT);
+  if (ExtendedVT == nullptr)
+    return; // Not an FT value.
+
+  if (Map.hasShadow(&Root))
+    return; // Shadow already exists.
+
+  assert(!isa<PHINode>(&Root) && "phi nodes should already have shadows");
+
+  std::vector<Instruction *> DfsStack(1, &Root);
+  while (!DfsStack.empty()) {
+    // Ensure that all operands to the instruction have shadows before
+    // proceeding.
+    Instruction *I = DfsStack.back();
+    // The shadow for the instruction might have been created deeper in the
+    // DFS, see `forward_use_with_two_uses` test.
+    if (Map.hasShadow(I)) {
+      DfsStack.pop_back();
+      continue;
+    }
+
+    bool MissingShadow = false;
+    for (Value *Op : I->operands()) {
+      Type *const VT = Op->getType();
+      if (!Config.getExtendedFPType(VT))
+        continue; // Not an FT value.
+      if (Map.hasShadow(Op))
+        continue; // Shadow is already available.
+      assert(isa<Instruction>(Op) &&
+             "non-instructions should already have shadows");
+      assert(!isa<PHINode>(Op) && "phi nodes should already have shadows");
+      MissingShadow = true;
+      DfsStack.push_back(dyn_cast<Instruction>(Op));
+    }
+    if (MissingShadow)
+      continue; // Process operands and come back to this instruction later.
+
+    // All operands have shadows. Create a shadow for the current value.
+    Value *Shadow = createShadowValueWithOperandsAvailable(*I, TLI, Map);
+    Map.setShadow(I, Shadow);
+    DfsStack.pop_back();
+  }
+}
+
+// A floating-point store needs its value and type written to shadow memory.
+void NumericalStabilitySanitizer::propagateFTStore(
+    StoreInst &Store, Type *const VT, Type *const ExtendedVT,
+    const ValueToShadowMap &Map) {
+  Value *StoredValue = Store.getValueOperand();
+  IRBuilder<> Builder(&Store);
+  Builder.SetCurrentDebugLocation(Store.getDebugLoc());
+  const auto Extents = getMemoryExtentsOrDie(VT);
+  Value *ShadowPtr =
+      Builder.CreateCall(NsanGetShadowPtrForStore[Extents.ValueType],
+                         {Builder.CreatePointerCast(Store.getPointerOperand(),
+                                                    Builder.getInt8PtrTy()),
+                          ConstantInt::get(IntptrTy, Extents.NumElts)});
+
+  Value *StoredShadow = Map.getShadow(StoredValue);
+  if (!Store.getParent()->getParent()->hasOptNone()) {
+    // Only check stores when optimizing, because non-optimized code
+    // generates too many stores to the stack, creating false positives.
+    StoredShadow = emitCheck(StoredValue, StoredShadow, Builder,
+                             CheckLoc::makeStore(Store.getPointerOperand()));
+    ++NumInstrumentedFTStores;
+  }
+
+  Builder.CreateAlignedStore(
+      StoredShadow,
+      Builder.CreatePointerCast(ShadowPtr, ExtendedVT->getPointerTo()),
+      Align(1), Store.isVolatile());
+}
+
+// A non-FT store needs to invalidate shadow memory. Exceptions are:
+//  - memory transfers of floating-point data through other pointer types
+//    (llvm optimization passes transform `*(float*)a = *(float*)b` into
+//    `*(i32*)a = *(i32*)b`). These have the same semantics as memcpy.
+//  - Writes of FT-sized constants. LLVM likes to do float stores as
+//    bitcasted ints. Note that this is not really necessary because if the
+//    value is unknown the framework will re-extend it on load anyway. It
+//    just felt easier to debug tests with vectors of FTs.
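+//    For instance, clang can emit `store i32 1065353216, i32* %p` for
+//    `*(float *)p = 1.0f` (0x3f800000 is the bit pattern of 1.0f); in that
+//    case we bitcast and store the extended shadow instead of invalidating.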
+void NumericalStabilitySanitizer::propagateNonFTStore(
+ StoreInst &Store, Type *const VT, const ValueToShadowMap &Map) {
+ Value *PtrOp = Store.getPointerOperand();
+ IRBuilder<> Builder(getNextInstructionOrDie(Store));
+ Builder.SetCurrentDebugLocation(Store.getDebugLoc());
+ Value *Dst = Builder.CreatePointerCast(PtrOp, Builder.getInt8PtrTy());
+ const DataLayout &DL =
+ Store.getParent()->getParent()->getParent()->getDataLayout();
+ TypeSize SlotSize = DL.getTypeStoreSize(VT);
+ assert(!SlotSize.isScalable() && "unsupported");
+ const auto LoadSizeBytes = SlotSize.getFixedSize();
+ Value *ValueSize = Builder.Insert(Constant::getIntegerValue(
+ IntptrTy, APInt(IntptrTy->getPrimitiveSizeInBits(), LoadSizeBytes)));
+
+ ++NumInstrumentedNonFTStores;
+ Value *StoredValue = Store.getValueOperand();
+ if (LoadInst *Load = dyn_cast<LoadInst>(StoredValue)) {
+ // FIXME: Handle the case when the value is from a phi.
+ // This is a memory transfer with memcpy semantics. Copy the type and
+ // value from the source. Note that we cannot use __nsan_copy_values()
+ // here, because that will not work when there is a write to memory in
+ // between the load and the store, e.g. in the case of a swap.
+ Type *ShadowTypeIntTy = Type::getIntNTy(*Context, 8 * LoadSizeBytes);
+ Type *ShadowValueIntTy =
+ Type::getIntNTy(*Context, 8 * kShadowScale * LoadSizeBytes);
+ IRBuilder<> LoadBuilder(getNextInstructionOrDie(*Load));
+ LoadBuilder.SetCurrentDebugLocation(Load->getDebugLoc());
+ Value *LoadSrc = LoadBuilder.CreatePointerCast(Load->getPointerOperand(),
+ Builder.getInt8PtrTy());
+ // Read the shadow type and value at load time. The type has the same size
+ // as the FT value, the value has twice its size.
+ // FIXME: cache them to avoid re-creating them when a load is used by
+ // several stores. Maybe create them like the FT shadows when a load is
+ // encountered.
+ Value *RawShadowType = LoadBuilder.CreateAlignedLoad(
+ ShadowTypeIntTy,
+ LoadBuilder.CreatePointerCast(
+ LoadBuilder.CreateCall(NsanGetRawShadowTypePtr, {LoadSrc}),
+ ShadowTypeIntTy->getPointerTo()),
+ Align(1),
+ /*isVolatile=*/false);
+ Value *RawShadowValue = LoadBuilder.CreateAlignedLoad(
+ ShadowValueIntTy,
+ LoadBuilder.CreatePointerCast(
+ LoadBuilder.CreateCall(NsanGetRawShadowPtr, {LoadSrc}),
+ ShadowValueIntTy->getPointerTo()),
+ Align(1),
+ /*isVolatile=*/false);
+
+ // Write back the shadow type and value at store time.
+ Builder.CreateAlignedStore(
+ RawShadowType,
+ Builder.CreatePointerCast(
+ Builder.CreateCall(NsanGetRawShadowTypePtr, {Dst}),
+ ShadowTypeIntTy->getPointerTo()),
+ Align(1),
+ /*isVolatile=*/false);
+ Builder.CreateAlignedStore(
+ RawShadowValue,
+ Builder.CreatePointerCast(
+ Builder.CreateCall(NsanGetRawShadowPtr, {Dst}),
+ ShadowValueIntTy->getPointerTo()),
+ Align(1),
+ /*isVolatile=*/false);
+
+ ++NumInstrumentedNonFTMemcpyStores;
+ return;
+ }
+ if (Constant *C = dyn_cast<Constant>(StoredValue)) {
+ // This might be an FP constant stored as an integer. Bitcast and store if
+ // it has the appropriate size.
+ Type *BitcastTy = nullptr; // The FT type to bitcast to.
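+ // Map the integer width to the FP type of the same size: i32 -> float,
+ // i64 -> double, i80 -> x86_fp80 (and elementwise for constant vectors).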
+ if (ConstantInt *CInt = dyn_cast<ConstantInt>(C)) {
+ switch (CInt->getType()->getScalarSizeInBits()) {
+ case 32:
+ BitcastTy = Type::getFloatTy(*Context);
+ break;
+ case 64:
+ BitcastTy = Type::getDoubleTy(*Context);
+ break;
+ case 80:
+ BitcastTy = Type::getX86_FP80Ty(*Context);
+ break;
+ default:
+ break;
+ }
+ } else if (ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(C)) {
+ const int NumElements =
+ cast<VectorType>(CDV->getType())->getElementCount().getFixedValue();
+ switch (CDV->getType()->getScalarSizeInBits()) {
+ case 32:
+ BitcastTy =
+ VectorType::get(Type::getFloatTy(*Context), NumElements, false);
+ break;
+ case 64:
+ BitcastTy =
+ VectorType::get(Type::getDoubleTy(*Context), NumElements, false);
+ break;
+ case 80:
+ BitcastTy =
+ VectorType::get(Type::getX86_FP80Ty(*Context), NumElements, false);
+ break;
+ default:
+ break;
+ }
+ }
+ if (BitcastTy) {
+ const MemoryExtents Extents = getMemoryExtentsOrDie(BitcastTy);
+ Value *ShadowPtr = Builder.CreateCall(
+ NsanGetShadowPtrForStore[Extents.ValueType],
+ {Builder.CreatePointerCast(PtrOp, Builder.getInt8PtrTy()),
+ ConstantInt::get(IntptrTy, Extents.NumElts)});
+ // Bitcast the integer value to the appropriate FT type and extend to 2FT.
+ Type *ExtVT = Config.getExtendedFPType(BitcastTy);
+ Value *Shadow = Builder.CreateCast(
+ Instruction::FPExt, Builder.CreateBitCast(C, BitcastTy), ExtVT);
+ Builder.CreateAlignedStore(
+ Shadow, Builder.CreatePointerCast(ShadowPtr, ExtVT->getPointerTo()),
+ Align(1), Store.isVolatile());
+ return;
+ }
+ }
+ // All other stores just reset the shadow value to unknown.
+ Builder.CreateCall(NsanSetValueUnknown, {Dst, ValueSize});
+}
+
+void NumericalStabilitySanitizer::propagateShadowValues(
+ Instruction &Inst, const TargetLibraryInfo &TLI,
+ const ValueToShadowMap &Map) {
+ if (StoreInst *Store = dyn_cast<StoreInst>(&Inst)) {
+ Value *StoredValue = Store->getValueOperand();
+ Type *const VT = StoredValue->getType();
+ Type *const ExtendedVT = Config.getExtendedFPType(VT);
+ if (ExtendedVT == nullptr)
+ return propagateNonFTStore(*Store, VT, Map);
+ return propagateFTStore(*Store, VT, ExtendedVT, Map);
+ }
+
+ if (FCmpInst *FCmp = dyn_cast<FCmpInst>(&Inst)) {
+ emitFCmpCheck(*FCmp, Map);
+ return;
+ }
+
+ if (CallBase *CB = dyn_cast<CallBase>(&Inst)) {
+ maybeAddSuffixForNsanInterface(CB);
+ if (CallInst *CI = dyn_cast<CallInst>(&Inst))
+ maybeMarkSanitizerLibraryCallNoBuiltin(CI, &TLI);
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst)) {
+ instrumentMemIntrinsic(MI);
+ return;
+ }
+ populateShadowStack(*CB, TLI, Map);
+ return;
+ }
+
+ if (ReturnInst *RetInst = dyn_cast<ReturnInst>(&Inst)) {
+ Value *RV = RetInst->getReturnValue();
+ if (RV == nullptr)
+ return; // This is a `ret void`.
+ Type *const VT = RV->getType();
+ Type *const ExtendedVT = Config.getExtendedFPType(VT);
+ if (ExtendedVT == nullptr)
+ return; // Not an FT ret.
+ Value *RVShadow = Map.getShadow(RV);
+ IRBuilder<> Builder(&Inst);
+ Builder.SetCurrentDebugLocation(RetInst->getDebugLoc());
+ RVShadow = emitCheck(RV, RVShadow, Builder, CheckLoc::makeRet());
+ ++NumInstrumentedFTRets;
+ // Store tag.
+ Value *FnAddr =
+ Builder.CreatePtrToInt(Inst.getParent()->getParent(), IntptrTy);
+ Builder.CreateStore(FnAddr, NsanShadowRetTag);
+ // Store value.
+ Value *ShadowRetValPtr = Builder.CreatePointerCast(
+ Builder.CreateConstGEP2_64(NsanShadowRetPtr, 0, 0),
+ ExtendedVT->getPointerTo());
+ Builder.CreateStore(RVShadow, ShadowRetValPtr);
+ return;
+ }
+
+ if (InsertValueInst *Insert = dyn_cast<InsertValueInst>(&Inst)) {
+ Value *V = Insert->getOperand(1);
+ Type *const VT = V->getType();
+ Type *const ExtendedVT = Config.getExtendedFPType(VT);
+ if (ExtendedVT == nullptr)
+ return;
+ IRBuilder<> Builder(Insert);
+ Builder.SetCurrentDebugLocation(Insert->getDebugLoc());
+ emitCheck(V, Map.getShadow(V), Builder, CheckLoc::makeInsert());
+ return;
+ }
+}
+
+// Moves fast math flags from the function to individual instructions, and
+// removes the attribute from the function.
+// FIXME: Make this controllable with a flag.
+static void moveFastMathFlags(Function &F,
+ std::vector<Instruction *> &Instructions) {
+ FastMathFlags FMF;
+#define MOVE_FLAG(attr, setter) \
+ if (F.getFnAttribute(attr).getValueAsString() == "true") { \
+ F.removeAttribute(AttributeList::FunctionIndex, attr); \
+ FMF.set##setter(); \
+ }
+ MOVE_FLAG("unsafe-fp-math", Fast)
+ MOVE_FLAG("no-infs-fp-math", NoInfs)
+ MOVE_FLAG("no-nans-fp-math", NoNaNs)
+ MOVE_FLAG("no-signed-zeros-fp-math", NoSignedZeros)
+#undef MOVE_FLAG
+
+ for (Instruction *I : Instructions)
+ if (isa<FPMathOperator>(I))
+ I->setFastMathFlags(FMF);
+}
+
+bool NumericalStabilitySanitizer::sanitizeFunction(
+ Function &F, const TargetLibraryInfo &TLI) {
+ // This is required to prevent instrumenting call to __nsan_init from within
+ // the module constructor.
+ if (F.getName() == kNsanModuleCtorName)
+ return false;
+ if (!Config.initialize(&F.getParent()->getContext()))
+ return false;
+ initialize(*F.getParent());
+ SmallVector<Instruction *, 8> AllLoadsAndStores;
+ SmallVector<Instruction *, 8> LocalLoadsAndStores;
+ if (!F.hasFnAttribute(Attribute::SanitizeNumericalStability))
+ return false;
+
+ // The instrumentation maintains:
+ // - for each IR value `v` of floating-point (or vector floating-point) type
+ // FT, a shadow IR value `s(v)` with twice the precision 2FT (e.g.
+ // double for float and fp128 for double).
+ // - A shadow memory, which stores `s(v)` for any `v` that has been stored,
+ // along with a shadow memory tag, which stores whether the value in the
+ // corresponding shadow memory is valid. Note that this might be
+ // incorrect if a non-instrumented function stores to memory, or if
+ // memory is stored to through a char pointer.
+ // - A shadow stack, which holds `s(v)` for any floating-point argument `v`
+ // of a call to an instrumented function. This allows
+ // instrumented functions to retrieve the shadow values for their
+ // arguments.
+ // Because instrumented functions can be called from non-instrumented
+ // functions, the stack needs to include a tag so that the instrumented
+ // function knows whether shadow values are available for its
+ // parameters (i.e. whether it was called by an instrumented function).
+ // When shadow arguments are not available, they have to be recreated by
+ // extending the precision of the non-shadow arguments to the non-shadow
+ // value. Non-instrumented functions do not modify (or even know about) the
+ // shadow stack. The shadow stack pointer is __nsan_shadow_args. The shadow
+ // stack tag is __nsan_shadow_args_tag. The tag is any unique identifier
+ // for the function (we use the address of the function). Both variables
+ // are thread local.
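+ // Conceptually (a sketch of the emitted logic, not literal code from this
+ // pass), the prologue of an instrumented function f(float a) behaves like:
+ //   if (__nsan_shadow_args_tag == (uintptr_t)&f)
+ //     s(a) = __nsan_shadow_args[0]; // the caller pushed a shadow for `a`
+ //   else
+ //     s(a) = (double)a;             // no shadow available, re-extend
+ //   __nsan_shadow_args_tag = 0;     // consume the tag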
+ // Example:
+ //   calls                              shadow stack tag    shadow stack
+ //   =====================================================================
+ //   non_instrumented_1()               0                   0
+ //       |
+ //       v
+ //   instrumented_2(float a)            0                   0
+ //       |
+ //       v
+ //   instrumented_3(float b, double c)  &instrumented_3     s(b),s(c)
+ //       |
+ //       v
+ //   instrumented_4(float d)            &instrumented_4     s(d)
+ //       |
+ //       v
+ //   non_instrumented_5(float e)        &non_instrumented_5 s(e)
+ //       |
+ //       v
+ //   instrumented_6(float f)            &non_instrumented_5 s(e)
+ //
+ // On entry, instrumented_2 checks whether the tag corresponds to its
+ // function ptr.
+ // Note that functions reset the tag to 0 after reading shadow parameters.
+ // This ensures that the function does not erroneously read invalid data if
+ // called twice in the same stack, once from an instrumented function and
+ // once from an uninstrumented one. For example, in the following example,
+ // resetting the tag in (A) ensures that (B) does not reuse the same
+ // shadow arguments (which would be incorrect).
+ //   instrumented_1(float a)
+ //       |
+ //       v
+ //   instrumented_2(float b) (A)
+ //       |
+ //       v
+ //   non_instrumented_3()
+ //       |
+ //       v
+ //   instrumented_2(float b) (B)
+ //
+ // - A shadow return slot. Any function that returns a floating-point value
+ // places a shadow return value in __nsan_shadow_ret_val. Again, because
+ // we might be calling non-instrumented functions, this value is guarded
+ // by a __nsan_shadow_ret_tag marker indicating which instrumented function
+ // placed the value in __nsan_shadow_ret_val, so that the caller can check
+ // that this corresponds to the callee. Both variables are thread local.
+ //
+ // For example, in the following example, the instrumentation in
+ // `instrumented_1` rejects the shadow return value from `instrumented_3`
+ // because it is not tagged as expected (`&instrumented_3` instead of
+ // `&non_instrumented_2`):
+ //
+ //   instrumented_1()
+ //       |
+ //       v
+ //   float non_instrumented_2()
+ //       |
+ //       v
+ //   float instrumented_3()
+ //
+ // Calls of known math functions (sin, cos, exp, ...) are duplicated to call
+ // their overload on the shadow type.
+
+ // Collect all instructions before processing, as creating shadow values
+ // creates new instructions inside the function.
+ std::vector<Instruction *> OriginalInstructions;
+ for (auto &BB : F) {
+ for (auto &Inst : BB) {
+ OriginalInstructions.emplace_back(&Inst);
+ }
+ }
+
+ moveFastMathFlags(F, OriginalInstructions);
+ ValueToShadowMap ValueToShadow(&Config);
+
+ // In the first pass, we create shadow values for all FT function arguments
+ // and all phis. This ensures that the DFS of the next pass does not have
+ // any loops.
+ std::vector<PHINode *> OriginalPhis;
+ createShadowArguments(F, TLI, ValueToShadow);
+ for (Instruction *I : OriginalInstructions) {
+ if (PHINode *Phi = dyn_cast<PHINode>(I)) {
+ if (PHINode *Shadow = maybeCreateShadowPhi(*Phi, TLI)) {
+ OriginalPhis.push_back(Phi);
+ ValueToShadow.setShadow(Phi, Shadow);
+ }
+ }
+ }
+
+ // Create shadow values for all instructions creating FT values.
+ for (Instruction *I : OriginalInstructions) {
+ maybeCreateShadowValue(*I, TLI, ValueToShadow);
+ }
+
+ // Propagate shadow values across stores, calls and rets.
+ for (Instruction *I : OriginalInstructions) {
+ propagateShadowValues(*I, TLI, ValueToShadow);
+ }
+
+ // The last pass populates shadow phis with shadow values.
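+ // (This has to run last: an incoming shadow may itself be an instruction
+ // shadow that is only created by the passes above.)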
+ for (PHINode *Phi : OriginalPhis) {
+ PHINode *ShadowPhi = dyn_cast<PHINode>(ValueToShadow.getShadow(Phi));
+ for (int I = 0, E = Phi->getNumOperands(); I < E; ++I) {
+ Value *V = Phi->getOperand(I);
+ Value *Shadow = ValueToShadow.getShadow(V);
+ BasicBlock *IncomingBB = Phi->getIncomingBlock(I);
+ // For some instructions (e.g. invoke), we create the shadow in a separate
+ // block, different from the block where the original value is created.
+ // In that case, the shadow phi might need to refer to this block instead
+ // of the original block.
+ // Note that this can only happen for instructions as constant shadows are
+ // always created in the same block.
+ ShadowPhi->addIncoming(Shadow, IncomingBB);
+ }
+ }
+
+ return !ValueToShadow.empty();
+}
+
+// Instrument the memory intrinsics so that they properly modify the shadow
+// memory.
+bool NumericalStabilitySanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
+ IRBuilder<> Builder(MI);
+ if (MemSetInst *M = dyn_cast<MemSetInst>(MI)) {
+ Builder.SetCurrentDebugLocation(M->getDebugLoc());
+ Builder.CreateCall(
+ NsanSetValueUnknown,
+ {// Address
+ Builder.CreatePointerCast(M->getArgOperand(0), Builder.getInt8PtrTy()),
+ // Size
+ Builder.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+ } else if (MemTransferInst *M = dyn_cast<MemTransferInst>(MI)) {
+ Builder.SetCurrentDebugLocation(M->getDebugLoc());
+ Builder.CreateCall(
+ NsanCopyValues,
+ {// Destination
+ Builder.CreatePointerCast(M->getArgOperand(0), Builder.getInt8PtrTy()),
+ // Source
+ Builder.CreatePointerCast(M->getArgOperand(1), Builder.getInt8PtrTy()),
+ // Size
+ Builder.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+ }
+ return false;
+}
+
+void NumericalStabilitySanitizer::maybeAddSuffixForNsanInterface(CallBase *CI) {
+ Function *Fn = CI->getCalledFunction();
+ if (Fn == nullptr)
+ return;
+
+ if (!Fn->getName().startswith("__nsan_"))
+ return;
+
+ if (Fn->getName() == "__nsan_dump_shadow_mem") {
+ assert(CI->getNumArgOperands() == 4 &&
+ "invalid prototype for __nsan_dump_shadow_mem");
+ // __nsan_dump_shadow_mem requires an extra parameter with the dynamic
+ // configuration:
+ // (shadow_type_id_for_long_double << 16) | (shadow_type_id_for_double << 8)
+ // | shadow_type_id_for_float
+ const uint64_t shadow_value_type_ids =
+ (static_cast<uint64_t>(Config.byValueType(kLongDouble).getNsanTypeId())
+ << 16) |
+ (static_cast<uint64_t>(Config.byValueType(kDouble).getNsanTypeId())
+ << 8) |
+ static_cast<uint64_t>(Config.byValueType(kFloat).getNsanTypeId());
+ CI->setArgOperand(3, ConstantInt::get(IntptrTy, shadow_value_type_ids));
+ }
+}
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -961,6 +961,7 @@
     case Attribute::ShadowCallStack:
     case Attribute::SanitizeAddress:
     case Attribute::SanitizeMemory:
+    case Attribute::SanitizeNumericalStability:
     case Attribute::SanitizeThread:
     case Attribute::SanitizeHWAddress:
     case Attribute::SanitizeMemTag:
diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
@@ -0,0 +1,965 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -nsan -nsan-shadow-type-mapping=dqq -nsan-truncate-fcmp-eq=false -S | FileCheck %s --check-prefixes=CHECK,DQQ
+; RUN: opt < %s -nsan -nsan-shadow-type-mapping=dlq 
-nsan-truncate-fcmp-eq=false -S | FileCheck %s --check-prefixes=CHECK,DLQ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; Tests with simple control flow. + +@float_const = private unnamed_addr constant float 0.5 +@x86_fp80_const = private unnamed_addr constant x86_fp80 0xK3FC9E69594BEC44DE000 +@double_const = private unnamed_addr constant double 0.5 + + +define float @return_param_float(float %a) sanitize_numericalstability { +; CHECK-LABEL: @return_param_float( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (float (float)* @return_param_float to i64) +; CHECK-NEXT: [[TMP2:%.*]] = load double, double* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to double*), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[A:%.*]] to double +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], double [[TMP2]], double [[TMP3]] +; CHECK-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @__nsan_internal_check_float_d(float [[A]], double [[TMP4]], i32 1, i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = fpext float [[A]] to double +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP6]], double [[TMP7]], double [[TMP4]] +; CHECK-NEXT: store i64 ptrtoint (float (float)* @return_param_float to i64), i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: store double [[TMP8]], double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: ret float [[A]] +; +entry: + ret float %a +} + +; Note that the shadow fadd should not have a `fast` flag. 
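+; (The pass builds shadow instructions with a fresh IRBuilder and does not
+; copy the original instruction's fast-math flags onto them, so the shadow
+; computation is not relaxed.)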
+define float @param_add_return_float(float %a) sanitize_numericalstability { +; CHECK-LABEL: @param_add_return_float( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (float (float)* @param_add_return_float to i64) +; CHECK-NEXT: [[TMP2:%.*]] = load double, double* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to double*), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[A:%.*]] to double +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], double [[TMP2]], double [[TMP3]] +; CHECK-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[B:%.*]] = fadd fast float [[A]], 1.000000e+00 +; CHECK-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 1.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @__nsan_internal_check_float_d(float [[B]], double [[TMP5]], i32 1, i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = fpext float [[B]] to double +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP7]], double [[TMP8]], double [[TMP5]] +; CHECK-NEXT: store i64 ptrtoint (float (float)* @param_add_return_float to i64), i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: store double [[TMP9]], double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: ret float [[B]] +; +entry: + %b = fadd fast float %a, 1.0 + ret float %b +} + +define x86_fp80 @param_add_return_x86_fp80(x86_fp80 %a) sanitize_numericalstability { +; CHECK-LABEL: @param_add_return_x86_fp80( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (x86_fp80 (x86_fp80)* @param_add_return_x86_fp80 to i64) +; CHECK-NEXT: [[TMP2:%.*]] = load fp128, fp128* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to fp128*), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = fpext x86_fp80 [[A:%.*]] to fp128 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], fp128 [[TMP2]], fp128 [[TMP3]] +; CHECK-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[B:%.*]] = fadd x86_fp80 [[A]], 0xK3FC9E69594BEC44DE000 +; CHECK-NEXT: [[TMP5:%.*]] = fadd fp128 [[TMP4]], 0xLC0000000000000003FC9CD2B297D889B +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[B]], fp128 [[TMP5]], i32 1, i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = fpext x86_fp80 [[B]] to fp128 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP7]], fp128 [[TMP8]], fp128 [[TMP5]] +; CHECK-NEXT: store i64 ptrtoint (x86_fp80 (x86_fp80)* @param_add_return_x86_fp80 to i64), i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: store fp128 [[TMP9]], fp128* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to fp128*), align 16 +; CHECK-NEXT: ret x86_fp80 [[B]] +; +entry: + %b = fadd x86_fp80 %a, 0xK3FC9E69594BEC44DE000 + ret x86_fp80 %b +} + +define double @param_add_return_double(double %a) sanitize_numericalstability { +; DQQ-LABEL: @param_add_return_double( +; DQQ-NEXT: entry: +; DQQ-NEXT: [[TMP0:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; DQQ-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (double (double)* @param_add_return_double to i64) +; DQQ-NEXT: [[TMP2:%.*]] = load fp128, fp128* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to fp128*), align 1 +; DQQ-NEXT: [[TMP3:%.*]] = fpext double [[A:%.*]] to fp128 +; DQQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], fp128 [[TMP2]], fp128 [[TMP3]] +; DQQ-NEXT: store i64 0, i64* 
@__nsan_shadow_args_tag, align 8
+; DQQ-NEXT: [[B:%.*]] = fadd double [[A]], 1.000000e+00
+; DQQ-NEXT: [[TMP5:%.*]] = fadd fp128 [[TMP4]], 0xL00000000000000003FFF000000000000
+; DQQ-NEXT: [[TMP6:%.*]] = call i32 @__nsan_internal_check_double_q(double [[B]], fp128 [[TMP5]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1
+; DQQ-NEXT: [[TMP8:%.*]] = fpext double [[B]] to fp128
+; DQQ-NEXT: [[TMP9:%.*]] = select i1 [[TMP7]], fp128 [[TMP8]], fp128 [[TMP5]]
+; DQQ-NEXT: store i64 ptrtoint (double (double)* @param_add_return_double to i64), i64* @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP9]], fp128* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to fp128*), align 16
+; DQQ-NEXT: ret double [[B]]
+;
+; DLQ-LABEL: @param_add_return_double(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[TMP0:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8
+; DLQ-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (double (double)* @param_add_return_double to i64)
+; DLQ-NEXT: [[TMP2:%.*]] = load x86_fp80, x86_fp80* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to x86_fp80*), align 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[A:%.*]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], x86_fp80 [[TMP2]], x86_fp80 [[TMP3]]
+; DLQ-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8
+; DLQ-NEXT: [[B:%.*]] = fadd double [[A]], 1.000000e+00
+; DLQ-NEXT: [[TMP5:%.*]] = fadd x86_fp80 [[TMP4]], 0xK3FFF8000000000000000
+; DLQ-NEXT: [[TMP6:%.*]] = call i32 @__nsan_internal_check_double_l(double [[B]], x86_fp80 [[TMP5]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1
+; DLQ-NEXT: [[TMP8:%.*]] = fpext double [[B]] to x86_fp80
+; DLQ-NEXT: [[TMP9:%.*]] = select i1 [[TMP7]], x86_fp80 [[TMP8]], x86_fp80 [[TMP5]]
+; DLQ-NEXT: store i64 ptrtoint (double (double)* @param_add_return_double to i64), i64* @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP9]], x86_fp80* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to x86_fp80*), align 16
+; DLQ-NEXT: ret double [[B]]
+;
+entry:
+ %b = fadd double %a, 1.0
+ ret double %b
+}
+
+define <2 x float> @return_param_add_return_float_vector(<2 x float> %a) sanitize_numericalstability {
+; CHECK-LABEL: @return_param_add_return_float_vector(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (<2 x float> (<2 x float>)* @return_param_add_return_float_vector to i64)
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to <2 x double>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[A:%.*]] to <2 x double>
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]]
+; CHECK-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8
+; CHECK-NEXT: [[B:%.*]] = fadd <2 x float> [[A]], <float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], <double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[B]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__nsan_internal_check_float_d(float [[TMP6]], double [[TMP7]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[B]], i64 1
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @__nsan_internal_check_float_d(float [[TMP9]], double [[TMP10]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP12:%.*]] = or i32 [[TMP8]], [[TMP11]]
+; 
CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = fpext <2 x float> [[B]] to <2 x double>
+; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP5]]
+; CHECK-NEXT: store i64 ptrtoint (<2 x float> (<2 x float>)* @return_param_add_return_float_vector to i64), i64* @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store <2 x double> [[TMP15]], <2 x double>* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to <2 x double>*), align 16
+; CHECK-NEXT: ret <2 x float> [[B]]
+;
+entry:
+ %b = fadd <2 x float> %a, <float 1.0, float 1.0>
+ ret <2 x float> %b
+}
+
+; TODO: This is ignored for now.
+define [2 x float] @return_param_float_array([2 x float] %a) sanitize_numericalstability {
+; CHECK-LABEL: @return_param_float_array(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret [2 x float] [[A:%.*]]
+;
+entry:
+ ret [2 x float] %a
+}
+
+define void @constantload_add_store_float(float* %dst) sanitize_numericalstability {
+; CHECK-LABEL: @constantload_add_store_float(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[B:%.*]] = load float, float* @float_const
+; CHECK-NEXT: [[TMP0:%.*]] = fpext float [[B]] to double
+; CHECK-NEXT: [[C:%.*]] = fadd float [[B]], 1.000000e+00
+; CHECK-NEXT: [[TMP1:%.*]] = fadd double [[TMP0]], 1.000000e+00
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[DST:%.*]] to i8*
+; CHECK-NEXT: [[TMP3:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_store(i8* [[TMP2]], i64 1)
+; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint float* [[DST]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @__nsan_internal_check_float_d(float [[C]], double [[TMP1]], i32 4, i64 [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = fpext float [[C]] to double
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP6]], double [[TMP7]], double [[TMP1]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP3]] to double*
+; CHECK-NEXT: store double [[TMP8]], double* [[TMP9]], align 1
+; CHECK-NEXT: store float [[C]], float* [[DST]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %b = load float, float* @float_const
+ %c = fadd float %b, 1.0
+ store float %c, float* %dst, align 1
+ ret void
+}
+
+define void @constantload_add_store_x86_fp80(x86_fp80* %dst) sanitize_numericalstability {
+; CHECK-LABEL: @constantload_add_store_x86_fp80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[B:%.*]] = load x86_fp80, x86_fp80* @x86_fp80_const
+; CHECK-NEXT: [[TMP0:%.*]] = fpext x86_fp80 [[B]] to fp128
+; CHECK-NEXT: [[C:%.*]] = fadd x86_fp80 [[B]], 0xK3FC9E69594BEC44DE000
+; CHECK-NEXT: [[TMP1:%.*]] = fadd fp128 [[TMP0]], 0xLC0000000000000003FC9CD2B297D889B
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_fp80* [[DST:%.*]] to i8*
+; CHECK-NEXT: [[TMP3:%.*]] = call i8* @__nsan_get_shadow_ptr_for_longdouble_store(i8* [[TMP2]], i64 1)
+; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint x86_fp80* [[DST]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[C]], fp128 [[TMP1]], i32 4, i64 [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = fpext x86_fp80 [[C]] to fp128
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP6]], fp128 [[TMP7]], fp128 [[TMP1]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP3]] to fp128*
+; CHECK-NEXT: store fp128 [[TMP8]], fp128* [[TMP9]], align 1
+; CHECK-NEXT: store x86_fp80 [[C]], x86_fp80* [[DST]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %b = load x86_fp80, x86_fp80* @x86_fp80_const
+ %c = fadd x86_fp80 %b, 0xK3FC9E69594BEC44DE000
+ store x86_fp80 %c, x86_fp80* %dst, align 1
+ ret void
+}
+
+define void 
@constantload_add_store_double(double* %dst) sanitize_numericalstability { +; DQQ-LABEL: @constantload_add_store_double( +; DQQ-NEXT: entry: +; DQQ-NEXT: [[B:%.*]] = load double, double* @double_const +; DQQ-NEXT: [[TMP0:%.*]] = fpext double [[B]] to fp128 +; DQQ-NEXT: [[C:%.*]] = fadd double [[B]], 1.000000e+00 +; DQQ-NEXT: [[TMP1:%.*]] = fadd fp128 [[TMP0]], 0xL00000000000000003FFF000000000000 +; DQQ-NEXT: [[TMP2:%.*]] = bitcast double* [[DST:%.*]] to i8* +; DQQ-NEXT: [[TMP3:%.*]] = call i8* @__nsan_get_shadow_ptr_for_double_store(i8* [[TMP2]], i64 1) +; DQQ-NEXT: [[TMP4:%.*]] = ptrtoint double* [[DST]] to i64 +; DQQ-NEXT: [[TMP5:%.*]] = call i32 @__nsan_internal_check_double_q(double [[C]], fp128 [[TMP1]], i32 4, i64 [[TMP4]]) +; DQQ-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1 +; DQQ-NEXT: [[TMP7:%.*]] = fpext double [[C]] to fp128 +; DQQ-NEXT: [[TMP8:%.*]] = select i1 [[TMP6]], fp128 [[TMP7]], fp128 [[TMP1]] +; DQQ-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP3]] to fp128* +; DQQ-NEXT: store fp128 [[TMP8]], fp128* [[TMP9]], align 1 +; DQQ-NEXT: store double [[C]], double* [[DST]], align 1 +; DQQ-NEXT: ret void +; +; DLQ-LABEL: @constantload_add_store_double( +; DLQ-NEXT: entry: +; DLQ-NEXT: [[B:%.*]] = load double, double* @double_const +; DLQ-NEXT: [[TMP0:%.*]] = fpext double [[B]] to x86_fp80 +; DLQ-NEXT: [[C:%.*]] = fadd double [[B]], 1.000000e+00 +; DLQ-NEXT: [[TMP1:%.*]] = fadd x86_fp80 [[TMP0]], 0xK3FFF8000000000000000 +; DLQ-NEXT: [[TMP2:%.*]] = bitcast double* [[DST:%.*]] to i8* +; DLQ-NEXT: [[TMP3:%.*]] = call i8* @__nsan_get_shadow_ptr_for_double_store(i8* [[TMP2]], i64 1) +; DLQ-NEXT: [[TMP4:%.*]] = ptrtoint double* [[DST]] to i64 +; DLQ-NEXT: [[TMP5:%.*]] = call i32 @__nsan_internal_check_double_l(double [[C]], x86_fp80 [[TMP1]], i32 4, i64 [[TMP4]]) +; DLQ-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1 +; DLQ-NEXT: [[TMP7:%.*]] = fpext double [[C]] to x86_fp80 +; DLQ-NEXT: [[TMP8:%.*]] = select i1 [[TMP6]], x86_fp80 [[TMP7]], x86_fp80 [[TMP1]] +; DLQ-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP3]] to x86_fp80* +; DLQ-NEXT: store x86_fp80 [[TMP8]], x86_fp80* [[TMP9]], align 1 +; DLQ-NEXT: store double [[C]], double* [[DST]], align 1 +; DLQ-NEXT: ret void +; +entry: + %b = load double, double* @double_const + %c = fadd double %b, 1.0 + store double %c, double* %dst, align 1 + ret void +} + +define void @load_add_store_float(float* %a) sanitize_numericalstability { +; CHECK-LABEL: @load_add_store_float( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B:%.*]] = load float, float* [[A:%.*]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_load(i8* [[TMP0]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP1]], null +; CHECK-NEXT: br i1 [[TMP2]], label [[TMP6:%.*]], label [[TMP3:%.*]] +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1]] to double* +; CHECK-NEXT: [[TMP5:%.*]] = load double, double* [[TMP4]], align 1 +; CHECK-NEXT: br label [[TMP8:%.*]] +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = fpext float [[B]] to double +; CHECK-NEXT: br label [[TMP8]] +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = phi double [ [[TMP5]], [[TMP3]] ], [ [[TMP7]], [[TMP6]] ] +; CHECK-NEXT: [[C:%.*]] = fadd float [[B]], 1.000000e+00 +; CHECK-NEXT: [[TMP10:%.*]] = fadd double [[TMP9]], 1.000000e+00 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[A]] to i8* +; CHECK-NEXT: [[TMP12:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_store(i8* [[TMP11]], i64 1) +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint float* [[A]] to i64 +; 
CHECK-NEXT: [[TMP14:%.*]] = call i32 @__nsan_internal_check_float_d(float [[C]], double [[TMP10]], i32 4, i64 [[TMP13]]) +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = fpext float [[C]] to double +; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], double [[TMP16]], double [[TMP10]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP12]] to double* +; CHECK-NEXT: store double [[TMP17]], double* [[TMP18]], align 1 +; CHECK-NEXT: store float [[C]], float* [[A]], align 1 +; CHECK-NEXT: ret void +; +entry: + %b = load float, float* %a, align 1 + %c = fadd float %b, 1.0 + store float %c, float* %a, align 1 + ret void +} + +define void @load_add_store_x86_fp80(x86_fp80* %a) sanitize_numericalstability { +; CHECK-LABEL: @load_add_store_x86_fp80( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B:%.*]] = load x86_fp80, x86_fp80* [[A:%.*]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast x86_fp80* [[A]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = call i8* @__nsan_get_shadow_ptr_for_longdouble_load(i8* [[TMP0]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP1]], null +; CHECK-NEXT: br i1 [[TMP2]], label [[TMP6:%.*]], label [[TMP3:%.*]] +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1]] to fp128* +; CHECK-NEXT: [[TMP5:%.*]] = load fp128, fp128* [[TMP4]], align 1 +; CHECK-NEXT: br label [[TMP8:%.*]] +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = fpext x86_fp80 [[B]] to fp128 +; CHECK-NEXT: br label [[TMP8]] +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = phi fp128 [ [[TMP5]], [[TMP3]] ], [ [[TMP7]], [[TMP6]] ] +; CHECK-NEXT: [[C:%.*]] = fadd x86_fp80 [[B]], 0xK3FC9E69594BEC44DE000 +; CHECK-NEXT: [[TMP10:%.*]] = fadd fp128 [[TMP9]], 0xLC0000000000000003FC9CD2B297D889B +; CHECK-NEXT: [[TMP11:%.*]] = bitcast x86_fp80* [[A]] to i8* +; CHECK-NEXT: [[TMP12:%.*]] = call i8* @__nsan_get_shadow_ptr_for_longdouble_store(i8* [[TMP11]], i64 1) +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint x86_fp80* [[A]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[C]], fp128 [[TMP10]], i32 4, i64 [[TMP13]]) +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = fpext x86_fp80 [[C]] to fp128 +; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], fp128 [[TMP16]], fp128 [[TMP10]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP12]] to fp128* +; CHECK-NEXT: store fp128 [[TMP17]], fp128* [[TMP18]], align 1 +; CHECK-NEXT: store x86_fp80 [[C]], x86_fp80* [[A]], align 1 +; CHECK-NEXT: ret void +; +entry: + %b = load x86_fp80, x86_fp80* %a, align 1 + %c = fadd x86_fp80 %b, 0xK3FC9E69594BEC44DE000 + store x86_fp80 %c, x86_fp80* %a, align 1 + ret void +} + +define void @load_add_store_double(double* %a) sanitize_numericalstability { +; DQQ-LABEL: @load_add_store_double( +; DQQ-NEXT: entry: +; DQQ-NEXT: [[B:%.*]] = load double, double* [[A:%.*]], align 1 +; DQQ-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to i8* +; DQQ-NEXT: [[TMP1:%.*]] = call i8* @__nsan_get_shadow_ptr_for_double_load(i8* [[TMP0]], i64 1) +; DQQ-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP1]], null +; DQQ-NEXT: br i1 [[TMP2]], label [[TMP6:%.*]], label [[TMP3:%.*]] +; DQQ: 3: +; DQQ-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1]] to fp128* +; DQQ-NEXT: [[TMP5:%.*]] = load fp128, fp128* [[TMP4]], align 1 +; DQQ-NEXT: br label [[TMP8:%.*]] +; DQQ: 6: +; DQQ-NEXT: [[TMP7:%.*]] = fpext double [[B]] to fp128 +; DQQ-NEXT: br label [[TMP8]] +; DQQ: 8: +; DQQ-NEXT: [[TMP9:%.*]] = phi fp128 [ [[TMP5]], [[TMP3]] ], [ [[TMP7]], [[TMP6]] ] +; DQQ-NEXT: [[C:%.*]] = fadd double [[B]], 1.000000e+00 +; 
DQQ-NEXT: [[TMP10:%.*]] = fadd fp128 [[TMP9]], 0xL00000000000000003FFF000000000000 +; DQQ-NEXT: [[TMP11:%.*]] = bitcast double* [[A]] to i8* +; DQQ-NEXT: [[TMP12:%.*]] = call i8* @__nsan_get_shadow_ptr_for_double_store(i8* [[TMP11]], i64 1) +; DQQ-NEXT: [[TMP13:%.*]] = ptrtoint double* [[A]] to i64 +; DQQ-NEXT: [[TMP14:%.*]] = call i32 @__nsan_internal_check_double_q(double [[C]], fp128 [[TMP10]], i32 4, i64 [[TMP13]]) +; DQQ-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1 +; DQQ-NEXT: [[TMP16:%.*]] = fpext double [[C]] to fp128 +; DQQ-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], fp128 [[TMP16]], fp128 [[TMP10]] +; DQQ-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP12]] to fp128* +; DQQ-NEXT: store fp128 [[TMP17]], fp128* [[TMP18]], align 1 +; DQQ-NEXT: store double [[C]], double* [[A]], align 1 +; DQQ-NEXT: ret void +; +; DLQ-LABEL: @load_add_store_double( +; DLQ-NEXT: entry: +; DLQ-NEXT: [[B:%.*]] = load double, double* [[A:%.*]], align 1 +; DLQ-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to i8* +; DLQ-NEXT: [[TMP1:%.*]] = call i8* @__nsan_get_shadow_ptr_for_double_load(i8* [[TMP0]], i64 1) +; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP1]], null +; DLQ-NEXT: br i1 [[TMP2]], label [[TMP6:%.*]], label [[TMP3:%.*]] +; DLQ: 3: +; DLQ-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1]] to x86_fp80* +; DLQ-NEXT: [[TMP5:%.*]] = load x86_fp80, x86_fp80* [[TMP4]], align 1 +; DLQ-NEXT: br label [[TMP8:%.*]] +; DLQ: 6: +; DLQ-NEXT: [[TMP7:%.*]] = fpext double [[B]] to x86_fp80 +; DLQ-NEXT: br label [[TMP8]] +; DLQ: 8: +; DLQ-NEXT: [[TMP9:%.*]] = phi x86_fp80 [ [[TMP5]], [[TMP3]] ], [ [[TMP7]], [[TMP6]] ] +; DLQ-NEXT: [[C:%.*]] = fadd double [[B]], 1.000000e+00 +; DLQ-NEXT: [[TMP10:%.*]] = fadd x86_fp80 [[TMP9]], 0xK3FFF8000000000000000 +; DLQ-NEXT: [[TMP11:%.*]] = bitcast double* [[A]] to i8* +; DLQ-NEXT: [[TMP12:%.*]] = call i8* @__nsan_get_shadow_ptr_for_double_store(i8* [[TMP11]], i64 1) +; DLQ-NEXT: [[TMP13:%.*]] = ptrtoint double* [[A]] to i64 +; DLQ-NEXT: [[TMP14:%.*]] = call i32 @__nsan_internal_check_double_l(double [[C]], x86_fp80 [[TMP10]], i32 4, i64 [[TMP13]]) +; DLQ-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1 +; DLQ-NEXT: [[TMP16:%.*]] = fpext double [[C]] to x86_fp80 +; DLQ-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], x86_fp80 [[TMP16]], x86_fp80 [[TMP10]] +; DLQ-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP12]] to x86_fp80* +; DLQ-NEXT: store x86_fp80 [[TMP17]], x86_fp80* [[TMP18]], align 1 +; DLQ-NEXT: store double [[C]], double* [[A]], align 1 +; DLQ-NEXT: ret void +; +entry: + %b = load double, double* %a, align 1 + %c = fadd double %b, 1.0 + store double %c, double* %a, align 1 + ret void +} + +define void @load_add_store_vector(<2 x float>* %a) sanitize_numericalstability { +; CHECK-LABEL: @load_add_store_vector( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float>* [[A]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_load(i8* [[TMP0]], i64 2) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP1]], null +; CHECK-NEXT: br i1 [[TMP2]], label [[TMP6:%.*]], label [[TMP3:%.*]] +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1]] to <2 x double>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 1 +; CHECK-NEXT: br label [[TMP8:%.*]] +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = fpext <2 x float> [[B]] to <2 x double> +; CHECK-NEXT: br label [[TMP8]] +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x double> [ [[TMP5]], [[TMP3]] ], [ [[TMP7]], [[TMP6]] 
]
+; CHECK-NEXT: [[C:%.*]] = fadd <2 x float> [[B]], <float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP9]], <double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x float>* [[A]] to i8*
+; CHECK-NEXT: [[TMP12:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_store(i8* [[TMP11]], i64 2)
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[C]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint <2 x float>* [[A]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = call i32 @__nsan_internal_check_float_d(float [[TMP13]], double [[TMP14]], i32 4, i64 [[TMP15]])
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[C]], i64 1
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[TMP10]], i64 1
+; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint <2 x float>* [[A]] to i64
+; CHECK-NEXT: [[TMP20:%.*]] = call i32 @__nsan_internal_check_float_d(float [[TMP17]], double [[TMP18]], i32 4, i64 [[TMP19]])
+; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP16]], [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP21]], 1
+; CHECK-NEXT: [[TMP23:%.*]] = fpext <2 x float> [[C]] to <2 x double>
+; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], <2 x double> [[TMP23]], <2 x double> [[TMP10]]
+; CHECK-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP12]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP24]], <2 x double>* [[TMP25]], align 1
+; CHECK-NEXT: store <2 x float> [[C]], <2 x float>* [[A]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %b = load <2 x float>, <2 x float>* %a, align 1
+ %c = fadd <2 x float> %b, <float 1.0, float 1.0>
+ store <2 x float> %c, <2 x float>* %a, align 1
+ ret void
+}
+
+declare float @returns_float()
+
+define void @call_fn_returning_float(float* %dst) sanitize_numericalstability {
+; CHECK-LABEL: @call_fn_returning_float(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[B:%.*]] = call float @returns_float()
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (float ()* @returns_float to i64)
+; CHECK-NEXT: [[TMP2:%.*]] = load double, double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[B]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], double [[TMP2]], double [[TMP3]]
+; CHECK-NEXT: [[C:%.*]] = fadd float [[B]], 1.000000e+00
+; CHECK-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 1.000000e+00
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[DST:%.*]] to i8*
+; CHECK-NEXT: [[TMP7:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_store(i8* [[TMP6]], i64 1)
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint float* [[DST]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__nsan_internal_check_float_d(float [[C]], double [[TMP5]], i32 4, i64 [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = fpext float [[C]] to double
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP10]], double [[TMP11]], double [[TMP5]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP7]] to double*
+; CHECK-NEXT: store double [[TMP12]], double* [[TMP13]], align 1
+; CHECK-NEXT: store float [[C]], float* [[DST]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %b = call float @returns_float()
+ %c = fadd float %b, 1.0
+ store float %c, float* %dst, align 1
+ ret void
+}
+
+define float @return_fn_returning_float(float* %dst) sanitize_numericalstability {
+; CHECK-LABEL: @return_fn_returning_float(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[B:%.*]] = call float @returns_float()
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, 
i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (float ()* @returns_float to i64) +; CHECK-NEXT: [[TMP2:%.*]] = load double, double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[B]] to double +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], double [[TMP2]], double [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @__nsan_internal_check_float_d(float [[B]], double [[TMP4]], i32 1, i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = fpext float [[B]] to double +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP6]], double [[TMP7]], double [[TMP4]] +; CHECK-NEXT: store i64 ptrtoint (float (float*)* @return_fn_returning_float to i64), i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: store double [[TMP8]], double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: ret float [[B]] +; +entry: + %b = call float @returns_float() + ret float %b +} + +declare void @takes_floats(float %a, i8 %b, double %c, x86_fp80 %d) + +define void @call_fn_taking_float() sanitize_numericalstability { +; DQQ-LABEL: @call_fn_taking_float( +; DQQ-NEXT: entry: +; DQQ-NEXT: store i64 ptrtoint (void (float, i8, double, x86_fp80)* @takes_floats to i64), i64* @__nsan_shadow_args_tag, align 8 +; DQQ-NEXT: store double 1.000000e+00, double* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to double*), align 1 +; DQQ-NEXT: store fp128 0xL00000000000000004000800000000000, fp128* bitcast (i8* getelementptr inbounds ([16384 x i8], [16384 x i8]* @__nsan_shadow_args_ptr, i64 0, i64 8) to fp128*), align 1 +; DQQ-NEXT: store fp128 0xLC0000000000000003FC9CD2B297D889B, fp128* bitcast (i8* getelementptr inbounds ([16384 x i8], [16384 x i8]* @__nsan_shadow_args_ptr, i64 0, i64 24) to fp128*), align 1 +; DQQ-NEXT: call void @takes_floats(float 1.000000e+00, i8 2, double 3.000000e+00, x86_fp80 0xK3FC9E69594BEC44DE000) +; DQQ-NEXT: ret void +; +; DLQ-LABEL: @call_fn_taking_float( +; DLQ-NEXT: entry: +; DLQ-NEXT: store i64 ptrtoint (void (float, i8, double, x86_fp80)* @takes_floats to i64), i64* @__nsan_shadow_args_tag, align 8 +; DLQ-NEXT: store double 1.000000e+00, double* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to double*), align 1 +; DLQ-NEXT: store x86_fp80 0xK4000C000000000000000, x86_fp80* bitcast (i8* getelementptr inbounds ([16384 x i8], [16384 x i8]* @__nsan_shadow_args_ptr, i64 0, i64 8) to x86_fp80*), align 1 +; DLQ-NEXT: store fp128 0xLC0000000000000003FC9CD2B297D889B, fp128* bitcast (i8* getelementptr inbounds ([16384 x i8], [16384 x i8]* @__nsan_shadow_args_ptr, i64 0, i64 18) to fp128*), align 1 +; DLQ-NEXT: call void @takes_floats(float 1.000000e+00, i8 2, double 3.000000e+00, x86_fp80 0xK3FC9E69594BEC44DE000) +; DLQ-NEXT: ret void +; +entry: + call void @takes_floats(float 1.0, i8 2, double 3.0, x86_fp80 0xK3FC9E69594BEC44DE000) + ret void +} + +declare float @llvm.sin.f32(float) readnone + +define float @call_sin_intrinsic() sanitize_numericalstability { +; CHECK-LABEL: @call_sin_intrinsic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[R:%.*]] = call float @llvm.sin.f32(float 1.000000e+00) +; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.sin.f64(double 1.000000e+00) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double 
[[TMP3]], double [[TMP0]] +; CHECK-NEXT: store i64 ptrtoint (float ()* @call_sin_intrinsic to i64), i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: store double [[TMP4]], double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: ret float [[R]] +; +entry: + %r = call float @llvm.sin.f32(float 1.0) + ret float %r +} + +declare float @sinf(float) + +define float @call_sinf_libfunc() sanitize_numericalstability { +; CHECK-LABEL: @call_sinf_libfunc( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[R:%.*]] = call float @sinf(float 1.000000e+00) #4 +; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.sin.f64(double 1.000000e+00) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]] +; CHECK-NEXT: store i64 ptrtoint (float ()* @call_sinf_libfunc to i64), i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: store double [[TMP4]], double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: ret float [[R]] +; +entry: + %r = call float @sinf(float 1.0) + ret float %r +} + +declare double @sin(double) + +; FIXME: nsan uses `sin(double)` for fp128. +define double @call_sin_libfunc() sanitize_numericalstability { +; DQQ-LABEL: @call_sin_libfunc( +; DQQ-NEXT: entry: +; DQQ-NEXT: [[R:%.*]] = call double @sin(double 1.000000e+00) #4 +; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sin.f80(x86_fp80 0xK3FFF8000000000000000) +; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128 +; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0) +; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1 +; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128 +; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]] +; DQQ-NEXT: store i64 ptrtoint (double ()* @call_sin_libfunc to i64), i64* @__nsan_shadow_ret_tag, align 8 +; DQQ-NEXT: store fp128 [[TMP5]], fp128* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to fp128*), align 16 +; DQQ-NEXT: ret double [[R]] +; +; DLQ-LABEL: @call_sin_libfunc( +; DLQ-NEXT: entry: +; DLQ-NEXT: [[R:%.*]] = call double @sin(double 1.000000e+00) #4 +; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sin.f80(x86_fp80 0xK3FFF8000000000000000) +; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0) +; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1 +; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80 +; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]] +; DLQ-NEXT: store i64 ptrtoint (double ()* @call_sin_libfunc to i64), i64* @__nsan_shadow_ret_tag, align 8 +; DLQ-NEXT: store x86_fp80 [[TMP4]], x86_fp80* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to x86_fp80*), align 16 +; DLQ-NEXT: ret double [[R]] +; +entry: + %r = call double @sin(double 1.0) + ret double %r +} + +declare double @frexp(double, i32*) + +define double @call_frexp_libfunc_nointrinsic(double %0, i32* nocapture %1) sanitize_numericalstability { +; DQQ-LABEL: @call_frexp_libfunc_nointrinsic( +; DQQ-NEXT: [[TMP3:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; DQQ-NEXT: [[TMP4:%.*]] = icmp eq i64 [[TMP3]], ptrtoint (double (double, i32*)* @call_frexp_libfunc_nointrinsic to i64) +; DQQ-NEXT: [[TMP5:%.*]] = load fp128, fp128* bitcast ([16384 x i8]* 
@__nsan_shadow_args_ptr to fp128*), align 1 +; DQQ-NEXT: [[TMP6:%.*]] = fpext double [[TMP0:%.*]] to fp128 +; DQQ-NEXT: [[TMP7:%.*]] = select i1 [[TMP4]], fp128 [[TMP5]], fp128 [[TMP6]] +; DQQ-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; DQQ-NEXT: [[TMP8:%.*]] = call i32 @__nsan_internal_check_double_q(double [[TMP0]], fp128 [[TMP7]], i32 2, i64 0) +; DQQ-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1 +; DQQ-NEXT: [[TMP10:%.*]] = fpext double [[TMP0]] to fp128 +; DQQ-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], fp128 [[TMP10]], fp128 [[TMP7]] +; DQQ-NEXT: [[TMP12:%.*]] = tail call double @frexp(double [[TMP0]], i32* [[TMP1:%.*]]) +; DQQ-NEXT: [[TMP13:%.*]] = load i64, i64* @__nsan_shadow_ret_tag, align 8 +; DQQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], ptrtoint (double (double, i32*)* @frexp to i64) +; DQQ-NEXT: [[TMP15:%.*]] = load fp128, fp128* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to fp128*), align 16 +; DQQ-NEXT: [[TMP16:%.*]] = fpext double [[TMP12]] to fp128 +; DQQ-NEXT: [[TMP17:%.*]] = select i1 [[TMP14]], fp128 [[TMP15]], fp128 [[TMP16]] +; DQQ-NEXT: [[TMP18:%.*]] = call i32 @__nsan_internal_check_double_q(double [[TMP12]], fp128 [[TMP17]], i32 1, i64 0) +; DQQ-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], 1 +; DQQ-NEXT: [[TMP20:%.*]] = fpext double [[TMP12]] to fp128 +; DQQ-NEXT: [[TMP21:%.*]] = select i1 [[TMP19]], fp128 [[TMP20]], fp128 [[TMP17]] +; DQQ-NEXT: store i64 ptrtoint (double (double, i32*)* @call_frexp_libfunc_nointrinsic to i64), i64* @__nsan_shadow_ret_tag, align 8 +; DQQ-NEXT: store fp128 [[TMP21]], fp128* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to fp128*), align 16 +; DQQ-NEXT: ret double [[TMP12]] +; +; DLQ-LABEL: @call_frexp_libfunc_nointrinsic( +; DLQ-NEXT: [[TMP3:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; DLQ-NEXT: [[TMP4:%.*]] = icmp eq i64 [[TMP3]], ptrtoint (double (double, i32*)* @call_frexp_libfunc_nointrinsic to i64) +; DLQ-NEXT: [[TMP5:%.*]] = load x86_fp80, x86_fp80* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to x86_fp80*), align 1 +; DLQ-NEXT: [[TMP6:%.*]] = fpext double [[TMP0:%.*]] to x86_fp80 +; DLQ-NEXT: [[TMP7:%.*]] = select i1 [[TMP4]], x86_fp80 [[TMP5]], x86_fp80 [[TMP6]] +; DLQ-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; DLQ-NEXT: [[TMP8:%.*]] = call i32 @__nsan_internal_check_double_l(double [[TMP0]], x86_fp80 [[TMP7]], i32 2, i64 0) +; DLQ-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1 +; DLQ-NEXT: [[TMP10:%.*]] = fpext double [[TMP0]] to x86_fp80 +; DLQ-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], x86_fp80 [[TMP10]], x86_fp80 [[TMP7]] +; DLQ-NEXT: [[TMP12:%.*]] = tail call double @frexp(double [[TMP0]], i32* [[TMP1:%.*]]) +; DLQ-NEXT: [[TMP13:%.*]] = load i64, i64* @__nsan_shadow_ret_tag, align 8 +; DLQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], ptrtoint (double (double, i32*)* @frexp to i64) +; DLQ-NEXT: [[TMP15:%.*]] = load x86_fp80, x86_fp80* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to x86_fp80*), align 16 +; DLQ-NEXT: [[TMP16:%.*]] = fpext double [[TMP12]] to x86_fp80 +; DLQ-NEXT: [[TMP17:%.*]] = select i1 [[TMP14]], x86_fp80 [[TMP15]], x86_fp80 [[TMP16]] +; DLQ-NEXT: [[TMP18:%.*]] = call i32 @__nsan_internal_check_double_l(double [[TMP12]], x86_fp80 [[TMP17]], i32 1, i64 0) +; DLQ-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], 1 +; DLQ-NEXT: [[TMP20:%.*]] = fpext double [[TMP12]] to x86_fp80 +; DLQ-NEXT: [[TMP21:%.*]] = select i1 [[TMP19]], x86_fp80 [[TMP20]], x86_fp80 [[TMP17]] +; DLQ-NEXT: store i64 ptrtoint (double (double, i32*)* @call_frexp_libfunc_nointrinsic to i64), i64* 
@__nsan_shadow_ret_tag, align 8 +; DLQ-NEXT: store x86_fp80 [[TMP21]], x86_fp80* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to x86_fp80*), align 16 +; DLQ-NEXT: ret double [[TMP12]] +; + %3 = tail call double @frexp(double %0, i32* %1) + ret double %3 +} + +define float @call_fn_taking_float_by_fn_ptr(float (float)* nocapture %fn_ptr) sanitize_numericalstability { +; CHECK-LABEL: @call_fn_taking_float_by_fn_ptr( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint float (float)* [[FN_PTR:%.*]] to i64 +; CHECK-NEXT: store i64 [[TMP0]], i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: store double 1.000000e+00, double* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to double*), align 1 +; CHECK-NEXT: [[R:%.*]] = call float [[FN_PTR]](float 1.000000e+00) +; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint float (float)* [[FN_PTR]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load double, double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = fpext float [[R]] to double +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], double [[TMP4]], double [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP6]], i32 1, i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = fpext float [[R]] to double +; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP8]], double [[TMP9]], double [[TMP6]] +; CHECK-NEXT: store i64 ptrtoint (float (float (float)*)* @call_fn_taking_float_by_fn_ptr to i64), i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: store double [[TMP10]], double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: ret float [[R]] +; +entry: + %r = call float %fn_ptr(float 1.0) + ret float %r +} + +define void @store_float(float* %dst) sanitize_numericalstability { +; CHECK-LABEL: @store_float( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[DST:%.*]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_store(i8* [[TMP0]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double* +; CHECK-NEXT: store double 4.200000e+01, double* [[TMP2]], align 1 +; CHECK-NEXT: store float 4.200000e+01, float* [[DST]], align 1 +; CHECK-NEXT: ret void +; +entry: + store float 42.0, float* %dst, align 1 + ret void +} + +define void @store_non_float(i32* %dst) sanitize_numericalstability { +; CHECK-LABEL: @store_non_float( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 42, i32* [[DST:%.*]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[DST]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[DST]] to i8* +; CHECK-NEXT: [[TMP2:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_store(i8* [[TMP1]], i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to double* +; CHECK-NEXT: store double 0x36F5000000000000, double* [[TMP3]], align 1 +; CHECK-NEXT: ret void +; +entry: + store i32 42, i32* %dst, align 1 + ret void +} + +define i1 @inline_asm(double %0) sanitize_numericalstability { +; DQQ-LABEL: @inline_asm( +; DQQ-NEXT: entry: +; DQQ-NEXT: [[TMP1:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; DQQ-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], ptrtoint (i1 (double)* @inline_asm to i64) +; DQQ-NEXT: [[TMP3:%.*]] = load fp128, fp128* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to fp128*), align 1 +; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[TMP0:%.*]] to fp128 +; 
DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], fp128 [[TMP3]], fp128 [[TMP4]] +; DQQ-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; DQQ-NEXT: [[TMP6:%.*]] = call i32 asm "pmovmskb $1, $0", "=r,x,~{dirflag},~{fpsr},~{flags}"(double [[TMP0]]) +; DQQ-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8 +; DQQ-NEXT: [[TMP8:%.*]] = icmp slt i8 [[TMP7]], 0 +; DQQ-NEXT: ret i1 [[TMP8]] +; +; DLQ-LABEL: @inline_asm( +; DLQ-NEXT: entry: +; DLQ-NEXT: [[TMP1:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], ptrtoint (i1 (double)* @inline_asm to i64) +; DLQ-NEXT: [[TMP3:%.*]] = load x86_fp80, x86_fp80* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to x86_fp80*), align 1 +; DLQ-NEXT: [[TMP4:%.*]] = fpext double [[TMP0:%.*]] to x86_fp80 +; DLQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP4]] +; DLQ-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; DLQ-NEXT: [[TMP6:%.*]] = call i32 asm "pmovmskb $1, $0", "=r,x,~{dirflag},~{fpsr},~{flags}"(double [[TMP0]]) +; DLQ-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8 +; DLQ-NEXT: [[TMP8:%.*]] = icmp slt i8 [[TMP7]], 0 +; DLQ-NEXT: ret i1 [[TMP8]] +; +entry: + %1 = call i32 asm "pmovmskb $1, $0", "=r,x,~{dirflag},~{fpsr},~{flags}"(double %0) + %2 = trunc i32 %1 to i8 + %3 = icmp slt i8 %2, 0 + ret i1 %3 +} + +define void @vector_extract(<2 x float> %0) sanitize_numericalstability { +; CHECK-LABEL: @vector_extract( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], ptrtoint (void (<2 x float>)* @vector_extract to i64) +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to <2 x double>*), align 1 +; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x float> [[TMP0:%.*]] to <2 x double> +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], <2 x double> [[TMP3]], <2 x double> [[TMP4]] +; CHECK-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 +; CHECK-NEXT: ret void +; +entry: + %1 = extractelement <2 x float> %0, i32 1 + ret void +} + +define void @vector_insert(<2 x float> %0) sanitize_numericalstability { +; CHECK-LABEL: @vector_insert( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], ptrtoint (void (<2 x float>)* @vector_insert to i64) +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to <2 x double>*), align 1 +; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x float> [[TMP0:%.*]] to <2 x double> +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], <2 x double> [[TMP3]], <2 x double> [[TMP4]] +; CHECK-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP0]], float 1.000000e+00, i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP5]], double 1.000000e+00, i32 1 +; CHECK-NEXT: ret void +; +entry: + %1 = insertelement <2 x float> %0, float 1.0, i32 1 + ret void +} + + +define void @vector_shuffle(<2 x float> %0) sanitize_numericalstability { +; CHECK-LABEL: @vector_shuffle( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], ptrtoint (void (<2 x float>)* 
@vector_shuffle to i64) +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to <2 x double>*), align 1 +; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x float> [[TMP0:%.*]] to <2 x double> +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], <2 x double> [[TMP3]], <2 x double> [[TMP4]] +; CHECK-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> , <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> , <2 x i32> +; CHECK-NEXT: ret void +; +entry: + %1 = shufflevector <2 x float> %0, <2 x float> , <2 x i32> + ret void +} + +define void @aggregate_extract({i32, {float, i1}} %0) sanitize_numericalstability { +; CHECK-LABEL: @aggregate_extract( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, { float, i1 } } [[TMP0:%.*]], 1, 0 +; CHECK-NEXT: [[TMP2:%.*]] = fpext float [[TMP1]] to double +; CHECK-NEXT: ret void +; +entry: + %1 = extractvalue {i32, {float, i1}} %0, 1, 0 + ret void +} + +define void @aggregate_insert({i32, {float, i1}} %0, float %1) sanitize_numericalstability { +; CHECK-LABEL: @aggregate_insert( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], ptrtoint (void ({ i32, { float, i1 } }, float)* @aggregate_insert to i64) +; CHECK-NEXT: [[TMP4:%.*]] = load double, double* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to double*), align 1 +; CHECK-NEXT: [[TMP5:%.*]] = fpext float [[TMP1:%.*]] to double +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], double [[TMP4]], double [[TMP5]] +; CHECK-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @__nsan_internal_check_float_d(float [[TMP1]], double [[TMP6]], i32 5, i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = fpext float [[TMP1]] to double +; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP8]], double [[TMP9]], double [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { i32, { float, i1 } } [[TMP0:%.*]], float [[TMP1]], 1, 0 +; CHECK-NEXT: ret void +; +entry: + %2 = insertvalue {i32, {float, i1}} %0, float %1, 1, 0 + ret void +} + +define void @aggregate_insert_avoid_const_check({i32, {float, i1}} %0) sanitize_numericalstability { +; CHECK-LABEL: @aggregate_insert_avoid_const_check( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { i32, { float, i1 } } [[TMP0:%.*]], float 1.000000e+00, 1, 0 +; CHECK-NEXT: ret void +; +entry: + %1 = insertvalue {i32, {float, i1}} %0, float 1.0, 1, 0 + ret void +} + + +declare float @fabsf(float) + +define float @sub_fabs(float %a, float %b) sanitize_numericalstability { +; CHECK-LABEL: @sub_fabs( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (float (float, float)* @sub_fabs to i64) +; CHECK-NEXT: [[TMP2:%.*]] = load double, double* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to double*), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[A:%.*]] to double +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], double [[TMP2]], double [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = load double, double* bitcast (i8* getelementptr inbounds ([16384 x i8], [16384 x i8]* @__nsan_shadow_args_ptr, i64 0, i64 8) to double*), align 1 +; CHECK-NEXT: [[TMP6:%.*]] = fpext float [[B:%.*]] to double +; CHECK-NEXT: [[TMP7:%.*]] = 
select i1 [[TMP1]], double [[TMP5]], double [[TMP6]] +; CHECK-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[S:%.*]] = fsub float [[A]], [[B]] +; CHECK-NEXT: [[TMP8:%.*]] = fsub double [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__nsan_internal_check_float_d(float [[S]], double [[TMP8]], i32 2, i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = fpext float [[S]] to double +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP10]], double [[TMP11]], double [[TMP8]] +; CHECK-NEXT: [[R:%.*]] = call float @fabsf(float [[S]]) #4 +; CHECK-NEXT: [[TMP13:%.*]] = call double @llvm.fabs.f64(double [[TMP8]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP13]], i32 1, i64 0) +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = fpext float [[R]] to double +; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], double [[TMP16]], double [[TMP13]] +; CHECK-NEXT: store i64 ptrtoint (float (float, float)* @sub_fabs to i64), i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: store double [[TMP17]], double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: ret float [[R]] +; +entry: + %s = fsub float %a, %b + %r = call float @fabsf(float %s) + ret float %r +} + +; Note that the `unsafe-fp-math` from the function attributes should be moved to +; individual instructions, with the shadow instructions NOT getting the attribute. +define float @param_add_return_float_unsafe_fp_math(float %a) #0 { +; CHECK-LABEL: @param_add_return_float_unsafe_fp_math( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (float (float)* @param_add_return_float_unsafe_fp_math to i64) +; CHECK-NEXT: [[TMP2:%.*]] = load double, double* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to double*), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[A:%.*]] to double +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], double [[TMP2]], double [[TMP3]] +; CHECK-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[B:%.*]] = fadd fast float [[A]], 1.000000e+00 +; CHECK-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 1.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @__nsan_internal_check_float_d(float [[B]], double [[TMP5]], i32 1, i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = fpext float [[B]] to double +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP7]], double [[TMP8]], double [[TMP5]] +; CHECK-NEXT: store i64 ptrtoint (float (float)* @param_add_return_float_unsafe_fp_math to i64), i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: store double [[TMP9]], double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: ret float [[B]] +; +entry: + %b = fadd float %a, 1.0 + ret float %b +} + + +define void @truncate(<2 x double> %0) sanitize_numericalstability { +; DQQ-LABEL: @truncate( +; DQQ-NEXT: entry: +; DQQ-NEXT: [[TMP1:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; DQQ-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], ptrtoint (void (<2 x double>)* @truncate to i64) +; DQQ-NEXT: [[TMP3:%.*]] = load <2 x fp128>, <2 x fp128>* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to <2 x fp128>*), align 1 +; DQQ-NEXT: [[TMP4:%.*]] = fpext <2 x double> [[TMP0:%.*]] to <2 x fp128> +; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], <2 x fp128> [[TMP3]], <2 x fp128> [[TMP4]] +; 
DQQ-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; DQQ-NEXT: [[TMP6:%.*]] = fptrunc <2 x double> [[TMP0]] to <2 x float> +; DQQ-NEXT: [[TMP7:%.*]] = fptrunc <2 x fp128> [[TMP5]] to <2 x double> +; DQQ-NEXT: ret void +; +; DLQ-LABEL: @truncate( +; DLQ-NEXT: entry: +; DLQ-NEXT: [[TMP1:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], ptrtoint (void (<2 x double>)* @truncate to i64) +; DLQ-NEXT: [[TMP3:%.*]] = load <2 x x86_fp80>, <2 x x86_fp80>* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to <2 x x86_fp80>*), align 1 +; DLQ-NEXT: [[TMP4:%.*]] = fpext <2 x double> [[TMP0:%.*]] to <2 x x86_fp80> +; DLQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], <2 x x86_fp80> [[TMP3]], <2 x x86_fp80> [[TMP4]] +; DLQ-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; DLQ-NEXT: [[TMP6:%.*]] = fptrunc <2 x double> [[TMP0]] to <2 x float> +; DLQ-NEXT: [[TMP7:%.*]] = fptrunc <2 x x86_fp80> [[TMP5]] to <2 x double> +; DLQ-NEXT: ret void +; +entry: + %1 = fptrunc <2 x double> %0 to <2 x float> + ret void +} + +define void @unaryop(float %a) sanitize_numericalstability { +; CHECK-LABEL: @unaryop( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (void (float)* @unaryop to i64) +; CHECK-NEXT: [[TMP2:%.*]] = load double, double* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to double*), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[A:%.*]] to double +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], double [[TMP2]], double [[TMP3]] +; CHECK-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; CHECK-NEXT: [[C:%.*]] = fneg float [[A]] +; CHECK-NEXT: [[TMP5:%.*]] = fneg double [[TMP4]] +; CHECK-NEXT: ret void +; +entry: + %c = fneg float %a + ret void +} + + +attributes #0 = { nounwind readonly uwtable sanitize_numericalstability "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } + diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/cfg.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/cfg.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/cfg.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -nsan -nsan-shadow-type-mapping=dqq -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; Tests with more involved control flow to check lazy construction of the +; shadow values. 
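+; A hand-written sketch of the shape the pass has to produce for these tests
+; (illustrative only, carries no CHECK lines, and is left uninstrumented since
+; it lacks the sanitize_numericalstability attribute): the shadow phi %s.a and
+; the shadow fadd %s.d reference each other across blocks, so one of them has
+; to be created as a placeholder and patched once the other exists -- hence the
+; lazy construction. The %s.* names are invented for the sketch.
+define float @lazy_shadow_sketch() {
+entry:
+  br label %block1
+
+loop:
+  %d = fadd float %b, 2.0
+  %s.d = fadd double %s.b, 2.0   ; shadow of %d, needs shadow(%b) from below
+  br label %block1
+
+block1:
+  %a = phi float [ %d, %loop ], [ 1.0, %entry ]
+  %s.a = phi double [ %s.d, %loop ], [ 1.0, %entry ]
+  %b = fadd float %a, 1.0
+  %s.b = fadd double %s.a, 1.0
+  br label %loop
+}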
+ +define float @forward_use() sanitize_numericalstability { +; CHECK-LABEL: @forward_use( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[BLOCK1:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[D:%.*]] = fadd float [[B:%.*]], 2.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = fadd double [[TMP2:%.*]], 2.000000e+00 +; CHECK-NEXT: br label [[BLOCK1]] +; CHECK: block1: +; CHECK-NEXT: [[A:%.*]] = phi float [ [[D]], [[LOOP:%.*]] ], [ 1.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[TMP0]], [[LOOP]] ], [ 1.000000e+00, [[ENTRY]] ] +; CHECK-NEXT: [[B]] = fadd float [[A]], 1.000000e+00 +; CHECK-NEXT: [[TMP2]] = fadd double [[TMP1]], 1.000000e+00 +; CHECK-NEXT: br label [[LOOP]] +; + +entry: + br label %block1 + +loop: + %d = fadd float %b, 2.0 ; this is a forward reference, requiring shadow(%b) to be available. + br label %block1 + +block1: + %a = phi float [ %d, %loop], [ 1.0, %entry ] + %b = fadd float %a, 1.0 + br label %loop +} + +define float @forward_use_with_load(float* %p) sanitize_numericalstability { +; CHECK-LABEL: @forward_use_with_load( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[BLOCK1:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[D:%.*]] = fadd float [[B:%.*]], 2.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = fadd double [[TMP12:%.*]], 2.000000e+00 +; CHECK-NEXT: br label [[BLOCK1]] +; CHECK: block1: +; CHECK-NEXT: [[A:%.*]] = phi float [ [[D]], [[LOOP:%.*]] ], [ 1.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[TMP0]], [[LOOP]] ], [ 1.000000e+00, [[ENTRY]] ] +; CHECK-NEXT: [[L:%.*]] = load float, float* [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[P]] to i8* +; CHECK-NEXT: [[TMP3:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_load(i8* [[TMP2]], i64 1) +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i8* [[TMP3]], null +; CHECK-NEXT: br i1 [[TMP4]], label [[TMP8:%.*]], label [[TMP5:%.*]] +; CHECK: 5: +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP3]] to double* +; CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[TMP6]], align 1 +; CHECK-NEXT: br label [[TMP10:%.*]] +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = fpext float [[L]] to double +; CHECK-NEXT: br label [[TMP10]] +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = phi double [ [[TMP7]], [[TMP5]] ], [ [[TMP9]], [[TMP8]] ] +; CHECK-NEXT: [[B]] = fadd float [[L]], 1.000000e+00 +; CHECK-NEXT: [[TMP12]] = fadd double [[TMP11]], 1.000000e+00 +; CHECK-NEXT: br label [[LOOP]] +; + +entry: + br label %block1 + +loop: + %d = fadd float %b, 2.0 ; this is a forward reference, requiring shadow(%b) to be available. + br label %block1 + +block1: + %a = phi float [ %d, %loop], [ 1.0, %entry ] + %l = load float, float* %p ; the load creates a new block + %b = fadd float %l, 1.0 ; this requires shadow(%l). 
+ br label %loop
+}
+
+define float @forward_use_with_two_uses() sanitize_numericalstability {
+; CHECK-LABEL: @forward_use_with_two_uses(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[BLOCK1:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[D:%.*]] = fadd float [[B:%.*]], 2.000000e+00
+; CHECK-NEXT: [[TMP0:%.*]] = fadd double [[TMP4:%.*]], 2.000000e+00
+; CHECK-NEXT: br label [[BLOCK1]]
+; CHECK: block1:
+; CHECK-NEXT: [[A:%.*]] = phi float [ [[D]], [[LOOP:%.*]] ], [ 1.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[TMP0]], [[LOOP]] ], [ 1.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT: [[T1:%.*]] = fadd float [[A]], 1.000000e+00
+; CHECK-NEXT: [[TMP2:%.*]] = fadd double [[TMP1]], 1.000000e+00
+; CHECK-NEXT: [[T2:%.*]] = fadd float [[T1]], 3.000000e+00
+; CHECK-NEXT: [[TMP3:%.*]] = fadd double [[TMP2]], 3.000000e+00
+; CHECK-NEXT: [[B]] = fadd float [[T1]], [[T2]]
+; CHECK-NEXT: [[TMP4]] = fadd double [[TMP2]], [[TMP3]]
+; CHECK-NEXT: br label [[LOOP]]
+;
+
+entry:
+ br label %block1
+
+loop:
+ %d = fadd float %b, 2.0 ; this is a forward reference, requiring shadow(%b) to be available.
+ br label %block1
+
+block1:
+ %a = phi float [ %d, %loop], [ 1.0, %entry ]
+ %t1 = fadd float %a, 1.0
+ %t2 = fadd float %t1, 3.0 ; this requires shadow(%t1)
+ %b = fadd float %t1, %t2 ; this requires shadow(%t2) and shadow(%t1).
+ br label %loop
+}
diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/fcmp.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/fcmp.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/fcmp.ll
@@ -0,0 +1,194 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -nsan -nsan-shadow-type-mapping=dqq -nsan-truncate-fcmp-eq=false -S | FileCheck %s --check-prefixes=CHECK,DQQ
+; RUN: opt < %s -nsan -nsan-shadow-type-mapping=dlq -nsan-truncate-fcmp-eq=false -S | FileCheck %s --check-prefixes=CHECK,DLQ
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Scalar floating-point comparison: `a == 1.0`.
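+; A hand-written sketch of the check shape the pass emits for an fcmp
+; (illustrative only, no CHECK lines; the declaration below is normally
+; inserted by the pass itself, and the function name is invented). The fcmp is
+; duplicated with the same predicate on the shadow type, and a disagreement
+; between the two results is reported via __nsan_fcmp_fail_*. The fp128
+; constant is 1.0, as in the DQQ checks below.
+declare void @__nsan_fcmp_fail_double_q(double, double, fp128, fp128, i32, i1, i1)
+
+define i1 @fcmp_check_sketch(double %a, fp128 %shadow_a) {
+entry:
+  %r = fcmp oeq double %a, 1.0
+  %sr = fcmp oeq fp128 %shadow_a, 0xL00000000000000003FFF000000000000
+  %same = icmp eq i1 %r, %sr
+  br i1 %same, label %cont, label %fail
+
+fail:
+  call void @__nsan_fcmp_fail_double_q(double %a, double 1.0, fp128 %shadow_a, fp128 0xL00000000000000003FFF000000000000, i32 1, i1 %r, i1 %sr)
+  br label %cont
+
+cont:
+  ret i1 %r
+}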
+define i1 @scalar_fcmp(double %a) sanitize_numericalstability { +; DQQ-LABEL: @scalar_fcmp( +; DQQ-NEXT: entry: +; DQQ-NEXT: [[TMP0:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; DQQ-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (i1 (double)* @scalar_fcmp to i64) +; DQQ-NEXT: [[TMP2:%.*]] = load fp128, fp128* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to fp128*), align 1 +; DQQ-NEXT: [[TMP3:%.*]] = fpext double [[A:%.*]] to fp128 +; DQQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], fp128 [[TMP2]], fp128 [[TMP3]] +; DQQ-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; DQQ-NEXT: [[R:%.*]] = fcmp oeq double [[A]], 1.000000e+00 +; DQQ-NEXT: [[TMP5:%.*]] = fcmp oeq fp128 [[TMP4]], 0xL00000000000000003FFF000000000000 +; DQQ-NEXT: [[TMP6:%.*]] = icmp eq i1 [[R]], [[TMP5]] +; DQQ-NEXT: br i1 [[TMP6]], label [[TMP8:%.*]], label [[TMP7:%.*]] +; DQQ: 7: +; DQQ-NEXT: call void @__nsan_fcmp_fail_double_q(double [[A]], double 1.000000e+00, fp128 [[TMP4]], fp128 0xL00000000000000003FFF000000000000, i32 1, i1 [[R]], i1 [[TMP5]]) +; DQQ-NEXT: br label [[TMP8]] +; DQQ: 8: +; DQQ-NEXT: ret i1 [[R]] +; +; DLQ-LABEL: @scalar_fcmp( +; DLQ-NEXT: entry: +; DLQ-NEXT: [[TMP0:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; DLQ-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (i1 (double)* @scalar_fcmp to i64) +; DLQ-NEXT: [[TMP2:%.*]] = load x86_fp80, x86_fp80* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to x86_fp80*), align 1 +; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[A:%.*]] to x86_fp80 +; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], x86_fp80 [[TMP2]], x86_fp80 [[TMP3]] +; DLQ-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; DLQ-NEXT: [[R:%.*]] = fcmp oeq double [[A]], 1.000000e+00 +; DLQ-NEXT: [[TMP5:%.*]] = fcmp oeq x86_fp80 [[TMP4]], 0xK3FFF8000000000000000 +; DLQ-NEXT: [[TMP6:%.*]] = icmp eq i1 [[R]], [[TMP5]] +; DLQ-NEXT: br i1 [[TMP6]], label [[TMP8:%.*]], label [[TMP7:%.*]] +; DLQ: 7: +; DLQ-NEXT: call void @__nsan_fcmp_fail_double_l(double [[A]], double 1.000000e+00, x86_fp80 [[TMP4]], x86_fp80 0xK3FFF8000000000000000, i32 1, i1 [[R]], i1 [[TMP5]]) +; DLQ-NEXT: br label [[TMP8]] +; DLQ: 8: +; DLQ-NEXT: ret i1 [[R]] +; +entry: + %r = fcmp oeq double %a, 1.0 + ret i1 %r +} + +; Vector float comparison. 
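+; Hand-written sketch (illustrative only, no CHECK lines; the function name is
+; invented): for vectors, the per-lane agreement bits are folded with
+; llvm.vector.reduce.and so the common all-lanes-agree case costs a single
+; branch; only on a mismatch does the pass extract the lanes one by one and
+; report each through __nsan_fcmp_fail_*, as verified below.
+declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1>)
+
+define i1 @vector_fcmp_reduce_sketch(<4 x i1> %r, <4 x i1> %shadow_r) {
+entry:
+  %same = icmp eq <4 x i1> %r, %shadow_r
+  %all = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %same)
+  ret i1 %all
+}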
+define <4 x i1> @vector_fcmp(<4 x double> %a, <4 x double> %b) sanitize_numericalstability { +; DQQ-LABEL: @vector_fcmp( +; DQQ-NEXT: entry: +; DQQ-NEXT: [[TMP0:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; DQQ-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (<4 x i1> (<4 x double>, <4 x double>)* @vector_fcmp to i64) +; DQQ-NEXT: [[TMP2:%.*]] = load <4 x fp128>, <4 x fp128>* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to <4 x fp128>*), align 1 +; DQQ-NEXT: [[TMP3:%.*]] = fpext <4 x double> [[A:%.*]] to <4 x fp128> +; DQQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], <4 x fp128> [[TMP2]], <4 x fp128> [[TMP3]] +; DQQ-NEXT: [[TMP5:%.*]] = load <4 x fp128>, <4 x fp128>* bitcast (i8* getelementptr inbounds ([16384 x i8], [16384 x i8]* @__nsan_shadow_args_ptr, i64 0, i64 64) to <4 x fp128>*), align 1 +; DQQ-NEXT: [[TMP6:%.*]] = fpext <4 x double> [[B:%.*]] to <4 x fp128> +; DQQ-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], <4 x fp128> [[TMP5]], <4 x fp128> [[TMP6]] +; DQQ-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; DQQ-NEXT: [[R:%.*]] = fcmp oeq <4 x double> [[A]], [[B]] +; DQQ-NEXT: [[TMP8:%.*]] = fcmp oeq <4 x fp128> [[TMP4]], [[TMP7]] +; DQQ-NEXT: [[TMP9:%.*]] = icmp eq <4 x i1> [[R]], [[TMP8]] +; DQQ-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP9]]) +; DQQ-NEXT: br i1 [[TMP10]], label [[TMP36:%.*]], label [[TMP11:%.*]] +; DQQ: 11: +; DQQ-NEXT: [[TMP12:%.*]] = extractelement <4 x double> [[A]], i64 0 +; DQQ-NEXT: [[TMP13:%.*]] = extractelement <4 x double> [[B]], i64 0 +; DQQ-NEXT: [[TMP14:%.*]] = extractelement <4 x fp128> [[TMP4]], i64 0 +; DQQ-NEXT: [[TMP15:%.*]] = extractelement <4 x fp128> [[TMP7]], i64 0 +; DQQ-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[R]], i64 0 +; DQQ-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP8]], i64 0 +; DQQ-NEXT: call void @__nsan_fcmp_fail_double_q(double [[TMP12]], double [[TMP13]], fp128 [[TMP14]], fp128 [[TMP15]], i32 1, i1 [[TMP16]], i1 [[TMP17]]) +; DQQ-NEXT: [[TMP18:%.*]] = extractelement <4 x double> [[A]], i64 1 +; DQQ-NEXT: [[TMP19:%.*]] = extractelement <4 x double> [[B]], i64 1 +; DQQ-NEXT: [[TMP20:%.*]] = extractelement <4 x fp128> [[TMP4]], i64 1 +; DQQ-NEXT: [[TMP21:%.*]] = extractelement <4 x fp128> [[TMP7]], i64 1 +; DQQ-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[R]], i64 1 +; DQQ-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP8]], i64 1 +; DQQ-NEXT: call void @__nsan_fcmp_fail_double_q(double [[TMP18]], double [[TMP19]], fp128 [[TMP20]], fp128 [[TMP21]], i32 1, i1 [[TMP22]], i1 [[TMP23]]) +; DQQ-NEXT: [[TMP24:%.*]] = extractelement <4 x double> [[A]], i64 2 +; DQQ-NEXT: [[TMP25:%.*]] = extractelement <4 x double> [[B]], i64 2 +; DQQ-NEXT: [[TMP26:%.*]] = extractelement <4 x fp128> [[TMP4]], i64 2 +; DQQ-NEXT: [[TMP27:%.*]] = extractelement <4 x fp128> [[TMP7]], i64 2 +; DQQ-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[R]], i64 2 +; DQQ-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP8]], i64 2 +; DQQ-NEXT: call void @__nsan_fcmp_fail_double_q(double [[TMP24]], double [[TMP25]], fp128 [[TMP26]], fp128 [[TMP27]], i32 1, i1 [[TMP28]], i1 [[TMP29]]) +; DQQ-NEXT: [[TMP30:%.*]] = extractelement <4 x double> [[A]], i64 3 +; DQQ-NEXT: [[TMP31:%.*]] = extractelement <4 x double> [[B]], i64 3 +; DQQ-NEXT: [[TMP32:%.*]] = extractelement <4 x fp128> [[TMP4]], i64 3 +; DQQ-NEXT: [[TMP33:%.*]] = extractelement <4 x fp128> [[TMP7]], i64 3 +; DQQ-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[R]], i64 3 +; DQQ-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP8]], i64 3 +; 
DQQ-NEXT: call void @__nsan_fcmp_fail_double_q(double [[TMP30]], double [[TMP31]], fp128 [[TMP32]], fp128 [[TMP33]], i32 1, i1 [[TMP34]], i1 [[TMP35]]) +; DQQ-NEXT: br label [[TMP36]] +; DQQ: 36: +; DQQ-NEXT: ret <4 x i1> [[R]] +; +; DLQ-LABEL: @vector_fcmp( +; DLQ-NEXT: entry: +; DLQ-NEXT: [[TMP0:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8 +; DLQ-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (<4 x i1> (<4 x double>, <4 x double>)* @vector_fcmp to i64) +; DLQ-NEXT: [[TMP2:%.*]] = load <4 x x86_fp80>, <4 x x86_fp80>* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to <4 x x86_fp80>*), align 1 +; DLQ-NEXT: [[TMP3:%.*]] = fpext <4 x double> [[A:%.*]] to <4 x x86_fp80> +; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], <4 x x86_fp80> [[TMP2]], <4 x x86_fp80> [[TMP3]] +; DLQ-NEXT: [[TMP5:%.*]] = load <4 x x86_fp80>, <4 x x86_fp80>* bitcast (i8* getelementptr inbounds ([16384 x i8], [16384 x i8]* @__nsan_shadow_args_ptr, i64 0, i64 40) to <4 x x86_fp80>*), align 1 +; DLQ-NEXT: [[TMP6:%.*]] = fpext <4 x double> [[B:%.*]] to <4 x x86_fp80> +; DLQ-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], <4 x x86_fp80> [[TMP5]], <4 x x86_fp80> [[TMP6]] +; DLQ-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8 +; DLQ-NEXT: [[R:%.*]] = fcmp oeq <4 x double> [[A]], [[B]] +; DLQ-NEXT: [[TMP8:%.*]] = fcmp oeq <4 x x86_fp80> [[TMP4]], [[TMP7]] +; DLQ-NEXT: [[TMP9:%.*]] = icmp eq <4 x i1> [[R]], [[TMP8]] +; DLQ-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP9]]) +; DLQ-NEXT: br i1 [[TMP10]], label [[TMP36:%.*]], label [[TMP11:%.*]] +; DLQ: 11: +; DLQ-NEXT: [[TMP12:%.*]] = extractelement <4 x double> [[A]], i64 0 +; DLQ-NEXT: [[TMP13:%.*]] = extractelement <4 x double> [[B]], i64 0 +; DLQ-NEXT: [[TMP14:%.*]] = extractelement <4 x x86_fp80> [[TMP4]], i64 0 +; DLQ-NEXT: [[TMP15:%.*]] = extractelement <4 x x86_fp80> [[TMP7]], i64 0 +; DLQ-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[R]], i64 0 +; DLQ-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP8]], i64 0 +; DLQ-NEXT: call void @__nsan_fcmp_fail_double_l(double [[TMP12]], double [[TMP13]], x86_fp80 [[TMP14]], x86_fp80 [[TMP15]], i32 1, i1 [[TMP16]], i1 [[TMP17]]) +; DLQ-NEXT: [[TMP18:%.*]] = extractelement <4 x double> [[A]], i64 1 +; DLQ-NEXT: [[TMP19:%.*]] = extractelement <4 x double> [[B]], i64 1 +; DLQ-NEXT: [[TMP20:%.*]] = extractelement <4 x x86_fp80> [[TMP4]], i64 1 +; DLQ-NEXT: [[TMP21:%.*]] = extractelement <4 x x86_fp80> [[TMP7]], i64 1 +; DLQ-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[R]], i64 1 +; DLQ-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP8]], i64 1 +; DLQ-NEXT: call void @__nsan_fcmp_fail_double_l(double [[TMP18]], double [[TMP19]], x86_fp80 [[TMP20]], x86_fp80 [[TMP21]], i32 1, i1 [[TMP22]], i1 [[TMP23]]) +; DLQ-NEXT: [[TMP24:%.*]] = extractelement <4 x double> [[A]], i64 2 +; DLQ-NEXT: [[TMP25:%.*]] = extractelement <4 x double> [[B]], i64 2 +; DLQ-NEXT: [[TMP26:%.*]] = extractelement <4 x x86_fp80> [[TMP4]], i64 2 +; DLQ-NEXT: [[TMP27:%.*]] = extractelement <4 x x86_fp80> [[TMP7]], i64 2 +; DLQ-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[R]], i64 2 +; DLQ-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP8]], i64 2 +; DLQ-NEXT: call void @__nsan_fcmp_fail_double_l(double [[TMP24]], double [[TMP25]], x86_fp80 [[TMP26]], x86_fp80 [[TMP27]], i32 1, i1 [[TMP28]], i1 [[TMP29]]) +; DLQ-NEXT: [[TMP30:%.*]] = extractelement <4 x double> [[A]], i64 3 +; DLQ-NEXT: [[TMP31:%.*]] = extractelement <4 x double> [[B]], i64 3 +; DLQ-NEXT: [[TMP32:%.*]] = extractelement <4 x x86_fp80> [[TMP4]], i64 
3
+; DLQ-NEXT: [[TMP33:%.*]] = extractelement <4 x x86_fp80> [[TMP7]], i64 3
+; DLQ-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[R]], i64 3
+; DLQ-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP8]], i64 3
+; DLQ-NEXT: call void @__nsan_fcmp_fail_double_l(double [[TMP30]], double [[TMP31]], x86_fp80 [[TMP32]], x86_fp80 [[TMP33]], i32 1, i1 [[TMP34]], i1 [[TMP35]])
+; DLQ-NEXT: br label [[TMP36]]
+; DLQ: 36:
+; DLQ-NEXT: ret <4 x i1> [[R]]
+;
+entry:
+ %r = fcmp oeq <4 x double> %a, %b
+ ret <4 x i1> %r
+}
+
+declare float @fabsf(float)
+
+; Scalar float comparison of an absolute difference: `fabs(a-b) == 0.25`.
+define float @sub_cmp_fabs(float %a, float %b) sanitize_numericalstability {
+; CHECK-LABEL: @sub_cmp_fabs(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* @__nsan_shadow_args_tag, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (float (float, float)* @sub_cmp_fabs to i64)
+; CHECK-NEXT: [[TMP2:%.*]] = load double, double* bitcast ([16384 x i8]* @__nsan_shadow_args_ptr to double*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[A:%.*]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], double [[TMP2]], double [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = load double, double* bitcast (i8* getelementptr inbounds ([16384 x i8], [16384 x i8]* @__nsan_shadow_args_ptr, i64 0, i64 8) to double*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = fpext float [[B:%.*]] to double
+; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], double [[TMP5]], double [[TMP6]]
+; CHECK-NEXT: store i64 0, i64* @__nsan_shadow_args_tag, align 8
+; CHECK-NEXT: [[S:%.*]] = fsub float [[A]], [[B]]
+; CHECK-NEXT: [[TMP8:%.*]] = fsub double [[TMP4]], [[TMP7]]
+; CHECK-NEXT: [[R:%.*]] = call float @fabsf(float [[S]]) [[ATTR4:#.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fabs.f64(double [[TMP8]])
+; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[R]], 2.500000e-01
+; CHECK-NEXT: [[TMP10:%.*]] = fcmp oeq double [[TMP9]], 2.500000e-01
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i1 [[C]], [[TMP10]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[TMP13:%.*]], label [[TMP12:%.*]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__nsan_fcmp_fail_float_d(float [[R]], float 2.500000e-01, double [[TMP9]], double 2.500000e-01, i32 1, i1 [[C]], i1 [[TMP10]])
+; CHECK-NEXT: br label [[TMP13]]
+; CHECK: 13:
+; CHECK-NEXT: [[TMP14:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP9]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1
+; CHECK-NEXT: [[TMP16:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], double [[TMP16]], double [[TMP9]]
+; CHECK-NEXT: store i64 ptrtoint (float (float, float)* @sub_cmp_fabs to i64), i64* @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP17]], double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %s = fsub float %a, %b
+ %r = call float @fabsf(float %s)
+ %c = fcmp oeq float %r, 0.25
+ ret float %r
+}
diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/invoke.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/invoke.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/invoke.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -nsan -nsan-shadow-type-mapping=dqq -S | FileCheck %s
+target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; Tests for invoke instructions that require special handling of the phis. + +declare float @may_throw() + +declare void @personalityFn() + +define float @invoke1() sanitize_numericalstability personality void ()* @personalityFn { +; CHECK-LABEL: @invoke1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = invoke float @may_throw() +; CHECK-NEXT: to label [[TMP0:%.*]] unwind label [[LAND:%.*]] +; CHECK: 0: +; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], ptrtoint (float ()* @may_throw to i64) +; CHECK-NEXT: [[TMP3:%.*]] = load double, double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fpext float [[C]] to double +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP4]] +; CHECK-NEXT: br label [[CONTINUE:%.*]] +; CHECK: continue: +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: land: +; CHECK-NEXT: [[RES:%.*]] = landingpad { i8*, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LV:%.*]] = uitofp i32 1 to float +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[R:%.*]] = phi float [ [[LV]], [[LAND]] ], [ [[C]], [[CONTINUE]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi double [ 1.000000e+00, [[LAND]] ], [ [[TMP5]], [[CONTINUE]] ] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP6]], i32 1, i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = fpext float [[R]] to double +; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP8]], double [[TMP9]], double [[TMP6]] +; CHECK-NEXT: store i64 ptrtoint (float ()* @invoke1 to i64), i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: store double [[TMP10]], double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: ret float [[R]] +; + +entry: + %c = invoke float @may_throw() to label %continue unwind label %land + +continue: + br label %exit + +land: + %res = landingpad { i8*, i32 } cleanup + %lv = uitofp i32 1 to float + br label %exit + +exit: + %r = phi float [ %lv, %land], [ %c, %continue ] + ret float %r +} + +define float @invoke2() sanitize_numericalstability personality void ()* @personalityFn { +; CHECK-LABEL: @invoke2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = invoke float @may_throw() +; CHECK-NEXT: to label [[TMP0:%.*]] unwind label [[LAND:%.*]] +; CHECK: 0: +; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], ptrtoint (float ()* @may_throw to i64) +; CHECK-NEXT: [[TMP3:%.*]] = load double, double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fpext float [[C]] to double +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP4]] +; CHECK-NEXT: br label [[CONTINUE:%.*]] +; CHECK: continue: +; CHECK-NEXT: [[CV:%.*]] = fadd float [[C]], 2.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = fadd double [[TMP5]], 2.000000e+00 +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: land: +; CHECK-NEXT: [[RES:%.*]] = landingpad { i8*, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LV:%.*]] = uitofp i32 1 to float +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[R:%.*]] = phi float [ [[LV]], [[LAND]] ], [ [[CV]], [[CONTINUE]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi double [ 1.000000e+00, [[LAND]] ], 
[ [[TMP6]], [[CONTINUE]] ] +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP7]], i32 1, i64 0) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = fpext float [[R]] to double +; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], double [[TMP10]], double [[TMP7]] +; CHECK-NEXT: store i64 ptrtoint (float ()* @invoke2 to i64), i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: store double [[TMP11]], double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: ret float [[R]] +; + +entry: + %c = invoke float @may_throw() to label %continue unwind label %land + +continue: + %cv = fadd float %c, 2.0 + br label %exit + +land: + %res = landingpad { i8*, i32 } cleanup + %lv = uitofp i32 1 to float + br label %exit + +exit: + %r = phi float [ %lv, %land], [ %cv, %continue ] + ret float %r +} + +define float @invoke3() sanitize_numericalstability personality void ()* @personalityFn { +; CHECK-LABEL: @invoke3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = invoke float @may_throw() +; CHECK-NEXT: to label [[TMP0:%.*]] unwind label [[LAND:%.*]] +; CHECK: land: +; CHECK-NEXT: [[RES:%.*]] = landingpad { i8*, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LV:%.*]] = uitofp i32 1 to float +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: 0: +; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], ptrtoint (float ()* @may_throw to i64) +; CHECK-NEXT: [[TMP3:%.*]] = load double, double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fpext float [[C]] to double +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP4]] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[R:%.*]] = phi float [ [[LV]], [[LAND]] ], [ [[C]], [[TMP0]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi double [ 1.000000e+00, [[LAND]] ], [ [[TMP5]], [[TMP0]] ] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP6]], i32 1, i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = fpext float [[R]] to double +; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP8]], double [[TMP9]], double [[TMP6]] +; CHECK-NEXT: store i64 ptrtoint (float ()* @invoke3 to i64), i64* @__nsan_shadow_ret_tag, align 8 +; CHECK-NEXT: store double [[TMP10]], double* bitcast ([128 x i8]* @__nsan_shadow_ret_ptr to double*), align 8 +; CHECK-NEXT: ret float [[R]] +; + +entry: + %c = invoke float @may_throw() to label %exit unwind label %land + +land: + %res = landingpad { i8*, i32 } cleanup + %lv = uitofp i32 1 to float + br label %exit + +exit: + %r = phi float [ %lv, %land], [ %c, %entry ] + ret float %r +} diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/memory.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/memory.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/memory.ll @@ -0,0 +1,476 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=x86_64-linux-gnu < %s -nsan -nsan-shadow-type-mapping=dqq -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; Tests with memory manipulation (memcpy, llvm.memcpy, ...). 
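+; A hand-written sketch of how a float load resolves its shadow (illustrative
+; only, no CHECK lines; the declaration is normally inserted by the pass and
+; the function name is invented): the runtime returns a shadow pointer, and a
+; null result means shadow memory does not currently hold a value of that
+; float type, in which case the shadow is re-extended from the application
+; value. This is the branch-and-phi pattern checked throughout this file.
+declare i8* @__nsan_get_shadow_ptr_for_float_load(i8*, i64)
+
+define double @load_shadow_sketch(float* %p) {
+entry:
+  %v = load float, float* %p, align 4
+  %addr = bitcast float* %p to i8*
+  %sptr = call i8* @__nsan_get_shadow_ptr_for_float_load(i8* %addr, i64 1)
+  %valid = icmp ne i8* %sptr, null
+  br i1 %valid, label %typed, label %untyped
+
+typed:
+  %sptr.d = bitcast i8* %sptr to double*
+  %s = load double, double* %sptr.d, align 1
+  br label %done
+
+untyped:
+  %ext = fpext float %v to double
+  br label %done
+
+done:
+  %shadow = phi double [ %s, %typed ], [ %ext, %untyped ]
+  ret double %shadow
+}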
+ + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i1) + +define void @call_memcpy_intrinsic(i8* nonnull align 8 dereferenceable(16) %a, i8* nonnull align 8 dereferenceable(16) %b) sanitize_numericalstability { +; CHECK-LABEL: @call_memcpy_intrinsic( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @__nsan_copy_values(i8* [[A:%.*]], i8* [[B:%.*]], i64 16) +; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(16) [[A]], i8* nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(16) %a, i8* nonnull align 8 dereferenceable(16) %b, i64 16, i1 false) + ret void +} + +declare dso_local i8* @memcpy(i8*, i8*, i64) local_unnamed_addr + +define void @call_memcpy(i8* nonnull align 8 dereferenceable(16) %a, i8* nonnull align 8 dereferenceable(16) %b) sanitize_numericalstability { +; CHECK-LABEL: @call_memcpy( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i8* @memcpy(i8* nonnull align 8 dereferenceable(16) [[A:%.*]], i8* nonnull align 8 dereferenceable(16) [[B:%.*]], i64 16) #3 +; CHECK-NEXT: ret void +; +entry: + tail call i8* @memcpy(i8* nonnull align 8 dereferenceable(16) %a, i8* nonnull align 8 dereferenceable(16) %b, i64 16) + ret void +} + + +define void @transfer_float(float* %dst, float* %src) sanitize_numericalstability { +; CHECK-LABEL: @transfer_float( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T:%.*]] = load float, float* [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_load(i8* [[TMP0]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP1]], null +; CHECK-NEXT: br i1 [[TMP2]], label [[TMP6:%.*]], label [[TMP3:%.*]] +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1]] to double* +; CHECK-NEXT: [[TMP5:%.*]] = load double, double* [[TMP4]], align 1 +; CHECK-NEXT: br label [[TMP8:%.*]] +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = fpext float [[T]] to double +; CHECK-NEXT: br label [[TMP8]] +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = phi double [ [[TMP5]], [[TMP3]] ], [ [[TMP7]], [[TMP6]] ] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[DST:%.*]] to i8* +; CHECK-NEXT: [[TMP11:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_store(i8* [[TMP10]], i64 1) +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint float* [[DST]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @__nsan_internal_check_float_d(float [[T]], double [[TMP9]], i32 4, i64 [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = fpext float [[T]] to double +; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP14]], double [[TMP15]], double [[TMP9]] +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP11]] to double* +; CHECK-NEXT: store double [[TMP16]], double* [[TMP17]], align 1 +; CHECK-NEXT: store float [[T]], float* [[DST]], align 1 +; CHECK-NEXT: ret void +; +entry: + %t = load float, float* %src + store float %t, float* %dst, align 1 + ret void +} + +define void @transfer_non_float(i32* %dst, i32* %src) sanitize_numericalstability { +; CHECK-LABEL: @transfer_non_float( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T:%.*]] = load i32, i32* [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = call i8* @__nsan_internal_get_raw_shadow_type_ptr(i8* [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 
1 +; CHECK-NEXT: [[TMP4:%.*]] = call i8* @__nsan_internal_get_raw_shadow_ptr(i8* [[TMP0]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to i64* +; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP5]], align 1 +; CHECK-NEXT: store i32 [[T]], i32* [[DST:%.*]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[DST]] to i8* +; CHECK-NEXT: [[TMP8:%.*]] = call i8* @__nsan_internal_get_raw_shadow_type_ptr(i8* [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +; CHECK-NEXT: store i32 [[TMP3]], i32* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = call i8* @__nsan_internal_get_raw_shadow_ptr(i8* [[TMP7]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i64* +; CHECK-NEXT: store i64 [[TMP6]], i64* [[TMP11]], align 1 +; CHECK-NEXT: ret void +; +entry: + %t = load i32, i32* %src + store i32 %t, i32* %dst, align 1 + ret void +} + +define void @transfer_array([2 x float]* %a) sanitize_numericalstability { +; CHECK-LABEL: @transfer_array( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B:%.*]] = load [2 x float], [2 x float]* [[A:%.*]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast [2 x float]* [[A]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = call i8* @__nsan_internal_get_raw_shadow_type_ptr(i8* [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64* +; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = call i8* @__nsan_internal_get_raw_shadow_ptr(i8* [[TMP0]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to i128* +; CHECK-NEXT: [[TMP6:%.*]] = load i128, i128* [[TMP5]], align 1 +; CHECK-NEXT: store [2 x float] [[B]], [2 x float]* [[A]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast [2 x float]* [[A]] to i8* +; CHECK-NEXT: [[TMP8:%.*]] = call i8* @__nsan_internal_get_raw_shadow_type_ptr(i8* [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* +; CHECK-NEXT: store i64 [[TMP3]], i64* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = call i8* @__nsan_internal_get_raw_shadow_ptr(i8* [[TMP7]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i128* +; CHECK-NEXT: store i128 [[TMP6]], i128* [[TMP11]], align 1 +; CHECK-NEXT: ret void +; +entry: + %b = load [2 x float], [2 x float]* %a, align 1 + store [2 x float] %b, [2 x float]* %a, align 1 + ret void +} + +define void @swap_untyped1(i64* nonnull align 8 %p, i64* nonnull align 8 %q) sanitize_numericalstability { +; CHECK-LABEL: @swap_untyped1( +; CHECK-NEXT: [[QV:%.*]] = load i64, i64* [[Q:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[Q]] to i8* +; CHECK-NEXT: [[TMP2:%.*]] = call i8* @__nsan_internal_get_raw_shadow_type_ptr(i8* [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to i64* +; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = call i8* @__nsan_internal_get_raw_shadow_ptr(i8* [[TMP1]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to i128* +; CHECK-NEXT: [[TMP7:%.*]] = load i128, i128* [[TMP6]], align 1 +; CHECK-NEXT: [[PV:%.*]] = load i64, i64* [[P:%.*]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[P]] to i8* +; CHECK-NEXT: [[TMP9:%.*]] = call i8* @__nsan_internal_get_raw_shadow_type_ptr(i8* [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i64* +; CHECK-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = call i8* @__nsan_internal_get_raw_shadow_ptr(i8* [[TMP8]]) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i128* +; CHECK-NEXT: [[TMP14:%.*]] = load i128, i128* [[TMP13]], align 1 +; CHECK-NEXT: store i64 [[PV]], i64* 
[[Q]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64* [[Q]] to i8* +; CHECK-NEXT: [[TMP16:%.*]] = call i8* @__nsan_internal_get_raw_shadow_type_ptr(i8* [[TMP15]]) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i64* +; CHECK-NEXT: store i64 [[TMP11]], i64* [[TMP17]], align 1 +; CHECK-NEXT: [[TMP18:%.*]] = call i8* @__nsan_internal_get_raw_shadow_ptr(i8* [[TMP15]]) +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to i128* +; CHECK-NEXT: store i128 [[TMP14]], i128* [[TMP19]], align 1 +; CHECK-NEXT: store i64 [[QV]], i64* [[P]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i64* [[P]] to i8* +; CHECK-NEXT: [[TMP21:%.*]] = call i8* @__nsan_internal_get_raw_shadow_type_ptr(i8* [[TMP20]]) +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8* [[TMP21]] to i64* +; CHECK-NEXT: store i64 [[TMP4]], i64* [[TMP22]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = call i8* @__nsan_internal_get_raw_shadow_ptr(i8* [[TMP20]]) +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP23]] to i128* +; CHECK-NEXT: store i128 [[TMP7]], i128* [[TMP24]], align 1 +; CHECK-NEXT: ret void +; + %qv = load i64, i64* %q + %pv = load i64, i64* %p + store i64 %pv, i64* %q, align 8 + store i64 %qv, i64* %p, align 8 + ret void +} + +; Same as swap_untyped1, but the load/stores are in the opposite order. +define void @swap_untyped2(i64* nonnull align 8 %p, i64* nonnull align 8 %q) sanitize_numericalstability { +; CHECK-LABEL: @swap_untyped2( +; CHECK-NEXT: [[PV:%.*]] = load i64, i64* [[P:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[P]] to i8* +; CHECK-NEXT: [[TMP2:%.*]] = call i8* @__nsan_internal_get_raw_shadow_type_ptr(i8* [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to i64* +; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = call i8* @__nsan_internal_get_raw_shadow_ptr(i8* [[TMP1]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to i128* +; CHECK-NEXT: [[TMP7:%.*]] = load i128, i128* [[TMP6]], align 1 +; CHECK-NEXT: [[QV:%.*]] = load i64, i64* [[Q:%.*]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[Q]] to i8* +; CHECK-NEXT: [[TMP9:%.*]] = call i8* @__nsan_internal_get_raw_shadow_type_ptr(i8* [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i64* +; CHECK-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = call i8* @__nsan_internal_get_raw_shadow_ptr(i8* [[TMP8]]) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i128* +; CHECK-NEXT: [[TMP14:%.*]] = load i128, i128* [[TMP13]], align 1 +; CHECK-NEXT: store i64 [[PV]], i64* [[Q]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64* [[Q]] to i8* +; CHECK-NEXT: [[TMP16:%.*]] = call i8* @__nsan_internal_get_raw_shadow_type_ptr(i8* [[TMP15]]) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i64* +; CHECK-NEXT: store i64 [[TMP4]], i64* [[TMP17]], align 1 +; CHECK-NEXT: [[TMP18:%.*]] = call i8* @__nsan_internal_get_raw_shadow_ptr(i8* [[TMP15]]) +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to i128* +; CHECK-NEXT: store i128 [[TMP7]], i128* [[TMP19]], align 1 +; CHECK-NEXT: store i64 [[QV]], i64* [[P]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i64* [[P]] to i8* +; CHECK-NEXT: [[TMP21:%.*]] = call i8* @__nsan_internal_get_raw_shadow_type_ptr(i8* [[TMP20]]) +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8* [[TMP21]] to i64* +; CHECK-NEXT: store i64 [[TMP11]], i64* [[TMP22]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = call i8* @__nsan_internal_get_raw_shadow_ptr(i8* [[TMP20]]) +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP23]] to i128* +; 
CHECK-NEXT: store i128 [[TMP14]], i128* [[TMP24]], align 1 +; CHECK-NEXT: ret void +; + %pv = load i64, i64* %p + %qv = load i64, i64* %q + store i64 %pv, i64* %q, align 8 + store i64 %qv, i64* %p, align 8 + ret void +} + +define void @swap_ft1(float* nonnull align 8 %p, float* nonnull align 8 %q) sanitize_numericalstability { +; CHECK-LABEL: @swap_ft1( +; CHECK-NEXT: [[QV:%.*]] = load float, float* [[Q:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[Q]] to i8* +; CHECK-NEXT: [[TMP2:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_load(i8* [[TMP1]], i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8* [[TMP2]], null +; CHECK-NEXT: br i1 [[TMP3]], label [[TMP7:%.*]], label [[TMP4:%.*]] +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to double* +; CHECK-NEXT: [[TMP6:%.*]] = load double, double* [[TMP5]], align 1 +; CHECK-NEXT: br label [[TMP9:%.*]] +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = fpext float [[QV]] to double +; CHECK-NEXT: br label [[TMP9]] +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = phi double [ [[TMP6]], [[TMP4]] ], [ [[TMP8]], [[TMP7]] ] +; CHECK-NEXT: [[PV:%.*]] = load float, float* [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[P]] to i8* +; CHECK-NEXT: [[TMP12:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_load(i8* [[TMP11]], i64 1) +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i8* [[TMP12]], null +; CHECK-NEXT: br i1 [[TMP13]], label [[TMP17:%.*]], label [[TMP14:%.*]] +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP12]] to double* +; CHECK-NEXT: [[TMP16:%.*]] = load double, double* [[TMP15]], align 1 +; CHECK-NEXT: br label [[TMP19:%.*]] +; CHECK: 17: +; CHECK-NEXT: [[TMP18:%.*]] = fpext float [[PV]] to double +; CHECK-NEXT: br label [[TMP19]] +; CHECK: 19: +; CHECK-NEXT: [[TMP20:%.*]] = phi double [ [[TMP16]], [[TMP14]] ], [ [[TMP18]], [[TMP17]] ] +; CHECK-NEXT: [[TMP21:%.*]] = bitcast float* [[Q]] to i8* +; CHECK-NEXT: [[TMP22:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_store(i8* [[TMP21]], i64 1) +; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint float* [[Q]] to i64 +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @__nsan_internal_check_float_d(float [[PV]], double [[TMP20]], i32 4, i64 [[TMP23]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP24]], 1 +; CHECK-NEXT: [[TMP26:%.*]] = fpext float [[PV]] to double +; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], double [[TMP26]], double [[TMP20]] +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8* [[TMP22]] to double* +; CHECK-NEXT: store double [[TMP27]], double* [[TMP28]], align 1 +; CHECK-NEXT: store float [[PV]], float* [[Q]], align 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast float* [[P]] to i8* +; CHECK-NEXT: [[TMP30:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_store(i8* [[TMP29]], i64 1) +; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint float* [[P]] to i64 +; CHECK-NEXT: [[TMP32:%.*]] = call i32 @__nsan_internal_check_float_d(float [[QV]], double [[TMP10]], i32 4, i64 [[TMP31]]) +; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i32 [[TMP32]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = fpext float [[QV]] to double +; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], double [[TMP34]], double [[TMP10]] +; CHECK-NEXT: [[TMP36:%.*]] = bitcast i8* [[TMP30]] to double* +; CHECK-NEXT: store double [[TMP35]], double* [[TMP36]], align 1 +; CHECK-NEXT: store float [[QV]], float* [[P]], align 8 +; CHECK-NEXT: ret void +; + %qv = load float, float* %q + %pv = load float, float* %p + store float %pv, float* %q, align 8 + store float %qv, float* %p, align 8 + ret void +} + +; Same as swap_ft1, but the load/stores are in the 
opposite order.
+define void @swap_ft2(float* nonnull align 8 %p, float* nonnull align 8 %q) sanitize_numericalstability {
+; CHECK-LABEL: @swap_ft2(
+; CHECK-NEXT: [[PV:%.*]] = load float, float* [[P:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P]] to i8*
+; CHECK-NEXT: [[TMP2:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_load(i8* [[TMP1]], i64 1)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8* [[TMP2]], null
+; CHECK-NEXT: br i1 [[TMP3]], label [[TMP7:%.*]], label [[TMP4:%.*]]
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to double*
+; CHECK-NEXT: [[TMP6:%.*]] = load double, double* [[TMP5]], align 1
+; CHECK-NEXT: br label [[TMP9:%.*]]
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = fpext float [[PV]] to double
+; CHECK-NEXT: br label [[TMP9]]
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = phi double [ [[TMP6]], [[TMP4]] ], [ [[TMP8]], [[TMP7]] ]
+; CHECK-NEXT: [[QV:%.*]] = load float, float* [[Q:%.*]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[Q]] to i8*
+; CHECK-NEXT: [[TMP12:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_load(i8* [[TMP11]], i64 1)
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i8* [[TMP12]], null
+; CHECK-NEXT: br i1 [[TMP13]], label [[TMP17:%.*]], label [[TMP14:%.*]]
+; CHECK: 14:
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP12]] to double*
+; CHECK-NEXT: [[TMP16:%.*]] = load double, double* [[TMP15]], align 1
+; CHECK-NEXT: br label [[TMP19:%.*]]
+; CHECK: 17:
+; CHECK-NEXT: [[TMP18:%.*]] = fpext float [[QV]] to double
+; CHECK-NEXT: br label [[TMP19]]
+; CHECK: 19:
+; CHECK-NEXT: [[TMP20:%.*]] = phi double [ [[TMP16]], [[TMP14]] ], [ [[TMP18]], [[TMP17]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast float* [[Q]] to i8*
+; CHECK-NEXT: [[TMP22:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_store(i8* [[TMP21]], i64 1)
+; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint float* [[Q]] to i64
+; CHECK-NEXT: [[TMP24:%.*]] = call i32 @__nsan_internal_check_float_d(float [[PV]], double [[TMP10]], i32 4, i64 [[TMP23]])
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP24]], 1
+; CHECK-NEXT: [[TMP26:%.*]] = fpext float [[PV]] to double
+; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], double [[TMP26]], double [[TMP10]]
+; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8* [[TMP22]] to double*
+; CHECK-NEXT: store double [[TMP27]], double* [[TMP28]], align 1
+; CHECK-NEXT: store float [[PV]], float* [[Q]], align 8
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast float* [[P]] to i8*
+; CHECK-NEXT: [[TMP30:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_store(i8* [[TMP29]], i64 1)
+; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint float* [[P]] to i64
+; CHECK-NEXT: [[TMP32:%.*]] = call i32 @__nsan_internal_check_float_d(float [[QV]], double [[TMP20]], i32 4, i64 [[TMP31]])
+; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i32 [[TMP32]], 1
+; CHECK-NEXT: [[TMP34:%.*]] = fpext float [[QV]] to double
+; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], double [[TMP34]], double [[TMP20]]
+; CHECK-NEXT: [[TMP36:%.*]] = bitcast i8* [[TMP30]] to double*
+; CHECK-NEXT: store double [[TMP35]], double* [[TMP36]], align 1
+; CHECK-NEXT: store float [[QV]], float* [[P]], align 8
+; CHECK-NEXT: ret void
+;
+  %pv = load float, float* %p
+  %qv = load float, float* %q
+  store float %pv, float* %q, align 8
+  store float %qv, float* %p, align 8
+  ret void
+}
+
+define void @swap_vectorft1(<2 x float>* nonnull align 16 %p, <2 x float>* nonnull align 16 %q) sanitize_numericalstability {
+; CHECK-LABEL: @swap_vectorft1(
+; CHECK-NEXT: [[QV:%.*]] = load <2 x float>, <2 x float>* [[Q:%.*]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float>* [[Q]] to i8*
+; CHECK-NEXT: [[TMP2:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_load(i8* [[TMP1]], i64 2)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8* [[TMP2]], null
+; CHECK-NEXT: br i1 [[TMP3]], label [[TMP7:%.*]], label [[TMP4:%.*]]
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <2 x double>*
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 1
+; CHECK-NEXT: br label [[TMP9:%.*]]
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = fpext <2 x float> [[QV]] to <2 x double>
+; CHECK-NEXT: br label [[TMP9]]
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x double> [ [[TMP6]], [[TMP4]] ], [ [[TMP8]], [[TMP7]] ]
+; CHECK-NEXT: [[PV:%.*]] = load <2 x float>, <2 x float>* [[P:%.*]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x float>* [[P]] to i8*
+; CHECK-NEXT: [[TMP12:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_load(i8* [[TMP11]], i64 2)
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i8* [[TMP12]], null
+; CHECK-NEXT: br i1 [[TMP13]], label [[TMP17:%.*]], label [[TMP14:%.*]]
+; CHECK: 14:
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP12]] to <2 x double>*
+; CHECK-NEXT: [[TMP16:%.*]] = load <2 x double>, <2 x double>* [[TMP15]], align 1
+; CHECK-NEXT: br label [[TMP19:%.*]]
+; CHECK: 17:
+; CHECK-NEXT: [[TMP18:%.*]] = fpext <2 x float> [[PV]] to <2 x double>
+; CHECK-NEXT: br label [[TMP19]]
+; CHECK: 19:
+; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x double> [ [[TMP16]], [[TMP14]] ], [ [[TMP18]], [[TMP17]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x float>* [[Q]] to i8*
+; CHECK-NEXT: [[TMP22:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_store(i8* [[TMP21]], i64 2)
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[PV]], i64 0
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x double> [[TMP20]], i64 0
+; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint <2 x float>* [[Q]] to i64
+; CHECK-NEXT: [[TMP26:%.*]] = call i32 @__nsan_internal_check_float_d(float [[TMP23]], double [[TMP24]], i32 4, i64 [[TMP25]])
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[PV]], i64 1
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[TMP20]], i64 1
+; CHECK-NEXT: [[TMP29:%.*]] = ptrtoint <2 x float>* [[Q]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = call i32 @__nsan_internal_check_float_d(float [[TMP27]], double [[TMP28]], i32 4, i64 [[TMP29]])
+; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP26]], [[TMP30]]
+; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 1
+; CHECK-NEXT: [[TMP33:%.*]] = fpext <2 x float> [[PV]] to <2 x double>
+; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP32]], <2 x double> [[TMP33]], <2 x double> [[TMP20]]
+; CHECK-NEXT: [[TMP35:%.*]] = bitcast i8* [[TMP22]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP34]], <2 x double>* [[TMP35]], align 1
+; CHECK-NEXT: store <2 x float> [[PV]], <2 x float>* [[Q]], align 16
+; CHECK-NEXT: [[TMP36:%.*]] = bitcast <2 x float>* [[P]] to i8*
+; CHECK-NEXT: [[TMP37:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_store(i8* [[TMP36]], i64 2)
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[QV]], i64 0
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x double> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP40:%.*]] = ptrtoint <2 x float>* [[P]] to i64
+; CHECK-NEXT: [[TMP41:%.*]] = call i32 @__nsan_internal_check_float_d(float [[TMP38]], double [[TMP39]], i32 4, i64 [[TMP40]])
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[QV]], i64 1
+; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x double> [[TMP10]], i64 1
+; CHECK-NEXT: [[TMP44:%.*]] = ptrtoint <2 x float>* [[P]] to i64
+; CHECK-NEXT: [[TMP45:%.*]] = call i32 @__nsan_internal_check_float_d(float [[TMP42]], double [[TMP43]], i32 4, i64 [[TMP44]])
+; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP41]], [[TMP45]]
+; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i32 [[TMP46]], 1
+; CHECK-NEXT: [[TMP48:%.*]] = fpext <2 x float> [[QV]] to <2 x double>
+; CHECK-NEXT: [[TMP49:%.*]] = select i1 [[TMP47]], <2 x double> [[TMP48]], <2 x double> [[TMP10]]
+; CHECK-NEXT: [[TMP50:%.*]] = bitcast i8* [[TMP37]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP49]], <2 x double>* [[TMP50]], align 1
+; CHECK-NEXT: store <2 x float> [[QV]], <2 x float>* [[P]], align 16
+; CHECK-NEXT: ret void
+;
+  %qv = load <2 x float>, <2 x float>* %q
+  %pv = load <2 x float>, <2 x float>* %p
+  store <2 x float> %pv, <2 x float>* %q, align 16
+  store <2 x float> %qv, <2 x float>* %p, align 16
+  ret void
+}
+
+; Same as swap_vectorft1, but the load/stores are in the opposite order.
+define void @swap_vectorft2(<2 x float>* nonnull align 16 %p, <2 x float>* nonnull align 16 %q) sanitize_numericalstability {
+; CHECK-LABEL: @swap_vectorft2(
+; CHECK-NEXT: [[PV:%.*]] = load <2 x float>, <2 x float>* [[P:%.*]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float>* [[P]] to i8*
+; CHECK-NEXT: [[TMP2:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_load(i8* [[TMP1]], i64 2)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8* [[TMP2]], null
+; CHECK-NEXT: br i1 [[TMP3]], label [[TMP7:%.*]], label [[TMP4:%.*]]
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <2 x double>*
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 1
+; CHECK-NEXT: br label [[TMP9:%.*]]
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = fpext <2 x float> [[PV]] to <2 x double>
+; CHECK-NEXT: br label [[TMP9]]
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x double> [ [[TMP6]], [[TMP4]] ], [ [[TMP8]], [[TMP7]] ]
+; CHECK-NEXT: [[QV:%.*]] = load <2 x float>, <2 x float>* [[Q:%.*]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x float>* [[Q]] to i8*
+; CHECK-NEXT: [[TMP12:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_load(i8* [[TMP11]], i64 2)
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i8* [[TMP12]], null
+; CHECK-NEXT: br i1 [[TMP13]], label [[TMP17:%.*]], label [[TMP14:%.*]]
+; CHECK: 14:
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP12]] to <2 x double>*
+; CHECK-NEXT: [[TMP16:%.*]] = load <2 x double>, <2 x double>* [[TMP15]], align 1
+; CHECK-NEXT: br label [[TMP19:%.*]]
+; CHECK: 17:
+; CHECK-NEXT: [[TMP18:%.*]] = fpext <2 x float> [[QV]] to <2 x double>
+; CHECK-NEXT: br label [[TMP19]]
+; CHECK: 19:
+; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x double> [ [[TMP16]], [[TMP14]] ], [ [[TMP18]], [[TMP17]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x float>* [[Q]] to i8*
+; CHECK-NEXT: [[TMP22:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_store(i8* [[TMP21]], i64 2)
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[PV]], i64 0
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x double> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint <2 x float>* [[Q]] to i64
+; CHECK-NEXT: [[TMP26:%.*]] = call i32 @__nsan_internal_check_float_d(float [[TMP23]], double [[TMP24]], i32 4, i64 [[TMP25]])
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[PV]], i64 1
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[TMP10]], i64 1
+; CHECK-NEXT: [[TMP29:%.*]] = ptrtoint <2 x float>* [[Q]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = call i32 @__nsan_internal_check_float_d(float [[TMP27]], double [[TMP28]], i32 4, i64 [[TMP29]])
+; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP26]], [[TMP30]]
+; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 1
+; CHECK-NEXT: [[TMP33:%.*]] = fpext <2 x float> [[PV]] to <2 x double>
+; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP32]], <2 x double> [[TMP33]], <2 x double> [[TMP20]]
+; CHECK-NEXT: [[TMP35:%.*]] = bitcast i8* [[TMP22]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP34]], <2 x double>* [[TMP35]], align 1
+; CHECK-NEXT: store <2 x float> [[PV]], <2 x float>* [[Q]], align 16
+; CHECK-NEXT: [[TMP36:%.*]] = bitcast <2 x float>* [[P]] to i8*
+; CHECK-NEXT: [[TMP37:%.*]] = call i8* @__nsan_get_shadow_ptr_for_float_store(i8* [[TMP36]], i64 2)
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[QV]], i64 0
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x double> [[TMP20]], i64 0
+; CHECK-NEXT: [[TMP40:%.*]] = ptrtoint <2 x float>* [[P]] to i64
+; CHECK-NEXT: [[TMP41:%.*]] = call i32 @__nsan_internal_check_float_d(float [[TMP38]], double [[TMP39]], i32 4, i64 [[TMP40]])
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[QV]], i64 1
+; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x double> [[TMP20]], i64 1
+; CHECK-NEXT: [[TMP44:%.*]] = ptrtoint <2 x float>* [[P]] to i64
+; CHECK-NEXT: [[TMP45:%.*]] = call i32 @__nsan_internal_check_float_d(float [[TMP42]], double [[TMP43]], i32 4, i64 [[TMP44]])
+; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP41]], [[TMP45]]
+; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i32 [[TMP46]], 1
+; CHECK-NEXT: [[TMP48:%.*]] = fpext <2 x float> [[QV]] to <2 x double>
+; CHECK-NEXT: [[TMP49:%.*]] = select i1 [[TMP47]], <2 x double> [[TMP48]], <2 x double> [[TMP20]]
+; CHECK-NEXT: [[TMP50:%.*]] = bitcast i8* [[TMP37]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP49]], <2 x double>* [[TMP50]], align 1
+; CHECK-NEXT: store <2 x float> [[QV]], <2 x float>* [[P]], align 16
+; CHECK-NEXT: ret void
+;
+  %pv = load <2 x float>, <2 x float>* %p
+  %qv = load <2 x float>, <2 x float>* %q
+  store <2 x float> %pv, <2 x float>* %q, align 16
+  store <2 x float> %qv, <2 x float>* %p, align 16
+  ret void
+}
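+
+; Informal sketch of the pattern exercised above (comments only, not checked
+; by FileCheck; names are illustrative). For a scalar load
+; `%v = load float, float* %p`, the pass materializes the shadow value as:
+;   %s = call i8* @__nsan_get_shadow_ptr_for_float_load(i8* %p.i8, i64 1)
+;   ; if %s is null:  %shadow = fpext float %v to double
+;   ; otherwise:      %shadow = load double through %s
+; and for `store float %v, float* %q` it verifies the shadow before writing
+; both values back:
+;   %w = call i32 @__nsan_internal_check_float_d(float %v, double %shadow, i32 4, i64 %q.addr)
+;   ; if %w == 1, the shadow is resumed from `fpext float %v to double`
+;   ; before being stored through @__nsan_get_shadow_ptr_for_float_store.
+; For vectors, each element is checked separately and the i32 results are
+; or'ed together before the resume decision, as in the checks above.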