diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -1209,7 +1209,9 @@
   set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "Don't install benchmark" FORCE)
   set(BENCHMARK_DOWNLOAD_DEPENDENCIES OFF CACHE BOOL "Don't download dependencies" FORCE)
   set(BENCHMARK_ENABLE_GTEST_TESTS OFF CACHE BOOL "Disable Google Test in benchmark" FORCE)
+  set(BENCHMARK_ENABLE_WERROR "${LLVM_ENABLE_WERROR}" CACHE BOOL
+    "Handle -Werror for Google Benchmark based on LLVM_ENABLE_WERROR" FORCE)
   # Since LLVM requires C++11 it is safe to assume that std::regex is available.
   set(HAVE_STD_REGEX ON CACHE BOOL "OK" FORCE)
   add_subdirectory(${LLVM_THIRD_PARTY_DIR}/benchmark 
     ${CMAKE_CURRENT_BINARY_DIR}/third-party/benchmark)
diff --git a/third-party/benchmark/.clang-format b/third-party/benchmark/.clang-format
new file mode 100644
--- /dev/null
+++ b/third-party/benchmark/.clang-format
@@ -0,0 +1,5 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+PointerAlignment: Left
+...
diff --git a/third-party/benchmark/.clang-tidy b/third-party/benchmark/.clang-tidy
new file mode 100644
--- /dev/null
+++ b/third-party/benchmark/.clang-tidy
@@ -0,0 +1,7 @@
+---
+Checks:          'clang-analyzer-*,readability-redundant-*,performance-*'
+WarningsAsErrors: 'clang-analyzer-*,readability-redundant-*,performance-*'
+HeaderFilterRegex: '.*'
+AnalyzeTemporaryDtors: false
+FormatStyle:     none
+User:            user
diff --git a/third-party/benchmark/.travis.yml b/third-party/benchmark/.travis.yml
new file mode 100644
--- /dev/null
+++ b/third-party/benchmark/.travis.yml
@@ -0,0 +1,208 @@
+sudo: required
+dist: trusty
+language: cpp
+
+matrix:
+  include:
+    - compiler: gcc
+      addons:
+        apt:
+          packages:
+            - lcov
+      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage
+    - compiler: gcc
+      addons:
+        apt:
+          packages:
+            - g++-multilib
+            - libc6:i386
+      env:
+        - COMPILER=g++
+        - C_COMPILER=gcc
+        - BUILD_TYPE=Debug
+        - BUILD_32_BITS=ON
+        - EXTRA_FLAGS="-m32"
+    - compiler: gcc
+      addons:
+        apt:
+          packages:
+            - g++-multilib
+            - libc6:i386
+      env:
+        - COMPILER=g++
+        - C_COMPILER=gcc
+        - BUILD_TYPE=Release
+        - BUILD_32_BITS=ON
+        - EXTRA_FLAGS="-m32"
+    - compiler: gcc
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=g++-6 C_COMPILER=gcc-6  BUILD_TYPE=Debug
+        - ENABLE_SANITIZER=1
+        - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
+    # Clang w/ libc++
+    - compiler: clang
+      dist: xenial
+      addons:
+        apt:
+          packages:
+            clang-3.8
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+        - LIBCXX_BUILD=1
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
+    - compiler: clang
+      dist: xenial
+      addons:
+        apt:
+          packages:
+            clang-3.8
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
+        - LIBCXX_BUILD=1
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
+    # Clang w/ 32bit libc++
+    - compiler: clang
+      dist: xenial
+      addons:
+        apt:
+          packages:
+            - clang-3.8
+            - g++-multilib
+            - libc6:i386
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+        - LIBCXX_BUILD=1
+        - BUILD_32_BITS=ON
+        - EXTRA_FLAGS="-m32"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
+    # Clang w/ 32bit libc++
+    - compiler: clang
+      dist: xenial
+      addons:
+        apt:
+          packages:
+            - clang-3.8
+            - g++-multilib
+            - libc6:i386
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
+        - LIBCXX_BUILD=1
+        - BUILD_32_BITS=ON
+        - EXTRA_FLAGS="-m32"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
+    # Clang w/ libc++, ASAN, UBSAN
+    - compiler: clang
+      dist: xenial
+      addons:
+        apt:
+          packages:
+            clang-3.8
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+        - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address"
+        - ENABLE_SANITIZER=1
+        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
+        - UBSAN_OPTIONS=print_stacktrace=1
+    # Clang w/ libc++ and MSAN
+    - compiler: clang
+      dist: xenial
+      addons:
+        apt:
+          packages:
+            clang-3.8
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins
+        - ENABLE_SANITIZER=1
+        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
+    # Clang w/ libc++ and TSAN
+    - compiler: clang
+      dist: xenial
+      addons:
+        apt:
+          packages:
+            clang-3.8
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo
+        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread
+        - ENABLE_SANITIZER=1
+        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
+    - os: osx
+      osx_image: xcode8.3
+      compiler: clang
+      env:
+        - COMPILER=clang++
+        - BUILD_TYPE=Release
+        - BUILD_32_BITS=ON
+        - EXTRA_FLAGS="-m32"
+
+before_script:
+  - if [ -n "${LIBCXX_BUILD}" ]; then
+      source .libcxx-setup.sh;
+    fi
+  - if [ -n "${ENABLE_SANITIZER}" ]; then
+      export EXTRA_OPTIONS="-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF";
+    else
+      export EXTRA_OPTIONS="";
+    fi
+  - mkdir -p build && cd build
+
+before_install:
+  - if [ -z "$BUILD_32_BITS" ]; then
+      export BUILD_32_BITS=OFF && echo disabling 32 bit build;
+    fi
+  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
+      sudo add-apt-repository -y "ppa:ubuntu-toolchain-r/test";
+      sudo apt-get update --option Acquire::Retries=100 --option Acquire::http::Timeout="60";
+    fi
+
+install:
+  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
+      travis_wait sudo -E apt-get -yq --no-install-suggests --no-install-recommends install g++-6;
+    fi
+  - if [ "${TRAVIS_OS_NAME}" == "linux" -a "${BUILD_32_BITS}" == "OFF" ]; then
+      travis_wait sudo -E apt-get -y --no-install-suggests --no-install-recommends install llvm-3.9-tools;
+      sudo cp /usr/lib/llvm-3.9/bin/FileCheck /usr/local/bin/;
+    fi
+  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
+      PATH=~/.local/bin:${PATH};
+      pip install --user --upgrade pip;
+      travis_wait pip install --user cpp-coveralls;
+    fi
+  - if [ "${C_COMPILER}" == "gcc-7" -a "${TRAVIS_OS_NAME}" == "osx" ]; then
+      rm -f /usr/local/include/c++;
+      brew update;
+      travis_wait brew install gcc@7;
+    fi
+  - if [ "${TRAVIS_OS_NAME}" == "linux" ]; then
+      sudo apt-get update -qq;
+      sudo apt-get install -qq unzip cmake3;
+      wget https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-linux-x86_64.sh --output-document bazel-installer.sh;
+      travis_wait sudo bash bazel-installer.sh;
+    fi
+  - if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
+      curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-darwin-x86_64.sh;
+      travis_wait sudo bash bazel-installer.sh;
+    fi
+
+script:
+  - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_C_FLAGS="${EXTRA_FLAGS}" -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS} ${EXTRA_CXX_FLAGS}" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ${EXTRA_OPTIONS} ..
+  - make
+  - ctest -C ${BUILD_TYPE} --output-on-failure
+  - bazel test -c dbg --define google_benchmark.have_regex=posix --announce_rc --verbose_failures --test_output=errors --keep_going //test/...
+
+after_success:
+  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
+      coveralls --include src --include include --gcov-options '\-lp' --root .. --build-root .;
+    fi
diff --git a/third-party/benchmark/.ycm_extra_conf.py b/third-party/benchmark/.ycm_extra_conf.py
new file mode 100644
--- /dev/null
+++ b/third-party/benchmark/.ycm_extra_conf.py
@@ -0,0 +1,115 @@
+import os
+import ycm_core
+
+# These are the compilation flags that will be used in case there's no
+# compilation database set (by default, one is not set).
+# CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR.
+flags = [
+'-Wall',
+'-Werror',
+'-pedantic-errors',
+'-std=c++0x',
+'-fno-strict-aliasing',
+'-O3',
+'-DNDEBUG',
+# The magic -x option specifies the language that the files to be
+# compiled are written in.
+# This is mostly relevant for c++ headers.
+# For a C project, you would set this to 'c' instead of 'c++'.
+'-x', 'c++',
+'-I', 'include',
+'-isystem', '/usr/include',
+'-isystem', '/usr/local/include',
+]
+
+
+# Set this to the absolute path to the folder (NOT the file!) containing the
+# compile_commands.json file to use that instead of 'flags'. See here for
+# more details: http://clang.llvm.org/docs/JSONCompilationDatabase.html
+#
+# Most projects will NOT need to set this to anything; you can just change the
+# 'flags' list of compilation flags. Notice that YCM itself uses that approach.
+compilation_database_folder = ''
+
+if os.path.exists( compilation_database_folder ):
+  database = ycm_core.CompilationDatabase( compilation_database_folder )
+else:
+  database = None
+
+SOURCE_EXTENSIONS = [ '.cc' ]
+
+def DirectoryOfThisScript():
+  return os.path.dirname( os.path.abspath( __file__ ) )
+
+
+def MakeRelativePathsInFlagsAbsolute( flags, working_directory ):
+  if not working_directory:
+    return list( flags )
+  new_flags = []
+  make_next_absolute = False
+  path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ]
+  for flag in flags:
+    new_flag = flag
+
+    if make_next_absolute:
+      make_next_absolute = False
+      if not flag.startswith( '/' ):
+        new_flag = os.path.join( working_directory, flag )
+
+    for path_flag in path_flags:
+      if flag == path_flag:
+        make_next_absolute = True
+        break
+
+      if flag.startswith( path_flag ):
+        path = flag[ len( path_flag ): ]
+        new_flag = path_flag + os.path.join( working_directory, path )
+        break
+
+    if new_flag:
+      new_flags.append( new_flag )
+  return new_flags
+
+
+def IsHeaderFile( filename ):
+  extension = os.path.splitext( filename )[ 1 ]
+  return extension in [ '.h', '.hxx', '.hpp', '.hh' ]
+
+
+def GetCompilationInfoForFile( filename ):
+  # The compile_commands.json file generated by CMake does not have entries
+  # for header files. So we do our best by asking the db for flags for a
+  # corresponding source file, if any. If one exists, the flags for that file
+  # should be good enough.
+  if IsHeaderFile( filename ):
+    basename = os.path.splitext( filename )[ 0 ]
+    for extension in SOURCE_EXTENSIONS:
+      replacement_file = basename + extension
+      if os.path.exists( replacement_file ):
+        compilation_info = database.GetCompilationInfoForFile(
+          replacement_file )
+        if compilation_info.compiler_flags_:
+          return compilation_info
+    return None
+  return database.GetCompilationInfoForFile( filename )
+
+
+def FlagsForFile( filename, **kwargs ):
+  if database:
+    # Bear in mind that compilation_info.compiler_flags_ does NOT return a
+    # python list, but a "list-like" StringVec object
+    compilation_info = GetCompilationInfoForFile( filename )
+    if not compilation_info:
+      return None
+
+    final_flags = MakeRelativePathsInFlagsAbsolute(
+      compilation_info.compiler_flags_,
+      compilation_info.compiler_working_dir_ )
+  else:
+    relative_to = DirectoryOfThisScript()
+    final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to )
+
+  return {
+    'flags': final_flags,
+    'do_cache': True
+  }
diff --git a/third-party/benchmark/AUTHORS b/third-party/benchmark/AUTHORS
--- a/third-party/benchmark/AUTHORS
+++ b/third-party/benchmark/AUTHORS
@@ -21,6 +21,8 @@
 Deniz Evrenci <denizevrenci@gmail.com>
 Dirac Research 
 Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Dominik Korman <kormandominik@gmail.com>
+Donald Aingworth <donalds_junk_mail@yahoo.com>
 Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
diff --git a/third-party/benchmark/BUILD.bazel b/third-party/benchmark/BUILD.bazel
--- a/third-party/benchmark/BUILD.bazel
+++ b/third-party/benchmark/BUILD.bazel
@@ -1,9 +1,17 @@
-load("@rules_cc//cc:defs.bzl", "cc_library")
-
 licenses(["notice"])
 
+config_setting(
+    name = "qnx",
+    constraint_values = ["@platforms//os:qnx"],
+    values = {
+        "cpu": "x64_qnx",
+    },
+    visibility = [":__subpackages__"],
+)
+
 config_setting(
     name = "windows",
+    constraint_values = ["@platforms//os:windows"],
     values = {
         "cpu": "x64_windows",
     },
diff --git a/third-party/benchmark/CMakeLists.txt b/third-party/benchmark/CMakeLists.txt
--- a/third-party/benchmark/CMakeLists.txt
+++ b/third-party/benchmark/CMakeLists.txt
@@ -13,18 +13,31 @@
   endif()
 endforeach()
 
-project (benchmark VERSION 1.5.4 LANGUAGES CXX)
+project (benchmark VERSION 1.6.0 LANGUAGES CXX)
 
 option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
 option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
 option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
 option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
+option(BENCHMARK_ENABLE_WERROR "Build Release candidates with -Werror." ON)
+option(BENCHMARK_FORCE_WERROR "Build Release candidates with -Werror regardless of compiler issues." OFF)
+
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")
+  # PGC++ may report false-positive warnings, so do not promote them to errors.
+  set(BENCHMARK_ENABLE_WERROR OFF)
+endif()
+if(BENCHMARK_FORCE_WERROR)
+  set(BENCHMARK_ENABLE_WERROR ON)
+endif()
+
 if(NOT MSVC)
   option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
 else()
   set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE)
 endif()
 option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON)
+option(BENCHMARK_ENABLE_DOXYGEN "Build documentation with Doxygen." OFF)
+option(BENCHMARK_INSTALL_DOCS "Enable installation of documentation." ON)
 
 # Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which
 # may require downloading the source code.
@@ -33,6 +46,7 @@
 # This option can be used to disable building and running unit tests which depend on gtest
 # in cases where it is not possible to build or find a valid version of gtest.
 option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON)
+option(BENCHMARK_USE_BUNDLED_GTEST "Use bundled GoogleTest. If disabled, the find_package(GTest) will be used." ON)
 
 option(BENCHMARK_ENABLE_LIBPFM "Enable performance counters provided by libpfm" OFF)
 
@@ -112,6 +126,9 @@
 include(CheckCXXCompilerFlag)
 include(AddCXXCompilerFlag)
 include(CXXFeatureCheck)
+include(CheckLibraryExists)
+
+check_library_exists(rt shm_open "" HAVE_LIB_RT)
 
 if (BENCHMARK_BUILD_32_BITS)
   add_required_cxx_compiler_flag(-m32)
@@ -160,9 +177,11 @@
   add_cxx_compiler_flag(-Wall)
   add_cxx_compiler_flag(-Wextra)
   add_cxx_compiler_flag(-Wshadow)
-  add_cxx_compiler_flag(-Werror RELEASE)
-  add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
-  add_cxx_compiler_flag(-Werror MINSIZEREL)
+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-Werror RELEASE)
+      add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
+      add_cxx_compiler_flag(-Werror MINSIZEREL)
+  endif()
   if (NOT BENCHMARK_ENABLE_TESTING)
     # Disable warning when compiling tests as gtest does not use 'override'.
     add_cxx_compiler_flag(-Wsuggest-override)
@@ -181,9 +200,11 @@
     add_cxx_compiler_flag(-wd1786)
   endif()
   # Disable deprecation warnings for release builds (when -Werror is enabled).
-  add_cxx_compiler_flag(-Wno-deprecated RELEASE)
-  add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO)
-  add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL)
+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-Wno-deprecated RELEASE)
+      add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO)
+      add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL)
+  endif()
   if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
     add_cxx_compiler_flag(-fno-exceptions)
   endif()
@@ -307,7 +328,15 @@
   if (BENCHMARK_ENABLE_GTEST_TESTS AND
       NOT (TARGET gtest AND TARGET gtest_main AND
            TARGET gmock AND TARGET gmock_main))
-    include(GoogleTest)
+    if (BENCHMARK_USE_BUNDLED_GTEST)
+      include(GoogleTest)
+    else()
+      find_package(GTest CONFIG REQUIRED)
+      add_library(gtest ALIAS GTest::gtest)
+      add_library(gtest_main ALIAS GTest::gtest_main)
+      add_library(gmock ALIAS GTest::gmock)
+      add_library(gmock_main ALIAS GTest::gmock_main)
+    endif()
   endif()
   add_subdirectory(test)
 endif()
diff --git a/third-party/benchmark/CONTRIBUTORS b/third-party/benchmark/CONTRIBUTORS
--- a/third-party/benchmark/CONTRIBUTORS
+++ b/third-party/benchmark/CONTRIBUTORS
@@ -38,6 +38,8 @@
 Deniz Evrenci <denizevrenci@gmail.com>
 Dominic Hamon <dma@stripysock.com> <dominic@google.com>
 Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Dominik Korman <kormandominik@gmail.com>
+Donald Aingworth <donalds_junk_mail@yahoo.com>
 Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
diff --git a/third-party/benchmark/README.md b/third-party/benchmark/README.md
--- a/third-party/benchmark/README.md
+++ b/third-party/benchmark/README.md
@@ -27,14 +27,16 @@
 BENCHMARK_MAIN();
 ```
 
+## Getting Started
+
 To get started, see [Requirements](#requirements) and
 [Installation](#installation). See [Usage](#usage) for a full example and the
-[User Guide](#user-guide) for a more comprehensive feature overview.
+[User Guide](docs/user_guide.md) for a more comprehensive feature overview.
 
 It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/master/docs/primer.md)
 as some of the structural aspects of the APIs are similar.
 
-### Resources
+## Resources
 
 [Discussion group](https://groups.google.com/d/forum/benchmark-discuss)
 
@@ -57,27 +59,25 @@
 * Visual Studio 14 2015
 * Intel 2015 Update 1
 
-See [Platform-Specific Build Instructions](#platform-specific-build-instructions).
+See [Platform-Specific Build Instructions](docs/platform_specific_build_instructions.md).
 
 ## Installation
 
 This describes the installation process using cmake. As pre-requisites, you'll
 need git and cmake installed.
 
-_See [dependencies.md](dependencies.md) for more details regarding supported
+_See [dependencies.md](docs/dependencies.md) for more details regarding supported
 versions of build tools._
 
 ```bash
 # Check out the library.
 $ git clone https://github.com/google/benchmark.git
-# Benchmark requires Google Test as a dependency. Add the source tree as a subdirectory.
-$ git clone https://github.com/google/googletest.git benchmark/googletest
 # Go to the library root directory
 $ cd benchmark
 # Make a build directory to place the build output.
 $ cmake -E make_directory "build"
-# Generate build system files with cmake.
-$ cmake -E chdir "build" cmake -DCMAKE_BUILD_TYPE=Release ../
+# Generate build system files with cmake, and download any dependencies.
+$ cmake -E chdir "build" cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=on -DCMAKE_BUILD_TYPE=Release ../
 # or, starting with CMake 3.13, use a simpler form:
 # cmake -DCMAKE_BUILD_TYPE=Release -S . -B "build"
 # Build the library.
@@ -111,10 +111,10 @@
 Note that Google Benchmark requires Google Test to build and run the tests. This
 dependency can be provided two ways:
 
-* Checkout the Google Test sources into `benchmark/googletest` as above.
+* Check out the Google Test sources into `benchmark/googletest`.
 * Otherwise, if `-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON` is specified during
-  configuration, the library will automatically download and build any required
-  dependencies.
+  configuration as above, the library will automatically download and build
+  any required dependencies.
 
 If you do not wish to build and run the tests, add `-DBENCHMARK_ENABLE_GTEST_TESTS=OFF`
 to `CMAKE_ARGS`.
@@ -193,7 +193,7 @@
 `BENCHMARK_MAIN();` above to get the same behavior.
 
 The compiled executable will run all benchmarks by default. Pass the `--help`
-flag for option information or see the guide below.
+flag for option information or see the [User Guide](docs/user_guide.md).
 
 ### Usage with CMake
 
@@ -214,1165 +214,3 @@
 ```cmake
 target_link_libraries(MyTarget benchmark::benchmark)
 ```
-
-## Platform Specific Build Instructions
-
-### Building with GCC
-
-When the library is built using GCC it is necessary to link with the pthread
-library due to how GCC implements `std::thread`. Failing to link to pthread will
-lead to runtime exceptions (unless you're using libc++), not linker errors. See
-[issue #67](https://github.com/google/benchmark/issues/67) for more details. You
-can link to pthread by adding `-pthread` to your linker command. Note, you can
-also use `-lpthread`, but there are potential issues with ordering of command
-line parameters if you use that.
-
-### Building with Visual Studio 2015 or 2017
-
-The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following:
-
-```
-// Alternatively, can add libraries using linker options.
-#ifdef _WIN32
-#pragma comment ( lib, "Shlwapi.lib" )
-#ifdef _DEBUG
-#pragma comment ( lib, "benchmarkd.lib" )
-#else
-#pragma comment ( lib, "benchmark.lib" )
-#endif
-#endif
-```
-
-Can also use the graphical version of CMake:
-* Open `CMake GUI`.
-* Under `Where to build the binaries`, same path as source plus `build`.
-* Under `CMAKE_INSTALL_PREFIX`, same path as source plus `install`.
-* Click `Configure`, `Generate`, `Open Project`.
-* If build fails, try deleting entire directory and starting again, or unticking options to build less.
-
-### Building with Intel 2015 Update 1 or Intel System Studio Update 4
-
-See instructions for building with Visual Studio. Once built, right click on the solution and change the build to Intel.
-
-### Building on Solaris
-
-If you're running benchmarks on solaris, you'll want the kstat library linked in
-too (`-lkstat`).
-
-## User Guide
-
-### Command Line
-
-[Output Formats](#output-formats)
-
-[Output Files](#output-files)
-
-[Running Benchmarks](#running-benchmarks)
-
-[Running a Subset of Benchmarks](#running-a-subset-of-benchmarks)
-
-[Result Comparison](#result-comparison)
-
-[Extra Context](#extra-context)
-
-### Library
-
-[Runtime and Reporting Considerations](#runtime-and-reporting-considerations)
-
-[Passing Arguments](#passing-arguments)
-
-[Custom Benchmark Name](#custom-benchmark-name)
-
-[Calculating Asymptotic Complexity](#asymptotic-complexity)
-
-[Templated Benchmarks](#templated-benchmarks)
-
-[Fixtures](#fixtures)
-
-[Custom Counters](#custom-counters)
-
-[Multithreaded Benchmarks](#multithreaded-benchmarks)
-
-[CPU Timers](#cpu-timers)
-
-[Manual Timing](#manual-timing)
-
-[Setting the Time Unit](#setting-the-time-unit)
-
-[Random Interleaving](docs/random_interleaving.md)
-
-[User-Requested Performance Counters](docs/perf_counters.md)
-
-[Preventing Optimization](#preventing-optimization)
-
-[Reporting Statistics](#reporting-statistics)
-
-[Custom Statistics](#custom-statistics)
-
-[Using RegisterBenchmark](#using-register-benchmark)
-
-[Exiting with an Error](#exiting-with-an-error)
-
-[A Faster KeepRunning Loop](#a-faster-keep-running-loop)
-
-[Disabling CPU Frequency Scaling](#disabling-cpu-frequency-scaling)
-
-
-<a name="output-formats" />
-
-### Output Formats
-
-The library supports multiple output formats. Use the
-`--benchmark_format=<console|json|csv>` flag (or set the
-`BENCHMARK_FORMAT=<console|json|csv>` environment variable) to set
-the format type. `console` is the default format.
-
-The Console format is intended to be a human readable format. By default
-the format generates color output. Context is output on stderr and the
-tabular data on stdout. Example tabular output looks like:
-
-```
-Benchmark                               Time(ns)    CPU(ns) Iterations
-----------------------------------------------------------------------
-BM_SetInsert/1024/1                        28928      29349      23853  133.097kB/s   33.2742k items/s
-BM_SetInsert/1024/8                        32065      32913      21375  949.487kB/s   237.372k items/s
-BM_SetInsert/1024/10                       33157      33648      21431  1.13369MB/s   290.225k items/s
-```
-
-The JSON format outputs human readable json split into two top level attributes.
-The `context` attribute contains information about the run in general, including
-information about the CPU and the date.
-The `benchmarks` attribute contains a list of every benchmark run. Example json
-output looks like:
-
-```json
-{
-  "context": {
-    "date": "2015/03/17-18:40:25",
-    "num_cpus": 40,
-    "mhz_per_cpu": 2801,
-    "cpu_scaling_enabled": false,
-    "build_type": "debug"
-  },
-  "benchmarks": [
-    {
-      "name": "BM_SetInsert/1024/1",
-      "iterations": 94877,
-      "real_time": 29275,
-      "cpu_time": 29836,
-      "bytes_per_second": 134066,
-      "items_per_second": 33516
-    },
-    {
-      "name": "BM_SetInsert/1024/8",
-      "iterations": 21609,
-      "real_time": 32317,
-      "cpu_time": 32429,
-      "bytes_per_second": 986770,
-      "items_per_second": 246693
-    },
-    {
-      "name": "BM_SetInsert/1024/10",
-      "iterations": 21393,
-      "real_time": 32724,
-      "cpu_time": 33355,
-      "bytes_per_second": 1199226,
-      "items_per_second": 299807
-    }
-  ]
-}
-```
-
-The CSV format outputs comma-separated values. The `context` is output on stderr
-and the CSV itself on stdout. Example CSV output looks like:
-
-```
-name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label
-"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942,
-"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115,
-"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06,
-```
-
-<a name="output-files" />
-
-### Output Files
-
-Write benchmark results to a file with the `--benchmark_out=<filename>` option
-(or set `BENCHMARK_OUT`). Specify the output format with
-`--benchmark_out_format={json|console|csv}` (or set
-`BENCHMARK_OUT_FORMAT={json|console|csv}`). Note that the 'csv' reporter is
-deprecated and the saved `.csv` file 
-[is not parsable](https://github.com/google/benchmark/issues/794) by csv 
-parsers.
-
-Specifying `--benchmark_out` does not suppress the console output.
-
-<a name="running-benchmarks" />
-
-### Running Benchmarks
-
-Benchmarks are executed by running the produced binaries. Benchmarks binaries,
-by default, accept options that may be specified either through their command
-line interface or by setting environment variables before execution. For every
-`--option_flag=<value>` CLI switch, a corresponding environment variable
-`OPTION_FLAG=<value>` exist and is used as default if set (CLI switches always
- prevails). A complete list of CLI options is available running benchmarks
- with the `--help` switch.
-
-<a name="running-a-subset-of-benchmarks" />
-
-### Running a Subset of Benchmarks
-
-The `--benchmark_filter=<regex>` option (or `BENCHMARK_FILTER=<regex>`
-environment variable) can be used to only run the benchmarks that match
-the specified `<regex>`. For example:
-
-```bash
-$ ./run_benchmarks.x --benchmark_filter=BM_memcpy/32
-Run on (1 X 2300 MHz CPU )
-2016-06-25 19:34:24
-Benchmark              Time           CPU Iterations
-----------------------------------------------------
-BM_memcpy/32          11 ns         11 ns   79545455
-BM_memcpy/32k       2181 ns       2185 ns     324074
-BM_memcpy/32          12 ns         12 ns   54687500
-BM_memcpy/32k       1834 ns       1837 ns     357143
-```
-
-<a name="result-comparison" />
-
-### Result comparison
-
-It is possible to compare the benchmarking results.
-See [Additional Tooling Documentation](docs/tools.md)
-
-<a name="extra-context" />
-
-### Extra Context
-
-Sometimes it's useful to add extra context to the content printed before the
-results. By default this section includes information about the CPU on which
-the benchmarks are running. If you do want to add more context, you can use
-the `benchmark_context` command line flag:
-
-```bash
-$ ./run_benchmarks --benchmark_context=pwd=`pwd`
-Run on (1 x 2300 MHz CPU)
-pwd: /home/user/benchmark/
-Benchmark              Time           CPU Iterations
-----------------------------------------------------
-BM_memcpy/32          11 ns         11 ns   79545455
-BM_memcpy/32k       2181 ns       2185 ns     324074
-```
-
-You can get the same effect with the API:
-
-```c++
-  benchmark::AddCustomContext("foo", "bar");
-```
-
-Note that attempts to add a second value with the same key will fail with an
-error message.
-
-<a name="runtime-and-reporting-considerations" />
-
-### Runtime and Reporting Considerations
-
-When the benchmark binary is executed, each benchmark function is run serially.
-The number of iterations to run is determined dynamically by running the
-benchmark a few times and measuring the time taken and ensuring that the
-ultimate result will be statistically stable. As such, faster benchmark
-functions will be run for more iterations than slower benchmark functions, and
-the number of iterations is thus reported.
-
-In all cases, the number of iterations for which the benchmark is run is
-governed by the amount of time the benchmark takes. Concretely, the number of
-iterations is at least one, not more than 1e9, until CPU time is greater than
-the minimum time, or the wallclock time is 5x minimum time. The minimum time is
-set per benchmark by calling `MinTime` on the registered benchmark object.
-
-Average timings are then reported over the iterations run. If multiple
-repetitions are requested using the `--benchmark_repetitions` command-line
-option, or at registration time, the benchmark function will be run several
-times and statistical results across these repetitions will also be reported.
-
-As well as the per-benchmark entries, a preamble in the report will include
-information about the machine on which the benchmarks are run.
-
-<a name="passing-arguments" />
-
-### Passing Arguments
-
-Sometimes a family of benchmarks can be implemented with just one routine that
-takes an extra argument to specify which one of the family of benchmarks to
-run. For example, the following code defines a family of benchmarks for
-measuring the speed of `memcpy()` calls of different lengths:
-
-```c++
-static void BM_memcpy(benchmark::State& state) {
-  char* src = new char[state.range(0)];
-  char* dst = new char[state.range(0)];
-  memset(src, 'x', state.range(0));
-  for (auto _ : state)
-    memcpy(dst, src, state.range(0));
-  state.SetBytesProcessed(int64_t(state.iterations()) *
-                          int64_t(state.range(0)));
-  delete[] src;
-  delete[] dst;
-}
-BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
-```
-
-The preceding code is quite repetitive, and can be replaced with the following
-short-hand. The following invocation will pick a few appropriate arguments in
-the specified range and will generate a benchmark for each such argument.
-
-```c++
-BENCHMARK(BM_memcpy)->Range(8, 8<<10);
-```
-
-By default the arguments in the range are generated in multiples of eight and
-the command above selects [ 8, 64, 512, 4k, 8k ]. In the following code the
-range multiplier is changed to multiples of two.
-
-```c++
-BENCHMARK(BM_memcpy)->RangeMultiplier(2)->Range(8, 8<<10);
-```
-
-Now arguments generated are [ 8, 16, 32, 64, 128, 256, 512, 1024, 2k, 4k, 8k ].
-
-The preceding code shows a method of defining a sparse range.  The following
-example shows a method of defining a dense range. It is then used to benchmark
-the performance of `std::vector` initialization for uniformly increasing sizes.
-
-```c++
-static void BM_DenseRange(benchmark::State& state) {
-  for(auto _ : state) {
-    std::vector<int> v(state.range(0), state.range(0));
-    benchmark::DoNotOptimize(v.data());
-    benchmark::ClobberMemory();
-  }
-}
-BENCHMARK(BM_DenseRange)->DenseRange(0, 1024, 128);
-```
-
-Now arguments generated are [ 0, 128, 256, 384, 512, 640, 768, 896, 1024 ].
-
-You might have a benchmark that depends on two or more inputs. For example, the
-following code defines a family of benchmarks for measuring the speed of set
-insertion.
-
-```c++
-static void BM_SetInsert(benchmark::State& state) {
-  std::set<int> data;
-  for (auto _ : state) {
-    state.PauseTiming();
-    data = ConstructRandomSet(state.range(0));
-    state.ResumeTiming();
-    for (int j = 0; j < state.range(1); ++j)
-      data.insert(RandomNumber());
-  }
-}
-BENCHMARK(BM_SetInsert)
-    ->Args({1<<10, 128})
-    ->Args({2<<10, 128})
-    ->Args({4<<10, 128})
-    ->Args({8<<10, 128})
-    ->Args({1<<10, 512})
-    ->Args({2<<10, 512})
-    ->Args({4<<10, 512})
-    ->Args({8<<10, 512});
-```
-
-The preceding code is quite repetitive, and can be replaced with the following
-short-hand. The following macro will pick a few appropriate arguments in the
-product of the two specified ranges and will generate a benchmark for each such
-pair.
-
-```c++
-BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}});
-```
-
-Some benchmarks may require specific argument values that cannot be expressed
-with `Ranges`. In this case, `ArgsProduct` offers the ability to generate a
-benchmark input for each combination in the product of the supplied vectors.
-
-```c++
-BENCHMARK(BM_SetInsert)
-    ->ArgsProduct({{1<<10, 3<<10, 8<<10}, {20, 40, 60, 80}})
-// would generate the same benchmark arguments as
-BENCHMARK(BM_SetInsert)
-    ->Args({1<<10, 20})
-    ->Args({3<<10, 20})
-    ->Args({8<<10, 20})
-    ->Args({3<<10, 40})
-    ->Args({8<<10, 40})
-    ->Args({1<<10, 40})
-    ->Args({1<<10, 60})
-    ->Args({3<<10, 60})
-    ->Args({8<<10, 60})
-    ->Args({1<<10, 80})
-    ->Args({3<<10, 80})
-    ->Args({8<<10, 80});
-```
-
-For more complex patterns of inputs, passing a custom function to `Apply` allows
-programmatic specification of an arbitrary set of arguments on which to run the
-benchmark. The following example enumerates a dense range on one parameter,
-and a sparse range on the second.
-
-```c++
-static void CustomArguments(benchmark::internal::Benchmark* b) {
-  for (int i = 0; i <= 10; ++i)
-    for (int j = 32; j <= 1024*1024; j *= 8)
-      b->Args({i, j});
-}
-BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
-```
-
-#### Passing Arbitrary Arguments to a Benchmark
-
-In C++11 it is possible to define a benchmark that takes an arbitrary number
-of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)`
-macro creates a benchmark that invokes `func`  with the `benchmark::State` as
-the first argument followed by the specified `args...`.
-The `test_case_name` is appended to the name of the benchmark and
-should describe the values passed.
-
-```c++
-template <class ...ExtraArgs>
-void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
-  [...]
-}
-// Registers a benchmark named "BM_takes_args/int_string_test" that passes
-// the specified values to `extra_args`.
-BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
-```
-
-Note that elements of `...args` may refer to global variables. Users should
-avoid modifying global state inside of a benchmark.
-
-<a name="asymptotic-complexity" />
-
-### Calculating Asymptotic Complexity (Big O)
-
-Asymptotic complexity might be calculated for a family of benchmarks. The
-following code will calculate the coefficient for the high-order term in the
-running time and the normalized root-mean square error of string comparison.
-
-```c++
-static void BM_StringCompare(benchmark::State& state) {
-  std::string s1(state.range(0), '-');
-  std::string s2(state.range(0), '-');
-  for (auto _ : state) {
-    benchmark::DoNotOptimize(s1.compare(s2));
-  }
-  state.SetComplexityN(state.range(0));
-}
-BENCHMARK(BM_StringCompare)
-    ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(benchmark::oN);
-```
-
-As shown in the following invocation, asymptotic complexity might also be
-calculated automatically.
-
-```c++
-BENCHMARK(BM_StringCompare)
-    ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity();
-```
-
-The following code will specify asymptotic complexity with a lambda function,
-that might be used to customize high-order term calculation.
-
-```c++
-BENCHMARK(BM_StringCompare)->RangeMultiplier(2)
-    ->Range(1<<10, 1<<18)->Complexity([](benchmark::IterationCount n)->double{return n; });
-```
-
-<a name="custom-benchmark-name" />
-
-### Custom Benchmark Name
-
-You can change the benchmark's name as follows:
-
-```c++
-BENCHMARK(BM_memcpy)->Name("memcpy")->RangeMultiplier(2)->Range(8, 8<<10);
-```
-
-The invocation will execute the benchmark as before using `BM_memcpy` but changes
-the prefix in the report to `memcpy`.
-
-<a name="templated-benchmarks" />
-
-### Templated Benchmarks
-
-This example produces and consumes messages of size `sizeof(v)` `range_x`
-times. It also outputs throughput in the absence of multiprogramming.
-
-```c++
-template <class Q> void BM_Sequential(benchmark::State& state) {
-  Q q;
-  typename Q::value_type v;
-  for (auto _ : state) {
-    for (int i = state.range(0); i--; )
-      q.push(v);
-    for (int e = state.range(0); e--; )
-      q.Wait(&v);
-  }
-  // actually messages, not bytes:
-  state.SetBytesProcessed(
-      static_cast<int64_t>(state.iterations())*state.range(0));
-}
-BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
-```
-
-Three macros are provided for adding benchmark templates.
-
-```c++
-#ifdef BENCHMARK_HAS_CXX11
-#define BENCHMARK_TEMPLATE(func, ...) // Takes any number of parameters.
-#else // C++ < C++11
-#define BENCHMARK_TEMPLATE(func, arg1)
-#endif
-#define BENCHMARK_TEMPLATE1(func, arg1)
-#define BENCHMARK_TEMPLATE2(func, arg1, arg2)
-```
-
-<a name="fixtures" />
-
-### Fixtures
-
-Fixture tests are created by first defining a type that derives from
-`::benchmark::Fixture` and then creating/registering the tests using the
-following macros:
-
-* `BENCHMARK_F(ClassName, Method)`
-* `BENCHMARK_DEFINE_F(ClassName, Method)`
-* `BENCHMARK_REGISTER_F(ClassName, Method)`
-
-For example:
-
-```c++
-class MyFixture : public benchmark::Fixture {
-public:
-  void SetUp(const ::benchmark::State& state) {
-  }
-
-  void TearDown(const ::benchmark::State& state) {
-  }
-};
-
-BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
-   for (auto _ : st) {
-     ...
-  }
-}
-
-BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) {
-   for (auto _ : st) {
-     ...
-  }
-}
-/* BarTest is NOT registered */
-BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2);
-/* BarTest is now registered */
-```
-
-#### Templated Fixtures
-
-You can also create templated fixtures by using the following macros:
-
-* `BENCHMARK_TEMPLATE_F(ClassName, Method, ...)`
-* `BENCHMARK_TEMPLATE_DEFINE_F(ClassName, Method, ...)`
-
-For example:
-
-```c++
-template<typename T>
-class MyFixture : public benchmark::Fixture {};
-
-BENCHMARK_TEMPLATE_F(MyFixture, IntTest, int)(benchmark::State& st) {
-   for (auto _ : st) {
-     ...
-  }
-}
-
-BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, DoubleTest, double)(benchmark::State& st) {
-   for (auto _ : st) {
-     ...
-  }
-}
-
-BENCHMARK_REGISTER_F(MyFixture, DoubleTest)->Threads(2);
-```
-
-<a name="custom-counters" />
-
-### Custom Counters
-
-You can add your own counters with user-defined names. The example below
-will add columns "Foo", "Bar" and "Baz" in its output:
-
-```c++
-static void UserCountersExample1(benchmark::State& state) {
-  double numFoos = 0, numBars = 0, numBazs = 0;
-  for (auto _ : state) {
-    // ... count Foo,Bar,Baz events
-  }
-  state.counters["Foo"] = numFoos;
-  state.counters["Bar"] = numBars;
-  state.counters["Baz"] = numBazs;
-}
-```
-
-The `state.counters` object is a `std::map` with `std::string` keys
-and `Counter` values. The latter is a `double`-like class, via an implicit
-conversion to `double&`. Thus you can use all of the standard arithmetic
-assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter.
-
-In multithreaded benchmarks, each counter is set on the calling thread only.
-When the benchmark finishes, the counters from each thread will be summed;
-the resulting sum is the value which will be shown for the benchmark.
-
-The `Counter` constructor accepts three parameters: the value as a `double`;
-a bit flag which allows you to show counters as rates, and/or as per-thread
-iteration, and/or as per-thread averages, and/or iteration invariants,
-and/or finally inverting the result; and a flag specifying the 'unit' - i.e.
-is 1k a 1000 (default, `benchmark::Counter::OneK::kIs1000`), or 1024
-(`benchmark::Counter::OneK::kIs1024`)?
-
-```c++
-  // sets a simple counter
-  state.counters["Foo"] = numFoos;
-
-  // Set the counter as a rate. It will be presented divided
-  // by the duration of the benchmark.
-  // Meaning: per one second, how many 'foo's are processed?
-  state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate);
-
-  // Set the counter as a rate. It will be presented divided
-  // by the duration of the benchmark, and the result inverted.
-  // Meaning: how many seconds it takes to process one 'foo'?
-  state.counters["FooInvRate"] = Counter(numFoos, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
-
-  // Set the counter as a thread-average quantity. It will
-  // be presented divided by the number of threads.
-  state.counters["FooAvg"] = Counter(numFoos, benchmark::Counter::kAvgThreads);
-
-  // There's also a combined flag:
-  state.counters["FooAvgRate"] = Counter(numFoos,benchmark::Counter::kAvgThreadsRate);
-
-  // This says that we process with the rate of state.range(0) bytes every iteration:
-  state.counters["BytesProcessed"] = Counter(state.range(0), benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024);
-```
-
-When you're compiling in C++11 mode or later you can use `insert()` with
-`std::initializer_list`:
-
-```c++
-  // With C++11, this can be done:
-  state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
-  // ... instead of:
-  state.counters["Foo"] = numFoos;
-  state.counters["Bar"] = numBars;
-  state.counters["Baz"] = numBazs;
-```
-
-#### Counter Reporting
-
-When using the console reporter, by default, user counters are printed at
-the end after the table, the same way as ``bytes_processed`` and
-``items_processed``. This is best for cases in which there are few counters,
-or where there are only a couple of lines per benchmark. Here's an example of
-the default output:
-
-```
-------------------------------------------------------------------------------
-Benchmark                        Time           CPU Iterations UserCounters...
-------------------------------------------------------------------------------
-BM_UserCounter/threads:8      2248 ns      10277 ns      68808 Bar=16 Bat=40 Baz=24 Foo=8
-BM_UserCounter/threads:1      9797 ns       9788 ns      71523 Bar=2 Bat=5 Baz=3 Foo=1024m
-BM_UserCounter/threads:2      4924 ns       9842 ns      71036 Bar=4 Bat=10 Baz=6 Foo=2
-BM_UserCounter/threads:4      2589 ns      10284 ns      68012 Bar=8 Bat=20 Baz=12 Foo=4
-BM_UserCounter/threads:8      2212 ns      10287 ns      68040 Bar=16 Bat=40 Baz=24 Foo=8
-BM_UserCounter/threads:16     1782 ns      10278 ns      68144 Bar=32 Bat=80 Baz=48 Foo=16
-BM_UserCounter/threads:32     1291 ns      10296 ns      68256 Bar=64 Bat=160 Baz=96 Foo=32
-BM_UserCounter/threads:4      2615 ns      10307 ns      68040 Bar=8 Bat=20 Baz=12 Foo=4
-BM_Factorial                    26 ns         26 ns   26608979 40320
-BM_Factorial/real_time          26 ns         26 ns   26587936 40320
-BM_CalculatePiRange/1           16 ns         16 ns   45704255 0
-BM_CalculatePiRange/8           73 ns         73 ns    9520927 3.28374
-BM_CalculatePiRange/64         609 ns        609 ns    1140647 3.15746
-BM_CalculatePiRange/512       4900 ns       4901 ns     142696 3.14355
-```
-
-If this doesn't suit you, you can print each counter as a table column by
-passing the flag `--benchmark_counters_tabular=true` to the benchmark
-application. This is best for cases in which there are a lot of counters, or
-a lot of lines per individual benchmark. Note that this will trigger a
-reprinting of the table header any time the counter set changes between
-individual benchmarks. Here's an example of corresponding output when
-`--benchmark_counters_tabular=true` is passed:
-
-```
----------------------------------------------------------------------------------------
-Benchmark                        Time           CPU Iterations    Bar   Bat   Baz   Foo
----------------------------------------------------------------------------------------
-BM_UserCounter/threads:8      2198 ns       9953 ns      70688     16    40    24     8
-BM_UserCounter/threads:1      9504 ns       9504 ns      73787      2     5     3     1
-BM_UserCounter/threads:2      4775 ns       9550 ns      72606      4    10     6     2
-BM_UserCounter/threads:4      2508 ns       9951 ns      70332      8    20    12     4
-BM_UserCounter/threads:8      2055 ns       9933 ns      70344     16    40    24     8
-BM_UserCounter/threads:16     1610 ns       9946 ns      70720     32    80    48    16
-BM_UserCounter/threads:32     1192 ns       9948 ns      70496     64   160    96    32
-BM_UserCounter/threads:4      2506 ns       9949 ns      70332      8    20    12     4
---------------------------------------------------------------
-Benchmark                        Time           CPU Iterations
---------------------------------------------------------------
-BM_Factorial                    26 ns         26 ns   26392245 40320
-BM_Factorial/real_time          26 ns         26 ns   26494107 40320
-BM_CalculatePiRange/1           15 ns         15 ns   45571597 0
-BM_CalculatePiRange/8           74 ns         74 ns    9450212 3.28374
-BM_CalculatePiRange/64         595 ns        595 ns    1173901 3.15746
-BM_CalculatePiRange/512       4752 ns       4752 ns     147380 3.14355
-BM_CalculatePiRange/4k       37970 ns      37972 ns      18453 3.14184
-BM_CalculatePiRange/32k     303733 ns     303744 ns       2305 3.14162
-BM_CalculatePiRange/256k   2434095 ns    2434186 ns        288 3.1416
-BM_CalculatePiRange/1024k  9721140 ns    9721413 ns         71 3.14159
-BM_CalculatePi/threads:8      2255 ns       9943 ns      70936
-```
-
-Note above the additional header printed when the benchmark changes from
-``BM_UserCounter`` to ``BM_Factorial``. This is because ``BM_Factorial`` does
-not have the same counter set as ``BM_UserCounter``.
-
-<a name="multithreaded-benchmarks"/>
-
-### Multithreaded Benchmarks
-
-In a multithreaded test (benchmark invoked by multiple threads simultaneously),
-it is guaranteed that none of the threads will start until all have reached
-the start of the benchmark loop, and all will have finished before any thread
-exits the benchmark loop. (This behavior is also provided by the `KeepRunning()`
-API.) As such, any global setup or teardown can be wrapped in a check against the thread
-index:
-
-```c++
-static void BM_MultiThreaded(benchmark::State& state) {
-  if (state.thread_index == 0) {
-    // Setup code here.
-  }
-  for (auto _ : state) {
-    // Run the test as normal.
-  }
-  if (state.thread_index == 0) {
-    // Teardown code here.
-  }
-}
-BENCHMARK(BM_MultiThreaded)->Threads(2);
-```
-
-If the benchmarked code itself uses threads and you want to compare it to
-single-threaded code, you may want to use real-time ("wallclock") measurements
-for latency comparisons:
-
-```c++
-BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();
-```
-
-Without `UseRealTime`, CPU time is used by default.
-
-<a name="cpu-timers" />
-
-### CPU Timers
-
-By default, the CPU timer only measures the time spent by the main thread.
-If the benchmark itself uses threads internally, this measurement may not
-be what you are looking for. Instead, there is a way to measure the total
-CPU usage of the process, by all the threads.
-
-```c++
-void callee(int i);
-
-static void MyMain(int size) {
-#pragma omp parallel for
-  for(int i = 0; i < size; i++)
-    callee(i);
-}
-
-static void BM_OpenMP(benchmark::State& state) {
-  for (auto _ : state)
-    MyMain(state.range(0));
-}
-
-// Measure the time spent by the main thread, use it to decide for how long to
-// run the benchmark loop. Depending on the internal implementation detail may
-// measure to anywhere from near-zero (the overhead spent before/after work
-// handoff to worker thread[s]) to the whole single-thread time.
-BENCHMARK(BM_OpenMP)->Range(8, 8<<10);
-
-// Measure the user-visible time, the wall clock (literally, the time that
-// has passed on the clock on the wall), use it to decide for how long to
-// run the benchmark loop. This will always be meaningful, and will match the
-// time spent by the main thread in single-threaded case, in general decreasing
-// with the number of internal threads doing the work.
-BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->UseRealTime();
-
-// Measure the total CPU consumption, use it to decide for how long to
-// run the benchmark loop. This will always measure to no less than the
-// time spent by the main thread in single-threaded case.
-BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime();
-
-// A mixture of the last two. Measure the total CPU consumption, but use the
-// wall clock to decide for how long to run the benchmark loop.
-BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime()->UseRealTime();
-```
-
-#### Controlling Timers
-
-Normally, the entire duration of the work loop (`for (auto _ : state) {}`)
-is measured. But sometimes, it is necessary to do some work inside of
-that loop, every iteration, but without counting that time to the benchmark time.
-That is possible, although it is not recommended, since it has high overhead.
-
-```c++
-static void BM_SetInsert_With_Timer_Control(benchmark::State& state) {
-  std::set<int> data;
-  for (auto _ : state) {
-    state.PauseTiming(); // Stop timers. They will not count until they are resumed.
-    data = ConstructRandomSet(state.range(0)); // Do something that should not be measured
-    state.ResumeTiming(); // And resume timers. They are now counting again.
-    // The rest will be measured.
-    for (int j = 0; j < state.range(1); ++j)
-      data.insert(RandomNumber());
-  }
-}
-BENCHMARK(BM_SetInsert_With_Timer_Control)->Ranges({{1<<10, 8<<10}, {128, 512}});
-```
-
-<a name="manual-timing" />
-
-### Manual Timing
-
-For benchmarking something for which neither CPU time nor real-time are
-correct or accurate enough, completely manual timing is supported using
-the `UseManualTime` function.
-
-When `UseManualTime` is used, the benchmarked code must call
-`SetIterationTime` once per iteration of the benchmark loop to
-report the manually measured time.
-
-An example use case for this is benchmarking GPU execution (e.g. OpenCL
-or CUDA kernels, OpenGL or Vulkan or Direct3D draw calls), which cannot
-be accurately measured using CPU time or real-time. Instead, they can be
-measured accurately using a dedicated API, and these measurement results
-can be reported back with `SetIterationTime`.
-
-```c++
-static void BM_ManualTiming(benchmark::State& state) {
-  int microseconds = state.range(0);
-  std::chrono::duration<double, std::micro> sleep_duration {
-    static_cast<double>(microseconds)
-  };
-
-  for (auto _ : state) {
-    auto start = std::chrono::high_resolution_clock::now();
-    // Simulate some useful workload with a sleep
-    std::this_thread::sleep_for(sleep_duration);
-    auto end = std::chrono::high_resolution_clock::now();
-
-    auto elapsed_seconds =
-      std::chrono::duration_cast<std::chrono::duration<double>>(
-        end - start);
-
-    state.SetIterationTime(elapsed_seconds.count());
-  }
-}
-BENCHMARK(BM_ManualTiming)->Range(1, 1<<17)->UseManualTime();
-```
-
-<a name="setting-the-time-unit" />
-
-### Setting the Time Unit
-
-If a benchmark runs for a few milliseconds it may be hard to visually compare the
-measured times, since the output data is given in nanoseconds by default. To
-set the time unit manually, specify it on the registered benchmark:
-
-```c++
-BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
-```
-
-<a name="preventing-optimization" />
-
-### Preventing Optimization
-
-To prevent a value or expression from being optimized away by the compiler
-the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()`
-functions can be used.
-
-```c++
-static void BM_test(benchmark::State& state) {
-  for (auto _ : state) {
-      int x = 0;
-      for (int i=0; i < 64; ++i) {
-        benchmark::DoNotOptimize(x += i);
-      }
-  }
-}
-```
-
-`DoNotOptimize(<expr>)` forces the  *result* of `<expr>` to be stored in either
-memory or a register. For GNU based compilers it acts as read/write barrier
-for global memory. More specifically it forces the compiler to flush pending
-writes to memory and reload any other values as necessary.
-
-Note that `DoNotOptimize(<expr>)` does not prevent optimizations on `<expr>`
-in any way. `<expr>` may even be removed entirely when the result is already
-known. For example:
-
-```c++
-  /* Example 1: `<expr>` is removed entirely. */
-  int foo(int x) { return x + 42; }
-  while (...) DoNotOptimize(foo(0)); // Optimized to DoNotOptimize(42);
-
-  /*  Example 2: Result of '<expr>' is only reused */
-  int bar(int) __attribute__((const));
-  while (...) DoNotOptimize(bar(0)); // Optimized to:
-  // int __result__ = bar(0);
-  // while (...) DoNotOptimize(__result__);
-```
-
-The second tool for preventing optimizations is `ClobberMemory()`. In essence
-`ClobberMemory()` forces the compiler to perform all pending writes to global
-memory. Memory managed by block scope objects must be "escaped" using
-`DoNotOptimize(...)` before it can be clobbered. In the below example
-`ClobberMemory()` prevents the call to `v.push_back(42)` from being optimized
-away.
-
-```c++
-static void BM_vector_push_back(benchmark::State& state) {
-  for (auto _ : state) {
-    std::vector<int> v;
-    v.reserve(1);
-    benchmark::DoNotOptimize(v.data()); // Allow v.data() to be clobbered.
-    v.push_back(42);
-    benchmark::ClobberMemory(); // Force 42 to be written to memory.
-  }
-}
-```
-
-Note that `ClobberMemory()` is only available for GNU or MSVC based compilers.
-
-<a name="reporting-statistics" />
-
-### Statistics: Reporting the Mean, Median and Standard Deviation of Repeated Benchmarks
-
-By default each benchmark is run once and that single result is reported.
-However benchmarks are often noisy and a single result may not be representative
-of the overall behavior. For this reason it's possible to repeatedly rerun the
-benchmark.
-
-The number of runs of each benchmark is specified globally by the
-`--benchmark_repetitions` flag or on a per benchmark basis by calling
-`Repetitions` on the registered benchmark object. When a benchmark is run more
-than once the mean, median and standard deviation of the runs will be reported.
-
-Additionally the `--benchmark_report_aggregates_only={true|false}`,
-`--benchmark_display_aggregates_only={true|false}` flags or
-`ReportAggregatesOnly(bool)`, `DisplayAggregatesOnly(bool)` functions can be
-used to change how repeated tests are reported. By default the result of each
-repeated run is reported. When the `report aggregates only` option is `true`,
-only the aggregates (i.e. mean, median and standard deviation, maybe complexity
-measurements if they were requested) of the runs are reported, to both the
-reporters - standard output (console), and the file.
-However when only the `display aggregates only` option is `true`,
-only the aggregates are displayed in the standard output, while the file
-output still contains everything.
-Calling `ReportAggregatesOnly(bool)` / `DisplayAggregatesOnly(bool)` on a
-registered benchmark object overrides the value of the appropriate flag for that
-benchmark.
-
-<a name="custom-statistics" />
-
-### Custom Statistics
-
-While having mean, median and standard deviation is nice, this may not be
-enough for everyone. For example you may want to know what the largest
-observation is, e.g. because you have some real-time constraints. This is easy.
-The following code will specify a custom statistic to be calculated, defined
-by a lambda function.
-
-```c++
-void BM_spin_empty(benchmark::State& state) {
-  for (auto _ : state) {
-    for (int x = 0; x < state.range(0); ++x) {
-      benchmark::DoNotOptimize(x);
-    }
-  }
-}
-
-BENCHMARK(BM_spin_empty)
-  ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
-    return *(std::max_element(std::begin(v), std::end(v)));
-  })
-  ->Arg(512);
-```
-
-<a name="using-register-benchmark" />
-
-### Using RegisterBenchmark(name, fn, args...)
-
-The `RegisterBenchmark(name, func, args...)` function provides an alternative
-way to create and register benchmarks.
-`RegisterBenchmark(name, func, args...)` creates, registers, and returns a
-pointer to a new benchmark with the specified `name` that invokes
-`func(st, args...)` where `st` is a `benchmark::State` object.
-
-Unlike the `BENCHMARK` registration macros, which can only be used at the global
-scope, the `RegisterBenchmark` can be called anywhere. This allows for
-benchmark tests to be registered programmatically.
-
-Additionally `RegisterBenchmark` allows any callable object to be registered
-as a benchmark. Including capturing lambdas and function objects.
-
-For example:
-```c++
-auto BM_test = [](benchmark::State& st, auto Inputs) { /* ... */ };
-
-int main(int argc, char** argv) {
-  for (auto& test_input : { /* ... */ })
-      benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input);
-  benchmark::Initialize(&argc, argv);
-  benchmark::RunSpecifiedBenchmarks();
-  benchmark::Shutdown();
-}
-```
-
-<a name="exiting-with-an-error" />
-
-### Exiting with an Error
-
-When errors caused by external influences, such as file I/O and network
-communication, occur within a benchmark the
-`State::SkipWithError(const char* msg)` function can be used to skip that run
-of benchmark and report the error. Note that only future iterations of the
-`KeepRunning()` are skipped. For the ranged-for version of the benchmark loop,
-users must explicitly exit the loop, otherwise all iterations will be performed.
-Users may explicitly return to exit the benchmark immediately.
-
-The `SkipWithError(...)` function may be used at any point within the benchmark,
-including before and after the benchmark loop. Moreover, if `SkipWithError(...)`
-has been used, it is not required to reach the benchmark loop and one may return
-from the benchmark function early.
-
-For example:
-
-```c++
-static void BM_test(benchmark::State& state) {
-  auto resource = GetResource();
-  if (!resource.good()) {
-    state.SkipWithError("Resource is not good!");
-    // KeepRunning() loop will not be entered.
-  }
-  while (state.KeepRunning()) {
-    auto data = resource.read_data();
-    if (!resource.good()) {
-      state.SkipWithError("Failed to read data!");
-      break; // Needed to skip the rest of the iteration.
-    }
-    do_stuff(data);
-  }
-}
-
-static void BM_test_ranged_fo(benchmark::State & state) {
-  auto resource = GetResource();
-  if (!resource.good()) {
-    state.SkipWithError("Resource is not good!");
-    return; // Early return is allowed when SkipWithError() has been used.
-  }
-  for (auto _ : state) {
-    auto data = resource.read_data();
-    if (!resource.good()) {
-      state.SkipWithError("Failed to read data!");
-      break; // REQUIRED to prevent all further iterations.
-    }
-    do_stuff(data);
-  }
-}
-```
-<a name="a-faster-keep-running-loop" />
-
-### A Faster KeepRunning Loop
-
-In C++11 mode, a range-based for loop should be used in preference to
-the `KeepRunning` loop for running the benchmarks. For example:
-
-```c++
-static void BM_Fast(benchmark::State &state) {
-  for (auto _ : state) {
-    FastOperation();
-  }
-}
-BENCHMARK(BM_Fast);
-```
-
-The reason the ranged-for loop is faster than using `KeepRunning`, is
-because `KeepRunning` requires a memory load and store of the iteration count
-every iteration, whereas the ranged-for variant is able to keep the iteration count
-in a register.
-
-For example, an empty inner loop using the range-based for method looks like:
-
-```asm
-# Loop Init
-  mov rbx, qword ptr [r14 + 104]
-  call benchmark::State::StartKeepRunning()
-  test rbx, rbx
-  je .LoopEnd
-.LoopHeader: # =>This Inner Loop Header: Depth=1
-  add rbx, -1
-  jne .LoopHeader
-.LoopEnd:
-```
-
-Compared to an empty `KeepRunning` loop, which looks like:
-
-```asm
-.LoopHeader: # in Loop: Header=BB0_3 Depth=1
-  cmp byte ptr [rbx], 1
-  jne .LoopInit
-.LoopBody: # =>This Inner Loop Header: Depth=1
-  mov rax, qword ptr [rbx + 8]
-  lea rcx, [rax + 1]
-  mov qword ptr [rbx + 8], rcx
-  cmp rax, qword ptr [rbx + 104]
-  jb .LoopHeader
-  jmp .LoopEnd
-.LoopInit:
-  mov rdi, rbx
-  call benchmark::State::StartKeepRunning()
-  jmp .LoopBody
-.LoopEnd:
-```
-
-Unless C++03 compatibility is required, the ranged-for variant of writing
-the benchmark loop should be preferred.
-
-<a name="disabling-cpu-frequency-scaling" />
-
-### Disabling CPU Frequency Scaling
-
-If you see this warning:
-
-```
-***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
-```
-
-you might want to disable the CPU frequency scaling while running the benchmark:
-
-```bash
-sudo cpupower frequency-set --governor performance
-./mybench
-sudo cpupower frequency-set --governor powersave
-```
diff --git a/third-party/benchmark/WORKSPACE b/third-party/benchmark/WORKSPACE
--- a/third-party/benchmark/WORKSPACE
+++ b/third-party/benchmark/WORKSPACE
@@ -1,13 +1,7 @@
 workspace(name = "com_github_google_benchmark")
 
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
-
-http_archive(
-    name = "rules_cc",
-    strip_prefix = "rules_cc-a508235df92e71d537fcbae0c7c952ea6957a912",
-    urls = ["https://github.com/bazelbuild/rules_cc/archive/a508235df92e71d537fcbae0c7c952ea6957a912.zip"],
-    sha256 = "d7dc12c1d5bc1a87474de8e3d17b7731a4dcebcfb8aa3990fe8ac7734ef12f2f",
-)
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
 
 http_archive(
     name = "com_google_absl",
@@ -16,11 +10,10 @@
     urls = ["https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz"],
 )
 
-http_archive(
+git_repository(
     name = "com_google_googletest",
-    strip_prefix = "googletest-3f0cf6b62ad1eb50d8736538363d3580dd640c3e",
-    urls = ["https://github.com/google/googletest/archive/3f0cf6b62ad1eb50d8736538363d3580dd640c3e.zip"],
-    sha256 = "8f827dd550db8b4fdf73904690df0be9fccc161017c9038a724bc9a0617a1bc8",
+    remote = "https://github.com/google/googletest.git",
+    tag = "release-1.11.0",
 )
 
 http_archive(
diff --git a/third-party/benchmark/bindings/python/google_benchmark/benchmark.cc b/third-party/benchmark/bindings/python/google_benchmark/benchmark.cc
--- a/third-party/benchmark/bindings/python/google_benchmark/benchmark.cc
+++ b/third-party/benchmark/bindings/python/google_benchmark/benchmark.cc
@@ -165,12 +165,12 @@
                     &State::SetComplexityN)
       .def_property("items_processed", &State::items_processed,
                     &State::SetItemsProcessed)
-      .def("set_label", (void (State::*)(const char*)) & State::SetLabel)
+      .def("set_label", (void(State::*)(const char*)) & State::SetLabel)
       .def("range", &State::range, py::arg("pos") = 0)
       .def_property_readonly("iterations", &State::iterations)
       .def_readwrite("counters", &State::counters)
-      .def_readonly("thread_index", &State::thread_index)
-      .def_readonly("threads", &State::threads);
+      .def_property_readonly("thread_index", &State::thread_index)
+      .def_property_readonly("threads", &State::threads);
 
   m.def("Initialize", Initialize);
   m.def("RegisterBenchmark", RegisterBenchmark,
diff --git a/third-party/benchmark/bindings/python/google_benchmark/example.py b/third-party/benchmark/bindings/python/google_benchmark/example.py
--- a/third-party/benchmark/bindings/python/google_benchmark/example.py
+++ b/third-party/benchmark/bindings/python/google_benchmark/example.py
@@ -102,7 +102,7 @@
 
 @benchmark.register(name="sum_million_microseconds")
 @benchmark.option.unit(benchmark.kMicrosecond)
-def with_options(state):
+def with_options2(state):
     while state:
         sum(range(1_000_000))
 
diff --git a/third-party/benchmark/cmake/Config.cmake.in b/third-party/benchmark/cmake/Config.cmake.in
--- a/third-party/benchmark/cmake/Config.cmake.in
+++ b/third-party/benchmark/cmake/Config.cmake.in
@@ -1 +1,7 @@
+@PACKAGE_INIT@
+
+include (CMakeFindDependencyMacro)
+
+find_dependency (Threads)
+
 include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
diff --git a/third-party/benchmark/cmake/GoogleTest.cmake b/third-party/benchmark/cmake/GoogleTest.cmake
--- a/third-party/benchmark/cmake/GoogleTest.cmake
+++ b/third-party/benchmark/cmake/GoogleTest.cmake
@@ -29,13 +29,20 @@
 
 include(${GOOGLETEST_PREFIX}/googletest-paths.cmake)
 
+# googletest doesn't seem to want to stay build warning clean so let's not hurt ourselves.
+add_compile_options(-w)
+
 # Add googletest directly to our build. This defines
 # the gtest and gtest_main targets.
 add_subdirectory(${GOOGLETEST_SOURCE_DIR}
                  ${GOOGLETEST_BINARY_DIR}
                  EXCLUDE_FROM_ALL)
 
-set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES>)
-set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES>)
-set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES>)
-set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES>)
+if(NOT DEFINED GTEST_COMPILE_COMMANDS)
+    set(GTEST_COMPILE_COMMANDS ON)
+endif()
+
+set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
diff --git a/third-party/benchmark/cmake/GoogleTest.cmake.in b/third-party/benchmark/cmake/GoogleTest.cmake.in
--- a/third-party/benchmark/cmake/GoogleTest.cmake.in
+++ b/third-party/benchmark/cmake/GoogleTest.cmake.in
@@ -31,13 +31,14 @@
   )
 else()
   if(NOT ALLOW_DOWNLOADING_GOOGLETEST)
-    message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
+    message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_USE_BUNDLED_GTEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
+    return()
   else()
     message(WARNING "Did not find Google Test sources! Fetching from web...")
     ExternalProject_Add(
       googletest
       GIT_REPOSITORY    https://github.com/google/googletest.git
-      GIT_TAG           master
+      GIT_TAG           "release-1.11.0"
       PREFIX            "${CMAKE_BINARY_DIR}"
       STAMP_DIR         "${CMAKE_BINARY_DIR}/stamp"
       DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
diff --git a/third-party/benchmark/cmake/Modules/FindLLVMAr.cmake b/third-party/benchmark/cmake/Modules/FindLLVMAr.cmake
new file mode 100644
--- /dev/null
+++ b/third-party/benchmark/cmake/Modules/FindLLVMAr.cmake
@@ -0,0 +1,16 @@
+include(FeatureSummary)
+
+find_program(LLVMAR_EXECUTABLE
+  NAMES llvm-ar
+  DOC "The llvm-ar executable"
+  )
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(LLVMAr
+  DEFAULT_MSG
+  LLVMAR_EXECUTABLE)
+
+SET_PACKAGE_PROPERTIES(LLVMAr PROPERTIES
+  URL https://llvm.org/docs/CommandGuide/llvm-ar.html
+  DESCRIPTION "create, modify, and extract from archives"
+)
diff --git a/third-party/benchmark/cmake/Modules/FindLLVMNm.cmake b/third-party/benchmark/cmake/Modules/FindLLVMNm.cmake
new file mode 100644
--- /dev/null
+++ b/third-party/benchmark/cmake/Modules/FindLLVMNm.cmake
@@ -0,0 +1,16 @@
+include(FeatureSummary)
+
+find_program(LLVMNM_EXECUTABLE
+  NAMES llvm-nm
+  DOC "The llvm-nm executable"
+  )
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(LLVMNm
+  DEFAULT_MSG
+  LLVMNM_EXECUTABLE)
+
+SET_PACKAGE_PROPERTIES(LLVMNm PROPERTIES
+  URL https://llvm.org/docs/CommandGuide/llvm-nm.html
+  DESCRIPTION "list LLVM bitcode and object file’s symbol table"
+)
diff --git a/third-party/benchmark/cmake/Modules/FindLLVMRanLib.cmake b/third-party/benchmark/cmake/Modules/FindLLVMRanLib.cmake
new file mode 100644
--- /dev/null
+++ b/third-party/benchmark/cmake/Modules/FindLLVMRanLib.cmake
@@ -0,0 +1,15 @@
+include(FeatureSummary)
+
+find_program(LLVMRANLIB_EXECUTABLE
+  NAMES llvm-ranlib
+  DOC "The llvm-ranlib executable"
+  )
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(LLVMRanLib
+  DEFAULT_MSG
+  LLVMRANLIB_EXECUTABLE)
+
+SET_PACKAGE_PROPERTIES(LLVMRanLib PROPERTIES
+  DESCRIPTION "generate index for LLVM archive"
+)
diff --git a/third-party/benchmark/cmake/Modules/FindPFM.cmake b/third-party/benchmark/cmake/Modules/FindPFM.cmake
new file mode 100644
--- /dev/null
+++ b/third-party/benchmark/cmake/Modules/FindPFM.cmake
@@ -0,0 +1,26 @@
+# If successful, the following variables will be defined:
+# HAVE_LIBPFM.
+# Set BENCHMARK_ENABLE_LIBPFM to 0 to disable, regardless of libpfm presence.
+include(CheckIncludeFile)
+include(CheckLibraryExists)
+include(FeatureSummary)
+enable_language(C)
+
+set_package_properties(PFM PROPERTIES
+                       URL http://perfmon2.sourceforge.net/
+                       DESCRIPTION "a helper library to develop monitoring tools"
+                       PURPOSE "Used to program specific performance monitoring events")
+
+check_library_exists(libpfm.a pfm_initialize "" HAVE_LIBPFM_INITIALIZE)
+if(HAVE_LIBPFM_INITIALIZE)
+  check_include_file(perfmon/perf_event.h HAVE_PERFMON_PERF_EVENT_H)
+  check_include_file(perfmon/pfmlib.h HAVE_PERFMON_PFMLIB_H)
+  check_include_file(perfmon/pfmlib_perf_event.h HAVE_PERFMON_PFMLIB_PERF_EVENT_H)
+  if(HAVE_PERFMON_PERF_EVENT_H AND HAVE_PERFMON_PFMLIB_H AND HAVE_PERFMON_PFMLIB_PERF_EVENT_H)
+    message("Using Perf Counters.")
+    set(HAVE_LIBPFM 1)
+    set(PFM_FOUND 1)
+  endif()
+else()
+  message("Perf Counters support requested, but was unable to find libpfm.")
+endif()
diff --git a/third-party/benchmark/docs/_config.yml b/third-party/benchmark/docs/_config.yml
--- a/third-party/benchmark/docs/_config.yml
+++ b/third-party/benchmark/docs/_config.yml
@@ -1 +1 @@
-theme: jekyll-theme-hacker
\ No newline at end of file
+theme: jekyll-theme-minimal
\ No newline at end of file
diff --git a/third-party/benchmark/dependencies.md b/third-party/benchmark/docs/dependencies.md
rename from third-party/benchmark/dependencies.md
rename to third-party/benchmark/docs/dependencies.md
--- a/third-party/benchmark/dependencies.md
+++ b/third-party/benchmark/docs/dependencies.md
@@ -3,16 +3,17 @@
 To ensure the broadest compatibility when building the benchmark library, but
 still allow forward progress, we require any build tooling to be available for:
 
-* Debian stable AND
-* The last two Ubuntu LTS releases AND
+* Debian stable _and_
+* The last two Ubuntu LTS releases
 
 Currently, this means using build tool versions that are available for Ubuntu
-16.04 (Xenial), Ubuntu 18.04 (Bionic), and Debian stretch.
+18.04 (Bionic Beaver), Ubuntu 20.04 (Focal Fossa), and Debian 11 (bullseye).
 
-_Note, [travis](.travis.yml) runs under Ubuntu 14.04 (Trusty) for linux builds._
+_Note, CI also runs ubuntu-16.04 and ubuntu-14.04 to ensure best effort support
+for older versions._
 
 ## cmake
 The current supported version is cmake 3.5.1 as of 2018-06-06.
 
-_Note, this version is also available for Ubuntu 14.04, the previous Ubuntu LTS
+_Note, this version is also available for Ubuntu 14.04, an older Ubuntu LTS
 release, as `cmake3`._
diff --git a/third-party/benchmark/docs/index.md b/third-party/benchmark/docs/index.md
new file mode 100644
--- /dev/null
+++ b/third-party/benchmark/docs/index.md
@@ -0,0 +1,10 @@
+# Benchmark
+
+* [Assembly Tests](AssemblyTests.md)
+* [Dependencies](dependencies.md)
+* [Perf Counters](perf_counters.md)
+* [Platform Specific Build Instructions](platform_specific_build_instructions.md)
+* [Random Interleaving](random_interleaving.md)
+* [Releasing](releasing.md)
+* [Tools](tools.md)
+* [User Guide](user_guide.md)
\ No newline at end of file
diff --git a/third-party/benchmark/docs/platform_specific_build_instructions.md b/third-party/benchmark/docs/platform_specific_build_instructions.md
new file mode 100644
--- /dev/null
+++ b/third-party/benchmark/docs/platform_specific_build_instructions.md
@@ -0,0 +1,48 @@
+# Platform Specific Build Instructions
+
+## Building with GCC
+
+When the library is built using GCC it is necessary to link with the pthread
+library due to how GCC implements `std::thread`. Failing to link to pthread will
+lead to runtime exceptions (unless you're using libc++), not linker errors. See
+[issue #67](https://github.com/google/benchmark/issues/67) for more details. You
+can link to pthread by adding `-pthread` to your linker command. Note, you can
+also use `-lpthread`, but there are potential issues with ordering of command
+line parameters if you use that.
+
+On QNX, the pthread library is part of libc and usually included automatically
+(see
+[`pthread_create()`](https://www.qnx.com/developers/docs/7.1/index.html#com.qnx.doc.neutrino.lib_ref/topic/p/pthread_create.html)).
+There's no separate pthread library to link.
+
+## Building with Visual Studio 2015 or 2017
+
+The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following:
+
+```
+// Alternatively, can add libraries using linker options.
+#ifdef _WIN32
+#pragma comment ( lib, "Shlwapi.lib" )
+#ifdef _DEBUG
+#pragma comment ( lib, "benchmarkd.lib" )
+#else
+#pragma comment ( lib, "benchmark.lib" )
+#endif
+#endif
+```
+
+Can also use the graphical version of CMake:
+* Open `CMake GUI`.
+* Under `Where to build the binaries`, same path as source plus `build`.
+* Under `CMAKE_INSTALL_PREFIX`, same path as source plus `install`.
+* Click `Configure`, `Generate`, `Open Project`.
+* If build fails, try deleting entire directory and starting again, or unticking options to build less.
+
+## Building with Intel 2015 Update 1 or Intel System Studio Update 4
+
+See instructions for building with Visual Studio. Once built, right click on the solution and change the build to Intel.
+
+## Building on Solaris
+
+If you're running benchmarks on Solaris, you'll want the kstat library linked in
+too (`-lkstat`).
\ No newline at end of file
diff --git a/third-party/benchmark/docs/releasing.md b/third-party/benchmark/docs/releasing.md
--- a/third-party/benchmark/docs/releasing.md
+++ b/third-party/benchmark/docs/releasing.md
@@ -8,10 +8,23 @@
     * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
       commits between the last annotated tag and HEAD
     * Pick the most interesting.
-* Create one last commit that updates the version saved in `CMakeLists.txt` to the release version you're creating. (This version will be used if benchmark is installed from the archive you'll be creating in the next step.)
+* Create one last commit that updates the version saved in `CMakeLists.txt` and the
+  `__version__` variable in `bindings/python/google_benchmark/__init__.py` to the release
+  version you're creating. (This version will be used if benchmark is installed from the
+  archive you'll be creating in the next step.)
 
 ```
-project (benchmark VERSION 1.5.3 LANGUAGES CXX)
+project (benchmark VERSION 1.6.0 LANGUAGES CXX)
+```
+
+```python
+# bindings/python/google_benchmark/__init__.py
+
+# ...
+
+__version__ = "1.6.0"  # <-- change this to the release version you are creating
+
+# ...
 ```
 
 * Create a release through github's interface
@@ -19,4 +32,4 @@
     * Update this to an annotated tag:
       * `git pull --tags`
       * `git tag -a -f <tag> <tag>`
-      * `git push --force origin`
+      * `git push --force --tags origin`
diff --git a/third-party/benchmark/README.md b/third-party/benchmark/docs/user_guide.md
copy from third-party/benchmark/README.md
copy to third-party/benchmark/docs/user_guide.md
--- a/third-party/benchmark/README.md
+++ b/third-party/benchmark/docs/user_guide.md
@@ -1,267 +1,6 @@
-# Benchmark
+# User Guide
 
-[![build-and-test](https://github.com/google/benchmark/workflows/build-and-test/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Abuild-and-test)
-[![bazel](https://github.com/google/benchmark/actions/workflows/bazel.yml/badge.svg)](https://github.com/google/benchmark/actions/workflows/bazel.yml)
-[![pylint](https://github.com/google/benchmark/workflows/pylint/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Apylint)
-[![test-bindings](https://github.com/google/benchmark/workflows/test-bindings/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Atest-bindings)
-
-[![Build Status](https://travis-ci.org/google/benchmark.svg?branch=master)](https://travis-ci.org/google/benchmark)
-[![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark)
-
-
-A library to benchmark code snippets, similar to unit tests. Example:
-
-```c++
-#include <benchmark/benchmark.h>
-
-static void BM_SomeFunction(benchmark::State& state) {
-  // Perform setup here
-  for (auto _ : state) {
-    // This code gets timed
-    SomeFunction();
-  }
-}
-// Register the function as a benchmark
-BENCHMARK(BM_SomeFunction);
-// Run the benchmark
-BENCHMARK_MAIN();
-```
-
-To get started, see [Requirements](#requirements) and
-[Installation](#installation). See [Usage](#usage) for a full example and the
-[User Guide](#user-guide) for a more comprehensive feature overview.
-
-It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/master/docs/primer.md)
-as some of the structural aspects of the APIs are similar.
-
-### Resources
-
-[Discussion group](https://groups.google.com/d/forum/benchmark-discuss)
-
-IRC channels:
-* [libera](https://libera.chat) #benchmark
-
-[Additional Tooling Documentation](docs/tools.md)
-
-[Assembly Testing Documentation](docs/AssemblyTests.md)
-
-## Requirements
-
-The library can be used with C++03. However, it requires C++11 to build,
-including compiler and standard library support.
-
-The following minimum versions are required to build the library:
-
-* GCC 4.8
-* Clang 3.4
-* Visual Studio 14 2015
-* Intel 2015 Update 1
-
-See [Platform-Specific Build Instructions](#platform-specific-build-instructions).
-
-## Installation
-
-This describes the installation process using cmake. As pre-requisites, you'll
-need git and cmake installed.
-
-_See [dependencies.md](dependencies.md) for more details regarding supported
-versions of build tools._
-
-```bash
-# Check out the library.
-$ git clone https://github.com/google/benchmark.git
-# Benchmark requires Google Test as a dependency. Add the source tree as a subdirectory.
-$ git clone https://github.com/google/googletest.git benchmark/googletest
-# Go to the library root directory
-$ cd benchmark
-# Make a build directory to place the build output.
-$ cmake -E make_directory "build"
-# Generate build system files with cmake.
-$ cmake -E chdir "build" cmake -DCMAKE_BUILD_TYPE=Release ../
-# or, starting with CMake 3.13, use a simpler form:
-# cmake -DCMAKE_BUILD_TYPE=Release -S . -B "build"
-# Build the library.
-$ cmake --build "build" --config Release
-```
-This builds the `benchmark` and `benchmark_main` libraries and tests.
-On a unix system, the build directory should now look something like this:
-
-```
-/benchmark
-  /build
-    /src
-      /libbenchmark.a
-      /libbenchmark_main.a
-    /test
-      ...
-```
-
-Next, you can run the tests to check the build.
-
-```bash
-$ cmake -E chdir "build" ctest --build-config Release
-```
-
-If you want to install the library globally, also run:
-
-```
-sudo cmake --build "build" --config Release --target install
-```
-
-Note that Google Benchmark requires Google Test to build and run the tests. This
-dependency can be provided two ways:
-
-* Checkout the Google Test sources into `benchmark/googletest` as above.
-* Otherwise, if `-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON` is specified during
-  configuration, the library will automatically download and build any required
-  dependencies.
-
-If you do not wish to build and run the tests, add `-DBENCHMARK_ENABLE_GTEST_TESTS=OFF`
-to `CMAKE_ARGS`.
-
-### Debug vs Release
-
-By default, benchmark builds as a debug library. You will see a warning in the
-output when this is the case. To build it as a release library instead, add
-`-DCMAKE_BUILD_TYPE=Release` when generating the build system files, as shown
-above. The use of `--config Release` in build commands is needed to properly
-support multi-configuration tools (like Visual Studio for example) and can be
-skipped for other build systems (like Makefile).
-
-To enable link-time optimisation, also add `-DBENCHMARK_ENABLE_LTO=true` when
-generating the build system files.
-
-If you are using gcc, you might need to set `GCC_AR` and `GCC_RANLIB` cmake
-cache variables, if autodetection fails.
-
-If you are using clang, you may need to set `LLVMAR_EXECUTABLE`,
-`LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables.
-
-### Stable and Experimental Library Versions
-
-The main branch contains the latest stable version of the benchmarking library;
-the API of which can be considered largely stable, with source breaking changes
-being made only upon the release of a new major version.
-
-Newer, experimental, features are implemented and tested on the
-[`v2` branch](https://github.com/google/benchmark/tree/v2). Users who wish
-to use, test, and provide feedback on the new features are encouraged to try
-this branch. However, this branch provides no stability guarantees and reserves
-the right to change and break the API at any time.
-
-## Usage
-
-### Basic usage
-
-Define a function that executes the code to measure, register it as a benchmark
-function using the `BENCHMARK` macro, and ensure an appropriate `main` function
-is available:
-
-```c++
-#include <benchmark/benchmark.h>
-
-static void BM_StringCreation(benchmark::State& state) {
-  for (auto _ : state)
-    std::string empty_string;
-}
-// Register the function as a benchmark
-BENCHMARK(BM_StringCreation);
-
-// Define another benchmark
-static void BM_StringCopy(benchmark::State& state) {
-  std::string x = "hello";
-  for (auto _ : state)
-    std::string copy(x);
-}
-BENCHMARK(BM_StringCopy);
-
-BENCHMARK_MAIN();
-```
-
-To run the benchmark, compile and link against the `benchmark` library
-(libbenchmark.a/.so). If you followed the build steps above, this library will 
-be under the build directory you created.
-
-```bash
-# Example on linux after running the build steps above. Assumes the
-# `benchmark` and `build` directories are under the current directory.
-$ g++ mybenchmark.cc -std=c++11 -isystem benchmark/include \
-  -Lbenchmark/build/src -lbenchmark -lpthread -o mybenchmark
-```
-
-Alternatively, link against the `benchmark_main` library and remove
-`BENCHMARK_MAIN();` above to get the same behavior.
-
-The compiled executable will run all benchmarks by default. Pass the `--help`
-flag for option information or see the guide below.
-
-### Usage with CMake
-
-If using CMake, it is recommended to link against the project-provided
-`benchmark::benchmark` and `benchmark::benchmark_main` targets using
-`target_link_libraries`.
-It is possible to use ```find_package``` to import an installed version of the
-library.
-```cmake
-find_package(benchmark REQUIRED)
-```
-Alternatively, ```add_subdirectory``` will incorporate the library directly in
-to one's CMake project.
-```cmake
-add_subdirectory(benchmark)
-```
-Either way, link to the library as follows.
-```cmake
-target_link_libraries(MyTarget benchmark::benchmark)
-```
-
-## Platform Specific Build Instructions
-
-### Building with GCC
-
-When the library is built using GCC it is necessary to link with the pthread
-library due to how GCC implements `std::thread`. Failing to link to pthread will
-lead to runtime exceptions (unless you're using libc++), not linker errors. See
-[issue #67](https://github.com/google/benchmark/issues/67) for more details. You
-can link to pthread by adding `-pthread` to your linker command. Note, you can
-also use `-lpthread`, but there are potential issues with ordering of command
-line parameters if you use that.
-
-### Building with Visual Studio 2015 or 2017
-
-The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following:
-
-```
-// Alternatively, can add libraries using linker options.
-#ifdef _WIN32
-#pragma comment ( lib, "Shlwapi.lib" )
-#ifdef _DEBUG
-#pragma comment ( lib, "benchmarkd.lib" )
-#else
-#pragma comment ( lib, "benchmark.lib" )
-#endif
-#endif
-```
-
-Can also use the graphical version of CMake:
-* Open `CMake GUI`.
-* Under `Where to build the binaries`, same path as source plus `build`.
-* Under `CMAKE_INSTALL_PREFIX`, same path as source plus `install`.
-* Click `Configure`, `Generate`, `Open Project`.
-* If build fails, try deleting entire directory and starting again, or unticking options to build less.
-
-### Building with Intel 2015 Update 1 or Intel System Studio Update 4
-
-See instructions for building with Visual Studio. Once built, right click on the solution and change the build to Intel.
-
-### Building on Solaris
-
-If you're running benchmarks on solaris, you'll want the kstat library linked in
-too (`-lkstat`).
-
-## User Guide
-
-### Command Line
+## Command Line
 
 [Output Formats](#output-formats)
 
@@ -275,10 +14,12 @@
 
 [Extra Context](#extra-context)
 
-### Library
+## Library
 
 [Runtime and Reporting Considerations](#runtime-and-reporting-considerations)
 
+[Setup/Teardown](#setup-teardown)
+
 [Passing Arguments](#passing-arguments)
 
 [Custom Benchmark Name](#custom-benchmark-name)
@@ -299,9 +40,9 @@
 
 [Setting the Time Unit](#setting-the-time-unit)
 
-[Random Interleaving](docs/random_interleaving.md)
+[Random Interleaving](random_interleaving.md)
 
-[User-Requested Performance Counters](docs/perf_counters.md)
+[User-Requested Performance Counters](perf_counters.md)
 
 [Preventing Optimization](#preventing-optimization)
 
@@ -320,7 +61,7 @@
 
 <a name="output-formats" />
 
-### Output Formats
+## Output Formats
 
 The library supports multiple output formats. Use the
 `--benchmark_format=<console|json|csv>` flag (or set the
@@ -395,21 +136,21 @@
 
 <a name="output-files" />
 
-### Output Files
+## Output Files
 
 Write benchmark results to a file with the `--benchmark_out=<filename>` option
 (or set `BENCHMARK_OUT`). Specify the output format with
 `--benchmark_out_format={json|console|csv}` (or set
 `BENCHMARK_OUT_FORMAT={json|console|csv}`). Note that the 'csv' reporter is
-deprecated and the saved `.csv` file 
-[is not parsable](https://github.com/google/benchmark/issues/794) by csv 
+deprecated and the saved `.csv` file
+[is not parsable](https://github.com/google/benchmark/issues/794) by csv
 parsers.
 
 Specifying `--benchmark_out` does not suppress the console output.
 
 <a name="running-benchmarks" />
 
-### Running Benchmarks
+## Running Benchmarks
 
 Benchmarks are executed by running the produced binaries. Benchmarks binaries,
 by default, accept options that may be specified either through their command
@@ -421,7 +162,7 @@
 
 <a name="running-a-subset-of-benchmarks" />
 
-### Running a Subset of Benchmarks
+## Running a Subset of Benchmarks
 
 The `--benchmark_filter=<regex>` option (or `BENCHMARK_FILTER=<regex>`
 environment variable) can be used to only run the benchmarks that match
@@ -441,14 +182,14 @@
 
 <a name="result-comparison" />
 
-### Result comparison
+## Result comparison
 
 It is possible to compare the benchmarking results.
-See [Additional Tooling Documentation](docs/tools.md)
+See [Additional Tooling Documentation](tools.md)
 
 <a name="extra-context" />
 
-### Extra Context
+## Extra Context
 
 Sometimes it's useful to add extra context to the content printed before the
 results. By default this section includes information about the CPU on which
@@ -476,7 +217,7 @@
 
 <a name="runtime-and-reporting-considerations" />
 
-### Runtime and Reporting Considerations
+## Runtime and Reporting Considerations
 
 When the benchmark binary is executed, each benchmark function is run serially.
 The number of iterations to run is determined dynamically by running the
@@ -499,9 +240,41 @@
 As well as the per-benchmark entries, a preamble in the report will include
 information about the machine on which the benchmarks are run.
 
+<a name="setup-teardown" />
+
+## Setup/Teardown
+
+Global setup/teardown specific to each benchmark can be done by
+passing a callback to `Setup`/`Teardown`.
+
+The setup/teardown callbacks will be invoked once for each benchmark.
+If the benchmark is multi-threaded (will run in k threads), they will be invoked exactly once before
+each run with k threads.
+If the benchmark uses different size groups of threads, the above will be true for each size group.
+
+For example:
+
+```c++
+static void DoSetup(const benchmark::State& state) {
+}
+
+static void DoTeardown(const benchmark::State& state) {
+}
+
+static void BM_func(benchmark::State& state) {...}
+
+BENCHMARK(BM_func)->Arg(1)->Arg(3)->Threads(16)->Threads(32)->Setup(DoSetup)->Teardown(DoTeardown);
+
+```
+
+In this example, `DoSetup` and `DoTeardown` will be invoked 4 times each,
+specifically, once for each member of this family:
+ - BM_func_Arg_1_Threads_16, BM_func_Arg_1_Threads_32
+ - BM_func_Arg_3_Threads_16, BM_func_Arg_3_Threads_32
+
 <a name="passing-arguments" />
 
-### Passing Arguments
+## Passing Arguments
 
 Sometimes a family of benchmarks can be implemented with just one routine that
 takes an extra argument to specify which one of the family of benchmarks to
@@ -589,14 +362,17 @@
 product of the two specified ranges and will generate a benchmark for each such
 pair.
 
+{% raw %}
 ```c++
 BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}});
 ```
+{% endraw %}
 
 Some benchmarks may require specific argument values that cannot be expressed
 with `Ranges`. In this case, `ArgsProduct` offers the ability to generate a
 benchmark input for each combination in the product of the supplied vectors.
 
+{% raw %}
 ```c++
 BENCHMARK(BM_SetInsert)
     ->ArgsProduct({{1<<10, 3<<10, 8<<10}, {20, 40, 60, 80}})
@@ -615,6 +391,24 @@
     ->Args({3<<10, 80})
     ->Args({8<<10, 80});
 ```
+{% endraw %}
+
+For the most common scenarios, helper methods for creating a list of
+integers for a given sparse or dense range are provided.
+
+```c++
+BENCHMARK(BM_SetInsert)
+    ->ArgsProduct({
+      benchmark::CreateRange(8, 128, /*multi=*/2),
+      benchmark::CreateDenseRange(1, 4, /*step=*/1)
+    })
+// would generate the same benchmark arguments as
+BENCHMARK(BM_SetInsert)
+    ->ArgsProduct({
+      {8, 16, 32, 64, 128},
+      {1, 2, 3, 4}
+    });
+```
 
 For more complex patterns of inputs, passing a custom function to `Apply` allows
 programmatic specification of an arbitrary set of arguments on which to run the
@@ -630,7 +424,7 @@
 BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
 ```
 
-#### Passing Arbitrary Arguments to a Benchmark
+### Passing Arbitrary Arguments to a Benchmark
 
 In C++11 it is possible to define a benchmark that takes an arbitrary number
 of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)`
@@ -654,7 +448,7 @@
 
 <a name="asymptotic-complexity" />
 
-### Calculating Asymptotic Complexity (Big O)
+## Calculating Asymptotic Complexity (Big O)
 
 Asymptotic complexity might be calculated for a family of benchmarks. The
 following code will calculate the coefficient for the high-order term in the
@@ -691,7 +485,7 @@
 
 <a name="custom-benchmark-name" />
 
-### Custom Benchmark Name
+## Custom Benchmark Name
 
 You can change the benchmark's name as follows:
 
@@ -704,7 +498,7 @@
 
 <a name="templated-benchmarks" />
 
-### Templated Benchmarks
+## Templated Benchmarks
 
 This example produces and consumes messages of size `sizeof(v)` `range_x`
 times. It also outputs throughput in the absence of multiprogramming.
@@ -723,14 +517,19 @@
   state.SetBytesProcessed(
       static_cast<int64_t>(state.iterations())*state.range(0));
 }
+// C++03
 BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
+
+// C++11 or newer, you can use the BENCHMARK macro with template parameters:
+BENCHMARK(BM_Sequential<WaitQueue<int>>)->Range(1<<0, 1<<10);
+
 ```
 
 Three macros are provided for adding benchmark templates.
 
 ```c++
 #ifdef BENCHMARK_HAS_CXX11
-#define BENCHMARK_TEMPLATE(func, ...) // Takes any number of parameters.
+#define BENCHMARK(func<...>) // Takes any number of parameters.
 #else // C++ < C++11
 #define BENCHMARK_TEMPLATE(func, arg1)
 #endif
@@ -740,7 +539,7 @@
 
 <a name="fixtures" />
 
-### Fixtures
+## Fixtures
 
 Fixture tests are created by first defining a type that derives from
 `::benchmark::Fixture` and then creating/registering the tests using the
@@ -778,7 +577,7 @@
 /* BarTest is now registered */
 ```
 
-#### Templated Fixtures
+### Templated Fixtures
 
 Also you can create templated fixture by using the following macros:
 
@@ -808,7 +607,7 @@
 
 <a name="custom-counters" />
 
-### Custom Counters
+## Custom Counters
 
 You can add your own counters with user-defined names. The example below
 will add columns "Foo", "Bar" and "Baz" in its output:
@@ -869,6 +668,7 @@
 When you're compiling in C++11 mode or later you can use `insert()` with
 `std::initializer_list`:
 
+{% raw %}
 ```c++
   // With C++11, this can be done:
   state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
@@ -877,8 +677,9 @@
   state.counters["Bar"] = numBars;
   state.counters["Baz"] = numBazs;
 ```
+{% endraw %}
 
-#### Counter Reporting
+### Counter Reporting
 
 When using the console reporter, by default, user counters are printed at
 the end after the table, the same way as ``bytes_processed`` and
@@ -948,7 +749,7 @@
 
 <a name="multithreaded-benchmarks"/>
 
-### Multithreaded Benchmarks
+## Multithreaded Benchmarks
 
 In a multithreaded test (benchmark invoked by multiple threads simultaneously),
 it is guaranteed that none of the threads will start until all have reached
@@ -959,13 +760,13 @@
 
 ```c++
 static void BM_MultiThreaded(benchmark::State& state) {
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     // Setup code here.
   }
   for (auto _ : state) {
     // Run the test as normal.
   }
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     // Teardown code here.
   }
 }
@@ -984,7 +785,7 @@
 
 <a name="cpu-timers" />
 
-### CPU Timers
+## CPU Timers
 
 By default, the CPU timer only measures the time spent by the main thread.
 If the benchmark itself uses threads internally, this measurement may not
@@ -1028,13 +829,14 @@
 BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime()->UseRealTime();
 ```
 
-#### Controlling Timers
+### Controlling Timers
 
 Normally, the entire duration of the work loop (`for (auto _ : state) {}`)
 is measured. But sometimes, it is necessary to do some work inside of
 that loop, every iteration, but without counting that time to the benchmark time.
 That is possible, although it is not recommended, since it has high overhead.
 
+{% raw %}
 ```c++
 static void BM_SetInsert_With_Timer_Control(benchmark::State& state) {
   std::set<int> data;
@@ -1049,10 +851,11 @@
 }
 BENCHMARK(BM_SetInsert_With_Timer_Control)->Ranges({{1<<10, 8<<10}, {128, 512}});
 ```
+{% endraw %}
 
 <a name="manual-timing" />
 
-### Manual Timing
+## Manual Timing
 
 For benchmarking something for which neither CPU time nor real-time are
 correct or accurate enough, completely manual timing is supported using
@@ -1093,7 +896,7 @@
 
 <a name="setting-the-time-unit" />
 
-### Setting the Time Unit
+## Setting the Time Unit
 
 If a benchmark runs a few milliseconds it may be hard to visually compare the
 measured times, since the output data is given in nanoseconds per default. In
@@ -1105,7 +908,7 @@
 
 <a name="preventing-optimization" />
 
-### Preventing Optimization
+## Preventing Optimization
 
 To prevent a value or expression from being optimized away by the compiler
 the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()`
@@ -1166,7 +969,7 @@
 
 <a name="reporting-statistics" />
 
-### Statistics: Reporting the Mean, Median and Standard Deviation of Repeated Benchmarks
+## Statistics: Reporting the Mean, Median and Standard Deviation / Coefficient of variation of Repeated Benchmarks
 
 By default each benchmark is run once and that single result is reported.
 However benchmarks are often noisy and a single result may not be representative
@@ -1176,16 +979,17 @@
 The number of runs of each benchmark is specified globally by the
 `--benchmark_repetitions` flag or on a per benchmark basis by calling
 `Repetitions` on the registered benchmark object. When a benchmark is run more
-than once the mean, median and standard deviation of the runs will be reported.
+than once the mean, median, standard deviation and coefficient of variation
+of the runs will be reported.
 
 Additionally the `--benchmark_report_aggregates_only={true|false}`,
 `--benchmark_display_aggregates_only={true|false}` flags or
 `ReportAggregatesOnly(bool)`, `DisplayAggregatesOnly(bool)` functions can be
 used to change how repeated tests are reported. By default the result of each
 repeated run is reported. When `report aggregates only` option is `true`,
-only the aggregates (i.e. mean, median and standard deviation, maybe complexity
-measurements if they were requested) of the runs is reported, to both the
-reporters - standard output (console), and the file.
+only the aggregates (i.e. mean, median, standard deviation and coefficient
+of variation, maybe complexity measurements if they were requested) of the runs
+are reported, to both the reporters - standard output (console), and the file.
 However when only the `display aggregates only` option is `true`,
 only the aggregates are displayed in the standard output, while the file
 output still contains everything.
@@ -1195,13 +999,12 @@
 
 <a name="custom-statistics" />
 
-### Custom Statistics
+## Custom Statistics
 
-While having mean, median and standard deviation is nice, this may not be
-enough for everyone. For example you may want to know what the largest
-observation is, e.g. because you have some real-time constraints. This is easy.
-The following code will specify a custom statistic to be calculated, defined
-by a lambda function.
+While having these aggregates is nice, this may not be enough for everyone.
+For example you may want to know what the largest observation is, e.g. because
+you have some real-time constraints. This is easy. The following code will
+specify a custom statistic to be calculated, defined by a lambda function.
 
 ```c++
 void BM_spin_empty(benchmark::State& state) {
@@ -1219,9 +1022,28 @@
   ->Arg(512);
 ```
 
+While usually the statistics produce values in time units,
+you can also produce percentages:
+
+```c++
+void BM_spin_empty(benchmark::State& state) {
+  for (auto _ : state) {
+    for (int x = 0; x < state.range(0); ++x) {
+      benchmark::DoNotOptimize(x);
+    }
+  }
+}
+
+BENCHMARK(BM_spin_empty)
+  ->ComputeStatistics("ratio", [](const std::vector<double>& v) -> double {
+    return v.front() / v.back();  // ratio of the first to the last value in v
+  }, benchmark::StatisticUnit::kPercentage)
+  ->Arg(512);
+```
+
 <a name="using-register-benchmark" />
 
-### Using RegisterBenchmark(name, fn, args...)
+## Using RegisterBenchmark(name, fn, args...)
 
 The `RegisterBenchmark(name, func, args...)` function provides an alternative
 way to create and register benchmarks.
@@ -1251,7 +1073,7 @@
 
 <a name="exiting-with-an-error" />
 
-### Exiting with an Error
+## Exiting with an Error
 
 When errors caused by external influences, such as file I/O and network
 communication, occur within a benchmark the
@@ -1303,7 +1125,7 @@
 ```
 <a name="a-faster-keep-running-loop" />
 
-### A Faster KeepRunning Loop
+## A Faster KeepRunning Loop
 
 In C++11 mode, a ranged-based for loop should be used in preference to
 the `KeepRunning` loop for running the benchmarks. For example:
@@ -1361,7 +1183,7 @@
 
 <a name="disabling-cpu-frequency-scaling" />
 
-### Disabling CPU Frequency Scaling
+## Disabling CPU Frequency Scaling
 
 If you see this error:
 
diff --git a/third-party/benchmark/include/benchmark/benchmark.h b/third-party/benchmark/include/benchmark/benchmark.h
--- a/third-party/benchmark/include/benchmark/benchmark.h
+++ b/third-party/benchmark/include/benchmark/benchmark.h
@@ -34,7 +34,7 @@
 BENCHMARK(BM_StringCopy);
 
 // Augment the main() program to invoke benchmarks if specified
-// via the --benchmarks command line flag.  E.g.,
+// via the --benchmark_filter command line flag.  E.g.,
 //       my_unittest --benchmark_filter=all
 //       my_unittest --benchmark_filter=BM_StringCreation
 //       my_unittest --benchmark_filter=String
@@ -140,13 +140,13 @@
 do can be wrapped in a check against the thread index:
 
 static void BM_MultiThreaded(benchmark::State& state) {
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     // Setup code here.
   }
   for (auto _ : state) {
     // Run the test as normal.
   }
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     // Teardown code here.
   }
 }
@@ -180,6 +180,7 @@
 #include <cassert>
 #include <cstddef>
 #include <iosfwd>
+#include <limits>
 #include <map>
 #include <set>
 #include <string>
@@ -187,6 +188,7 @@
 #include <vector>
 
 #if defined(BENCHMARK_HAS_CXX11)
+#include <atomic>
 #include <initializer_list>
 #include <type_traits>
 #include <utility>
@@ -237,16 +239,24 @@
 #define BENCHMARK_INTERNAL_TOSTRING2(x) #x
 #define BENCHMARK_INTERNAL_TOSTRING(x) BENCHMARK_INTERNAL_TOSTRING2(x)
 
+// clang-format off
 #if defined(__GNUC__) || defined(__clang__)
 #define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
 #define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
+#define BENCHMARK_DISABLE_DEPRECATED_WARNING \
+  _Pragma("GCC diagnostic push")             \
+  _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+#define BENCHMARK_RESTORE_DEPRECATED_WARNING _Pragma("GCC diagnostic pop")
 #else
 #define BENCHMARK_BUILTIN_EXPECT(x, y) x
 #define BENCHMARK_DEPRECATED_MSG(msg)
 #define BENCHMARK_WARNING_MSG(msg)                           \
   __pragma(message(__FILE__ "(" BENCHMARK_INTERNAL_TOSTRING( \
       __LINE__) ") : warning note: " msg))
+#define BENCHMARK_DISABLE_DEPRECATED_WARNING
+#define BENCHMARK_RESTORE_DEPRECATED_WARNING
 #endif
+// clang-format on
 
 #if defined(__GNUC__) && !defined(__clang__)
 #define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
@@ -272,7 +282,6 @@
 
 namespace benchmark {
 class BenchmarkReporter;
-class MemoryManager;
 
 void Initialize(int* argc, char** argv);
 void Shutdown();
@@ -281,11 +290,18 @@
 // Returns true there is at least on unrecognized argument (i.e. 'argc' > 1).
 bool ReportUnrecognizedArguments(int argc, char** argv);
 
+// Returns the current value of --benchmark_filter.
+std::string GetBenchmarkFilter();
+
 // Generate a list of benchmarks matching the specified --benchmark_filter flag
 // and if --benchmark_list_tests is specified return after printing the name
 // of each matching benchmark. Otherwise run each matching benchmark and
 // report the results.
 //
+// spec : Specify the benchmarks to run. If users do not specify this arg,
+//        then the value of FLAGS_benchmark_filter
+//        will be used.
+//
 // The second and third overload use the specified 'display_reporter' and
 //  'file_reporter' respectively. 'file_reporter' will write to the file
 //  specified
@@ -294,9 +310,62 @@
 //
 // RETURNS: The number of matching benchmarks.
 size_t RunSpecifiedBenchmarks();
+size_t RunSpecifiedBenchmarks(std::string spec);
+
 size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter);
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
+                              std::string spec);
+
 size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
                               BenchmarkReporter* file_reporter);
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
+                              BenchmarkReporter* file_reporter,
+                              std::string spec);
+
+// If a MemoryManager is registered (via RegisterMemoryManager()),
+// it can be used to collect and report allocation metrics for a run of the
+// benchmark.
+class MemoryManager {
+ public:
+  static const int64_t TombstoneValue;  // Marks a metric as not available.
+
+  struct Result {
+    Result()
+        : num_allocs(0),
+          max_bytes_used(0),
+          total_allocated_bytes(TombstoneValue),
+          net_heap_growth(TombstoneValue) {}
+
+    // The number of allocations made in total between Start and Stop.
+    int64_t num_allocs;
+
+    // The peak memory use between Start and Stop.
+    int64_t max_bytes_used;
+
+    // The total memory allocated, in bytes, between Start and Stop.
+    // Initialized to TombstoneValue if the metric is not available.
+    int64_t total_allocated_bytes;
+
+    // The net change in memory, in bytes, between Start and Stop,
+    // i.e., total_allocated_bytes - total_deallocated_bytes.
+    // Initialized to TombstoneValue if the metric is not available.
+    int64_t net_heap_growth;
+  };
+
+  virtual ~MemoryManager() {}
+
+  // Implement this to start recording allocation information.
+  virtual void Start() = 0;
+
+  // Implement this to stop recording and fill out the given Result structure.
+  BENCHMARK_DEPRECATED_MSG("Use Stop(Result&) instead")
+  virtual void Stop(Result* result) = 0;
+
+  // FIXME(vyng): Make this pure virtual once we've migrated current users.
+  BENCHMARK_DISABLE_DEPRECATED_WARNING
+  virtual void Stop(Result& result) { Stop(&result); }
+  BENCHMARK_RESTORE_DEPRECATED_WARNING
+};
 
 // Register a MemoryManager instance that will be used to collect and report
 // allocation measurements for benchmark runs.
@@ -327,6 +396,14 @@
 #define BENCHMARK_HAS_NO_INLINE_ASSEMBLY
 #endif
 
+// Force the compiler to flush pending writes to global memory. Acts as an
+// effective read/write barrier
+#ifdef BENCHMARK_HAS_CXX11
+inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
+  std::atomic_signal_fence(std::memory_order_acq_rel);
+}
+#endif
+
 // The DoNotOptimize(...) function can be used to prevent a value or
 // expression from being optimized away by the compiler. This function is
 // intended to add little to no overhead.
@@ -346,11 +423,11 @@
 #endif
 }
 
-// Force the compiler to flush pending writes to global memory. Acts as an
-// effective read/write barrier
+#ifndef BENCHMARK_HAS_CXX11
 inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
   asm volatile("" : : : "memory");
 }
+#endif
 #elif defined(_MSC_VER)
 template <class Tp>
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
@@ -358,13 +435,15 @@
   _ReadWriteBarrier();
 }
 
+#ifndef BENCHMARK_HAS_CXX11
 inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { _ReadWriteBarrier(); }
+#endif
 #else
 template <class Tp>
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
   internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
 }
-// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
+// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers, before C++11.
 #endif
 
 // This class is used for user-defined counters.
@@ -374,27 +453,27 @@
     kDefaults = 0,
     // Mark the counter as a rate. It will be presented divided
     // by the duration of the benchmark.
-    kIsRate = 1U << 0U,
+    kIsRate = 1 << 0,
     // Mark the counter as a thread-average quantity. It will be
     // presented divided by the number of threads.
-    kAvgThreads = 1U << 1U,
+    kAvgThreads = 1 << 1,
     // Mark the counter as a thread-average rate. See above.
     kAvgThreadsRate = kIsRate | kAvgThreads,
     // Mark the counter as a constant value, valid/same for *every* iteration.
     // When reporting, it will be *multiplied* by the iteration count.
-    kIsIterationInvariant = 1U << 2U,
+    kIsIterationInvariant = 1 << 2,
     // Mark the counter as a constant rate.
     // When reporting, it will be *multiplied* by the iteration count
     // and then divided by the duration of the benchmark.
     kIsIterationInvariantRate = kIsRate | kIsIterationInvariant,
     // Mark the counter as a iteration-average quantity.
     // It will be presented divided by the number of iterations.
-    kAvgIterations = 1U << 3U,
+    kAvgIterations = 1 << 3,
     // Mark the counter as a iteration-average rate. See above.
     kAvgIterationsRate = kIsRate | kAvgIterations,
 
     // In the end, invert the result. This is always done last!
-    kInvert = 1U << 31U
+    kInvert = 1 << 31
   };
 
   enum OneK {
@@ -412,7 +491,7 @@
   Counter(double v = 0., Flags f = kDefaults, OneK k = kIs1000)
       : value(v), flags(f), oneK(k) {}
 
-  BENCHMARK_ALWAYS_INLINE operator double const&() const { return value; }
+  BENCHMARK_ALWAYS_INLINE operator double const &() const { return value; }
   BENCHMARK_ALWAYS_INLINE operator double&() { return value; }
 };
 
@@ -439,6 +518,8 @@
 
 typedef uint64_t IterationCount;
 
+enum StatisticUnit { kTime, kPercentage };
+
 // BigOFunc is passed to a benchmark in order to specify the asymptotic
 // computational complexity for the benchmark.
 typedef double(BigOFunc)(IterationCount);
@@ -451,9 +532,11 @@
 struct Statistics {
   std::string name_;
   StatisticsFunc* compute_;
+  StatisticUnit unit_;
 
-  Statistics(const std::string& name, StatisticsFunc* compute)
-      : name_(name), compute_(compute) {}
+  Statistics(const std::string& name, StatisticsFunc* compute,
+             StatisticUnit unit = kTime)
+      : name_(name), compute_(compute), unit_(unit) {}
 };
 
 class BenchmarkInstance;
@@ -656,6 +739,14 @@
   BENCHMARK_DEPRECATED_MSG("use 'range(1)' instead")
   int64_t range_y() const { return range(1); }
 
+  // Number of threads concurrently executing the benchmark.
+  BENCHMARK_ALWAYS_INLINE
+  int threads() const { return threads_; }
+
+  // Index of the executing thread. Values from [0, threads).
+  BENCHMARK_ALWAYS_INLINE
+  int thread_index() const { return thread_index_; }
+
   BENCHMARK_ALWAYS_INLINE
   IterationCount iterations() const {
     if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
@@ -664,8 +755,8 @@
     return max_iterations - total_iterations_ + batch_leftover_;
   }
 
- private
-     :  // items we expect on the first cache line (ie 64 bytes of the struct)
+ private:
+  // items we expect on the first cache line (ie 64 bytes of the struct)
   // When total_iterations_ is 0, KeepRunning() and friends will return false.
   // May be larger than max_iterations.
   IterationCount total_iterations_;
@@ -683,7 +774,7 @@
   bool finished_;
   bool error_occurred_;
 
- private:  // items we don't need on the first cache line
+  // items we don't need on the first cache line
   std::vector<int64_t> range_;
 
   int64_t complexity_n_;
@@ -691,10 +782,6 @@
  public:
   // Container for user-defined counters.
   UserCounters counters;
-  // Index of the executing thread. Values from [0, threads).
-  const int thread_index;
-  // Number of threads concurrently executing the benchmark.
-  const int threads;
 
  private:
   State(IterationCount max_iters, const std::vector<int64_t>& ranges,
@@ -707,6 +794,10 @@
   // is_batch must be true unless n is 1.
   bool KeepRunningInternal(IterationCount n, bool is_batch);
   void FinishKeepRunning();
+
+  const int thread_index_;
+  const int threads_;
+
   internal::ThreadTimer* const timer_;
   internal::ThreadManager* const manager_;
   internal::PerfCountersMeasurement* const perf_counters_measurement_;
@@ -878,6 +969,23 @@
     return Ranges(ranges);
   }
 
+  // Have "setup" and/or "teardown" invoked once for every benchmark run.
+  // If the benchmark is multi-threaded (will run in k threads concurrently),
+  // the setup callback will be invoked exactly once (not k times) before
+  // each run with k threads. Time allowing (e.g. for a short benchmark), there
+  // may be multiple such runs per benchmark, each run with its own
+  // "setup"/"teardown".
+  //
+  // If the benchmark uses different size groups of threads (e.g. via
+  // ThreadRange), the above will be true for each size group.
+  //
+  // The callback will be passed a State object, which includes the number
+  // of threads, thread-index, benchmark arguments, etc.
+  //
+  // The callback must not be NULL or self-deleting.
+  Benchmark* Setup(void (*setup)(const benchmark::State&));
+  Benchmark* Teardown(void (*teardown)(const benchmark::State&));
+
   // Pass this benchmark object to *func, which can customize
   // the benchmark by calling various methods like Arg, Args,
   // Threads, etc.
@@ -946,7 +1054,9 @@
   Benchmark* Complexity(BigOFunc* complexity);
 
   // Add this statistics to be computed over all the values of benchmark run
-  Benchmark* ComputeStatistics(std::string name, StatisticsFunc* statistics);
+  Benchmark* ComputeStatistics(const std::string& name,
+                               StatisticsFunc* statistics,
+                               StatisticUnit unit = kTime);
 
   // Support for running multiple copies of the same benchmark concurrently
   // in multiple threads.  This may be useful when measuring the scaling
@@ -1008,6 +1118,10 @@
   std::vector<Statistics> statistics_;
   std::vector<int> thread_counts_;
 
+  typedef void (*callback_function)(const benchmark::State&);
+  callback_function setup_;
+  callback_function teardown_;
+
   Benchmark& operator=(Benchmark const&);
 };
 
@@ -1056,8 +1170,7 @@
 
   LambdaBenchmark(LambdaBenchmark const&) = delete;
 
- private:
-  template <class Lam>
+  template <class Lam>  // NOLINTNEXTLINE(readability-redundant-declaration)
   friend Benchmark* ::benchmark::RegisterBenchmark(const char*, Lam&&);
 
   Lambda lambda_;
@@ -1131,22 +1244,37 @@
 #endif
 
 // Helpers for generating unique variable names
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_PRIVATE_NAME(...)                                      \
+  BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, \
+                           __VA_ARGS__)
+#else
 #define BENCHMARK_PRIVATE_NAME(n) \
   BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
+#endif  // BENCHMARK_HAS_CXX11
+
 #define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
 #define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
 // Helper for concatenation with macro name expansion
 #define BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method) \
-    BaseClass##_##Method##_Benchmark
+  BaseClass##_##Method##_Benchmark
 
 #define BENCHMARK_PRIVATE_DECLARE(n)                                 \
   static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \
       BENCHMARK_UNUSED
 
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK(...)                                               \
+  BENCHMARK_PRIVATE_DECLARE(_benchmark_) =                           \
+      (::benchmark::internal::RegisterBenchmarkInternal(             \
+          new ::benchmark::internal::FunctionBenchmark(#__VA_ARGS__, \
+                                                       &__VA_ARGS__)))
+#else
 #define BENCHMARK(n)                                     \
   BENCHMARK_PRIVATE_DECLARE(n) =                         \
       (::benchmark::internal::RegisterBenchmarkInternal( \
           new ::benchmark::internal::FunctionBenchmark(#n, n)))
+#endif  // BENCHMARK_HAS_CXX11
 
 // Old-style macros
 #define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
@@ -1210,7 +1338,7 @@
 #define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)                  \
   class BaseClass##_##Method##_Benchmark : public BaseClass {           \
    public:                                                              \
-    BaseClass##_##Method##_Benchmark() : BaseClass() {                  \
+    BaseClass##_##Method##_Benchmark() {                                \
       this->SetName(#BaseClass "/" #Method);                            \
     }                                                                   \
                                                                         \
@@ -1221,7 +1349,7 @@
 #define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a)     \
   class BaseClass##_##Method##_Benchmark : public BaseClass<a> {        \
    public:                                                              \
-    BaseClass##_##Method##_Benchmark() : BaseClass<a>() {               \
+    BaseClass##_##Method##_Benchmark() {                                \
       this->SetName(#BaseClass "<" #a ">/" #Method);                    \
     }                                                                   \
                                                                         \
@@ -1232,7 +1360,7 @@
 #define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b)  \
   class BaseClass##_##Method##_Benchmark : public BaseClass<a, b> {     \
    public:                                                              \
-    BaseClass##_##Method##_Benchmark() : BaseClass<a, b>() {            \
+    BaseClass##_##Method##_Benchmark() {                                \
       this->SetName(#BaseClass "<" #a "," #b ">/" #Method);             \
     }                                                                   \
                                                                         \
@@ -1244,7 +1372,7 @@
 #define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, ...)       \
   class BaseClass##_##Method##_Benchmark : public BaseClass<__VA_ARGS__> { \
    public:                                                                 \
-    BaseClass##_##Method##_Benchmark() : BaseClass<__VA_ARGS__>() {        \
+    BaseClass##_##Method##_Benchmark() {                                   \
       this->SetName(#BaseClass "<" #__VA_ARGS__ ">/" #Method);             \
     }                                                                      \
                                                                            \
@@ -1334,11 +1462,7 @@
     int num_sharing;
   };
 
-  enum Scaling {
-    UNKNOWN,
-    ENABLED,
-    DISABLED
-  };
+  enum Scaling { UNKNOWN, ENABLED, DISABLED };
 
   int num_cpus;
   Scaling scaling;
@@ -1402,6 +1526,7 @@
 
     Run()
         : run_type(RT_Iteration),
+          aggregate_unit(kTime),
           error_occurred(false),
           iterations(1),
           threads(1),
@@ -1414,10 +1539,8 @@
           complexity_n(0),
           report_big_o(false),
           report_rms(false),
-          counters(),
-          has_memory_result(false),
-          allocs_per_iter(0.0),
-          max_bytes_used(0) {}
+          memory_result(NULL),
+          allocs_per_iter(0.0) {}
 
     std::string benchmark_name() const;
     BenchmarkName run_name;
@@ -1425,6 +1548,7 @@
     int64_t per_family_instance_index;
     RunType run_type;
     std::string aggregate_name;
+    StatisticUnit aggregate_unit;
     std::string report_label;  // Empty if not set by benchmark.
     bool error_occurred;
     std::string error_message;
@@ -1467,9 +1591,8 @@
     UserCounters counters;
 
     // Memory metrics.
-    bool has_memory_result;
+    const MemoryManager::Result* memory_result;
     double allocs_per_iter;
-    int64_t max_bytes_used;
   };
 
   struct PerFamilyRunReports {
@@ -1552,10 +1675,7 @@
     OO_Defaults = OO_ColorTabular
   };
   explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults)
-      : output_options_(opts_),
-        name_field_width_(0),
-        prev_counters_(),
-        printed_header_(false) {}
+      : output_options_(opts_), name_field_width_(0), printed_header_(false) {}
 
   virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
   virtual void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
@@ -1598,29 +1718,6 @@
   std::set<std::string> user_counter_names_;
 };
 
-// If a MemoryManager is registered, it can be used to collect and report
-// allocation metrics for a run of the benchmark.
-class MemoryManager {
- public:
-  struct Result {
-    Result() : num_allocs(0), max_bytes_used(0) {}
-
-    // The number of allocations made in total between Start and Stop.
-    int64_t num_allocs;
-
-    // The peak memory use between Start and Stop.
-    int64_t max_bytes_used;
-  };
-
-  virtual ~MemoryManager() {}
-
-  // Implement this to start recording allocation information.
-  virtual void Start() = 0;
-
-  // Implement this to stop recording and fill out the given Result structure.
-  virtual void Stop(Result* result) = 0;
-};
-
 inline const char* GetTimeUnitString(TimeUnit unit) {
   switch (unit) {
     case kSecond:
@@ -1649,6 +1746,20 @@
   BENCHMARK_UNREACHABLE();
 }
 
+// Creates a list of integer values for the given range and multiplier.
+// This can be used together with ArgsProduct() to allow multiple ranges
+// with different multipliers.
+// Example:
+// ArgsProduct({
+//   CreateRange(0, 1024, /*multi=*/32),
+//   CreateRange(0, 100, /*multi=*/4),
+//   CreateDenseRange(0, 4, /*step=*/1),
+// });
+std::vector<int64_t> CreateRange(int64_t lo, int64_t hi, int multi);
+
+// Creates a list of integer values for the given range and step.
+std::vector<int64_t> CreateDenseRange(int64_t start, int64_t limit, int step);
+
 }  // namespace benchmark
 
 #endif  // BENCHMARK_BENCHMARK_H_
diff --git a/third-party/benchmark/requirements.txt b/third-party/benchmark/requirements.txt
--- a/third-party/benchmark/requirements.txt
+++ b/third-party/benchmark/requirements.txt
@@ -1,2 +1,3 @@
 numpy == 1.19.4
 scipy == 1.5.4
+pandas == 1.1.5
diff --git a/third-party/benchmark/setup.py b/third-party/benchmark/setup.py
--- a/third-party/benchmark/setup.py
+++ b/third-party/benchmark/setup.py
@@ -1,5 +1,6 @@
 import os
 import posixpath
+import platform
 import re
 import shutil
 import sys
@@ -89,6 +90,8 @@
             # Link with python*.lib.
             for library_dir in self.library_dirs:
                 bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
+        elif sys.platform == "darwin" and platform.machine() == "x86_64":
+            bazel_argv.append("--macos_minimum_os=10.9")
 
         self.spawn(bazel_argv)
 
diff --git a/third-party/benchmark/src/CMakeLists.txt b/third-party/benchmark/src/CMakeLists.txt
--- a/third-party/benchmark/src/CMakeLists.txt
+++ b/third-party/benchmark/src/CMakeLists.txt
@@ -25,38 +25,32 @@
   SOVERSION ${GENERIC_LIB_SOVERSION}
 )
 target_include_directories(benchmark PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
-    )
+  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>)
 
 # libpfm, if available
 if (HAVE_LIBPFM)
-  target_link_libraries(benchmark libpfm.a)
+  target_link_libraries(benchmark PRIVATE pfm)
   add_definitions(-DHAVE_LIBPFM)
 endif()
 
 # Link threads.
-target_link_libraries(benchmark  ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-find_library(LIBRT rt)
-if(LIBRT)
-  target_link_libraries(benchmark ${LIBRT})
-endif()
+target_link_libraries(benchmark PRIVATE Threads::Threads)
+
+target_link_libraries(benchmark PRIVATE ${BENCHMARK_CXX_LIBRARIES})
+
+if(HAVE_LIB_RT)  # link librt only when HAVE_LIB_RT is set
+  target_link_libraries(benchmark PRIVATE rt)
+endif()
 
-if(CMAKE_BUILD_TYPE)
-  string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPER)
-endif()
-if(NOT CMAKE_THREAD_LIBS_INIT AND "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}" MATCHES ".*-fsanitize=[^ ]*address.*")
-  message(WARNING "CMake's FindThreads.cmake did not fail, but CMAKE_THREAD_LIBS_INIT ended up being empty. This was fixed in https://github.com/Kitware/CMake/commit/d53317130e84898c5328c237186dbd995aaf1c12 Let's guess that -pthread is sufficient.")
-  target_link_libraries(benchmark -pthread)
-endif()
 
 # We need extra libraries on Windows
 if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
-  target_link_libraries(benchmark shlwapi)
+  target_link_libraries(benchmark PRIVATE shlwapi)
 endif()
 
 # We need extra libraries on Solaris
 if(${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
-  target_link_libraries(benchmark kstat)
+  target_link_libraries(benchmark PRIVATE kstat)
 endif()
 
 # Benchmark main library
@@ -67,33 +61,44 @@
   VERSION ${GENERIC_LIB_VERSION}
   SOVERSION ${GENERIC_LIB_SOVERSION}
 )
-target_include_directories(benchmark PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
-    )
-target_link_libraries(benchmark_main benchmark::benchmark)
+target_link_libraries(benchmark_main PUBLIC benchmark::benchmark)
 
 
-set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
+set(generated_dir "${PROJECT_BINARY_DIR}")
 
 set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
 set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
 set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc")
+set(targets_to_export benchmark benchmark_main)
 set(targets_export_name "${PROJECT_NAME}Targets")
 
 set(namespace "${PROJECT_NAME}::")
 
 include(CMakePackageConfigHelpers)
+
+# Generate ${PROJECT_NAME}Config.cmake from the in-tree template.
+configure_package_config_file(
+  "${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in"
+  "${project_config}"
+  INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}"
+  NO_SET_AND_CHECK_MACRO NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
 write_basic_package_version_file(
   "${version_config}" VERSION ${GENERIC_LIB_VERSION} COMPATIBILITY SameMajorVersion
 )
 
-configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY)
 configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY)
 
+# Write a targets file into the build tree so it can be used without installing.
+export(
+  TARGETS ${targets_to_export} NAMESPACE "${namespace}"
+  FILE "${generated_dir}/${targets_export_name}.cmake"
+)
+
 if (BENCHMARK_ENABLE_INSTALL)
   # Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
   install(
-    TARGETS benchmark benchmark_main
+    TARGETS ${targets_to_export}
     EXPORT ${targets_export_name}
     ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
     LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
@@ -118,3 +123,37 @@
       NAMESPACE "${namespace}"
       DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
 endif()
+
+# With Doxygen enabled, generate HTML docs (and install them when requested);
+# otherwise install the contents of docs/ as-is when requested.
+if (BENCHMARK_ENABLE_DOXYGEN)
+  find_package(Doxygen REQUIRED)
+  set(DOXYGEN_QUIET YES)
+  set(DOXYGEN_RECURSIVE YES)
+  set(DOXYGEN_GENERATE_HTML YES)
+  set(DOXYGEN_GENERATE_MAN NO)
+  set(DOXYGEN_MARKDOWN_SUPPORT YES)
+  set(DOXYGEN_BUILTIN_STL_SUPPORT YES)
+  set(DOXYGEN_EXTRACT_PACKAGE YES)
+  set(DOXYGEN_EXTRACT_STATIC YES)
+  set(DOXYGEN_SHOW_INCLUDE_FILES YES)
+  set(DOXYGEN_BINARY_TOC YES)
+  set(DOXYGEN_TOC_EXPAND YES)
+  set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "index.md")
+  doxygen_add_docs(benchmark_doxygen
+    docs
+    include
+    src
+    ALL
+    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+    COMMENT "Building documentation with Doxygen.")
+  if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS)
+    install(
+      DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/html/"
+      DESTINATION ${CMAKE_INSTALL_DOCDIR})
+  endif()
+elseif (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS)
+  install(
+    DIRECTORY "${PROJECT_SOURCE_DIR}/docs/"
+    DESTINATION ${CMAKE_INSTALL_DOCDIR})
+endif()
diff --git a/third-party/benchmark/src/benchmark.cc b/third-party/benchmark/src/benchmark.cc
--- a/third-party/benchmark/src/benchmark.cc
+++ b/third-party/benchmark/src/benchmark.cc
@@ -56,75 +56,75 @@
 #include "thread_manager.h"
 #include "thread_timer.h"
 
+namespace benchmark {
 // Print a list of benchmarks. This option overrides all other options.
-DEFINE_bool(benchmark_list_tests, false);
+BM_DEFINE_bool(benchmark_list_tests, false);
 
 // A regular expression that specifies the set of benchmarks to execute.  If
 // this flag is empty, or if this flag is the string \"all\", all benchmarks
 // linked into the binary are run.
-DEFINE_string(benchmark_filter, ".");
+BM_DEFINE_string(benchmark_filter, "");
 
 // Minimum number of seconds we should run benchmark before results are
 // considered significant.  For cpu-time based tests, this is the lower bound
 // on the total cpu time used by all threads that make up the test.  For
 // real-time based tests, this is the lower bound on the elapsed time of the
 // benchmark execution, regardless of number of threads.
-DEFINE_double(benchmark_min_time, 0.5);
+BM_DEFINE_double(benchmark_min_time, 0.5);
 
 // The number of runs of each benchmark. If greater than 1, the mean and
 // standard deviation of the runs will be reported.
-DEFINE_int32(benchmark_repetitions, 1);
+BM_DEFINE_int32(benchmark_repetitions, 1);
 
 // If set, enable random interleaving of repetitions of all benchmarks.
 // See http://github.com/google/benchmark/issues/1051 for details.
-DEFINE_bool(benchmark_enable_random_interleaving, false);
+BM_DEFINE_bool(benchmark_enable_random_interleaving, false);
 
 // Report the result of each benchmark repetitions. When 'true' is specified
 // only the mean, standard deviation, and other statistics are reported for
 // repeated benchmarks. Affects all reporters.
-DEFINE_bool(benchmark_report_aggregates_only, false);
+BM_DEFINE_bool(benchmark_report_aggregates_only, false);
 
 // Display the result of each benchmark repetitions. When 'true' is specified
 // only the mean, standard deviation, and other statistics are displayed for
 // repeated benchmarks. Unlike benchmark_report_aggregates_only, only affects
 // the display reporter, but  *NOT* file reporter, which will still contain
 // all the output.
-DEFINE_bool(benchmark_display_aggregates_only, false);
+BM_DEFINE_bool(benchmark_display_aggregates_only, false);
 
 // The format to use for console output.
 // Valid values are 'console', 'json', or 'csv'.
-DEFINE_string(benchmark_format, "console");
+BM_DEFINE_string(benchmark_format, "console");
 
 // The format to use for file output.
 // Valid values are 'console', 'json', or 'csv'.
-DEFINE_string(benchmark_out_format, "json");
+BM_DEFINE_string(benchmark_out_format, "json");
 
 // The file to write additional output to.
-DEFINE_string(benchmark_out, "");
+BM_DEFINE_string(benchmark_out, "");
 
 // Whether to use colors in the output.  Valid values:
 // 'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use colors if
 // the output is being sent to a terminal and the TERM environment variable is
 // set to a terminal type that supports colors.
-DEFINE_string(benchmark_color, "auto");
+BM_DEFINE_string(benchmark_color, "auto");
 
 // Whether to use tabular format when printing user counters to the console.
 // Valid values: 'true'/'yes'/1, 'false'/'no'/0.  Defaults to false.
-DEFINE_bool(benchmark_counters_tabular, false);
-
-// The level of verbose logging to output
-DEFINE_int32(v, 0);
+BM_DEFINE_bool(benchmark_counters_tabular, false);
 
 // List of additional perf counters to collect, in libpfm format. For more
 // information about libpfm: https://man7.org/linux/man-pages/man3/libpfm.3.html
-DEFINE_string(benchmark_perf_counters, "");
-
-namespace benchmark {
-namespace internal {
+BM_DEFINE_string(benchmark_perf_counters, "");
 
 // Extra context to include in the output formatted as comma-separated key-value
 // pairs. Kept internal as it's only used for parsing from env/command line.
-DEFINE_kvpairs(benchmark_context, {});
+BM_DEFINE_kvpairs(benchmark_context, {});
+
+// The level of verbose logging to output
+BM_DEFINE_int32(v, 0);
+
+namespace internal {
 
 std::map<std::string, std::string>* global_context = nullptr;
 
@@ -145,14 +145,14 @@
       error_occurred_(false),
       range_(ranges),
       complexity_n_(0),
-      counters(),
-      thread_index(thread_i),
-      threads(n_threads),
+      thread_index_(thread_i),
+      threads_(n_threads),
       timer_(timer),
       manager_(manager),
       perf_counters_measurement_(perf_counters_measurement) {
-  CHECK(max_iterations != 0) << "At least one iteration must be run";
-  CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
+  BM_CHECK(max_iterations != 0) << "At least one iteration must be run";
+  BM_CHECK_LT(thread_index_, threads_)
+      << "thread_index must be less than threads";
 
   // Note: The use of offsetof below is technically undefined until C++17
   // because State is not a standard layout type. However, all compilers
@@ -181,21 +181,21 @@
 
 void State::PauseTiming() {
   // Add in time accumulated so far
-  CHECK(started_ && !finished_ && !error_occurred_);
+  BM_CHECK(started_ && !finished_ && !error_occurred_);
   timer_->StopTimer();
   if (perf_counters_measurement_) {
     auto measurements = perf_counters_measurement_->StopAndGetMeasurements();
     for (const auto& name_and_measurement : measurements) {
       auto name = name_and_measurement.first;
       auto measurement = name_and_measurement.second;
-      CHECK_EQ(counters[name], 0.0);
+      BM_CHECK_EQ(counters[name], 0.0);
       counters[name] = Counter(measurement, Counter::kAvgIterations);
     }
   }
 }
 
 void State::ResumeTiming() {
-  CHECK(started_ && !finished_ && !error_occurred_);
+  BM_CHECK(started_ && !finished_ && !error_occurred_);
   timer_->StartTimer();
   if (perf_counters_measurement_) {
     perf_counters_measurement_->Start();
@@ -203,7 +203,7 @@
 }
 
 void State::SkipWithError(const char* msg) {
-  CHECK(msg);
+  BM_CHECK(msg);
   error_occurred_ = true;
   {
     MutexLock l(manager_->GetBenchmarkMutex());
@@ -226,7 +226,7 @@
 }
 
 void State::StartKeepRunning() {
-  CHECK(!started_ && !finished_);
+  BM_CHECK(!started_ && !finished_);
   started_ = true;
   total_iterations_ = error_occurred_ ? 0 : max_iterations;
   manager_->StartStopBarrier();
@@ -234,7 +234,7 @@
 }
 
 void State::FinishKeepRunning() {
-  CHECK(started_ && (!finished_ || error_occurred_));
+  BM_CHECK(started_ && (!finished_ || error_occurred_));
   if (!error_occurred_) {
     PauseTiming();
   }
@@ -282,7 +282,7 @@
                    BenchmarkReporter* display_reporter,
                    BenchmarkReporter* file_reporter) {
   // Note the file_reporter can be null.
-  CHECK(display_reporter != nullptr);
+  BM_CHECK(display_reporter != nullptr);
 
   // Determine the width of the name field using a minimum width of 10.
   bool might_have_aggregates = FLAGS_benchmark_repetitions > 1;
@@ -328,7 +328,7 @@
     }
     assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
 
-    std::vector<int> repetition_indices;
+    std::vector<size_t> repetition_indices;
     repetition_indices.reserve(num_repetitions_total);
     for (size_t runner_index = 0, num_runners = runners.size();
          runner_index != num_runners; ++runner_index) {
@@ -362,7 +362,7 @@
                                              additional_run_stats.begin(),
                                              additional_run_stats.end());
           per_family_reports.erase(
-              (int)reports_for_family->Runs.front().family_index);
+              static_cast<int>(reports_for_family->Runs.front().family_index));
         }
       }
 
@@ -377,10 +377,7 @@
 
 // Disable deprecated warnings temporarily because we need to reference
 // CSVReporter but don't want to trigger -Werror=-Wdeprecated-declarations
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
+BENCHMARK_DISABLE_DEPRECATED_WARNING
 
 std::unique_ptr<BenchmarkReporter> CreateReporter(
     std::string const& name, ConsoleReporter::OutputOptions output_opts) {
@@ -397,9 +394,7 @@
   }
 }
 
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
+BENCHMARK_RESTORE_DEPRECATED_WARNING
 
 }  // end namespace
 
@@ -434,16 +429,32 @@
 }  // end namespace internal
 
 size_t RunSpecifiedBenchmarks() {
-  return RunSpecifiedBenchmarks(nullptr, nullptr);
+  return RunSpecifiedBenchmarks(nullptr, nullptr, FLAGS_benchmark_filter);
+}
+
+size_t RunSpecifiedBenchmarks(std::string spec) {
+  return RunSpecifiedBenchmarks(nullptr, nullptr, std::move(spec));
 }
 
 size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter) {
-  return RunSpecifiedBenchmarks(display_reporter, nullptr);
+  return RunSpecifiedBenchmarks(display_reporter, nullptr,
+                                FLAGS_benchmark_filter);
+}
+
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
+                              std::string spec) {
+  return RunSpecifiedBenchmarks(display_reporter, nullptr, std::move(spec));
 }
 
 size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
                               BenchmarkReporter* file_reporter) {
-  std::string spec = FLAGS_benchmark_filter;
+  return RunSpecifiedBenchmarks(display_reporter, file_reporter,
+                                FLAGS_benchmark_filter);
+}
+
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
+                              BenchmarkReporter* file_reporter,
+                              std::string spec) {
   if (spec.empty() || spec == "all")
     spec = ".";  // Regexp that matches all benchmarks
 
@@ -499,6 +510,8 @@
   return benchmarks.size();
 }
 
+std::string GetBenchmarkFilter() { return FLAGS_benchmark_filter; }
+
 void RegisterMemoryManager(MemoryManager* manager) {
   internal::memory_manager = manager;
 }
@@ -530,6 +543,7 @@
           "          [--benchmark_out_format=<json|console|csv>]\n"
           "          [--benchmark_color={auto|true|false}]\n"
           "          [--benchmark_counters_tabular={true|false}]\n"
+          "          [--benchmark_perf_counters=<counter>,...]\n"
           "          [--benchmark_context=<key>=<value>,...]\n"
           "          [--v=<verbosity>]\n");
   exit(0);
@@ -558,9 +572,6 @@
         ParseStringFlag(argv[i], "benchmark_out_format",
                         &FLAGS_benchmark_out_format) ||
         ParseStringFlag(argv[i], "benchmark_color", &FLAGS_benchmark_color) ||
-        // "color_print" is the deprecated name for "benchmark_color".
-        // TODO: Remove this.
-        ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) ||
         ParseBoolFlag(argv[i], "benchmark_counters_tabular",
                       &FLAGS_benchmark_counters_tabular) ||
         ParseStringFlag(argv[i], "benchmark_perf_counters",
@@ -602,9 +613,7 @@
   internal::LogLevel() = FLAGS_v;
 }
 
-void Shutdown() {
-  delete internal::global_context;
-}
+void Shutdown() { delete internal::global_context; }
 
 bool ReportUnrecognizedArguments(int argc, char** argv) {
   for (int i = 1; i < argc; ++i) {
diff --git a/third-party/benchmark/src/benchmark_api_internal.h b/third-party/benchmark/src/benchmark_api_internal.h
--- a/third-party/benchmark/src/benchmark_api_internal.h
+++ b/third-party/benchmark/src/benchmark_api_internal.h
@@ -32,12 +32,14 @@
   bool use_real_time() const { return use_real_time_; }
   bool use_manual_time() const { return use_manual_time_; }
   BigO complexity() const { return complexity_; }
-  BigOFunc& complexity_lambda() const { return *complexity_lambda_; }
+  BigOFunc* complexity_lambda() const { return complexity_lambda_; }
   const std::vector<Statistics>& statistics() const { return statistics_; }
   int repetitions() const { return repetitions_; }
   double min_time() const { return min_time_; }
   IterationCount iterations() const { return iterations_; }
   int threads() const { return threads_; }
+  void Setup() const;
+  void Teardown() const;
 
   State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
             internal::ThreadManager* manager,
@@ -62,6 +64,10 @@
   double min_time_;
   IterationCount iterations_;
   int threads_;  // Number of concurrent threads to us
+
+  typedef void (*callback_function)(const benchmark::State&);
+  callback_function setup_ = nullptr;
+  callback_function teardown_ = nullptr;
 };
 
 bool FindBenchmarksInternal(const std::string& re,
diff --git a/third-party/benchmark/src/benchmark_api_internal.cc b/third-party/benchmark/src/benchmark_api_internal.cc
--- a/third-party/benchmark/src/benchmark_api_internal.cc
+++ b/third-party/benchmark/src/benchmark_api_internal.cc
@@ -78,6 +78,9 @@
   if (!benchmark_.thread_counts_.empty()) {
     name_.threads = StrFormat("threads:%d", threads_);
   }
+
+  setup_ = benchmark_.setup_;
+  teardown_ = benchmark_.teardown_;
 }
 
 State BenchmarkInstance::Run(
@@ -90,5 +93,20 @@
   return st;
 }
 
+void BenchmarkInstance::Setup() const {
+  if (setup_) {
+    State st(/*iters*/ 1, args_, /*thread_id*/ 0, threads_, nullptr, nullptr,
+             nullptr);
+    setup_(st);
+  }
+}
+
+void BenchmarkInstance::Teardown() const {
+  if (teardown_) {
+    State st(/*iters*/ 1, args_, /*thread_id*/ 0, threads_, nullptr, nullptr,
+             nullptr);
+    teardown_(st);
+  }
+}
 }  // namespace internal
 }  // namespace benchmark
diff --git a/third-party/benchmark/src/benchmark_register.h b/third-party/benchmark/src/benchmark_register.h
--- a/third-party/benchmark/src/benchmark_register.h
+++ b/third-party/benchmark/src/benchmark_register.h
@@ -12,11 +12,11 @@
 // Append the powers of 'mult' in the closed interval [lo, hi].
 // Returns iterator to the start of the inserted range.
 template <typename T>
-typename std::vector<T>::iterator
-AddPowers(std::vector<T>* dst, T lo, T hi, int mult) {
-  CHECK_GE(lo, 0);
-  CHECK_GE(hi, lo);
-  CHECK_GE(mult, 2);
+typename std::vector<T>::iterator AddPowers(std::vector<T>* dst, T lo, T hi,
+                                            int mult) {
+  BM_CHECK_GE(lo, 0);
+  BM_CHECK_GE(hi, lo);
+  BM_CHECK_GE(mult, 2);
 
   const size_t start_offset = dst->size();
 
@@ -38,10 +38,10 @@
 template <typename T>
 void AddNegatedPowers(std::vector<T>* dst, T lo, T hi, int mult) {
   // We negate lo and hi so we require that they cannot be equal to 'min'.
-  CHECK_GT(lo, std::numeric_limits<T>::min());
-  CHECK_GT(hi, std::numeric_limits<T>::min());
-  CHECK_GE(hi, lo);
-  CHECK_LE(hi, 0);
+  BM_CHECK_GT(lo, std::numeric_limits<T>::min());
+  BM_CHECK_GT(hi, std::numeric_limits<T>::min());
+  BM_CHECK_GE(hi, lo);
+  BM_CHECK_LE(hi, 0);
 
   // Add positive powers, then negate and reverse.
   // Casts necessary since small integers get promoted
@@ -60,8 +60,8 @@
   static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
                 "Args type must be a signed integer");
 
-  CHECK_GE(hi, lo);
-  CHECK_GE(mult, 2);
+  BM_CHECK_GE(hi, lo);
+  BM_CHECK_GE(mult, 2);
 
   // Add "lo"
   dst->push_back(lo);
diff --git a/third-party/benchmark/src/benchmark_register.cc b/third-party/benchmark/src/benchmark_register.cc
--- a/third-party/benchmark/src/benchmark_register.cc
+++ b/third-party/benchmark/src/benchmark_register.cc
@@ -111,7 +111,7 @@
 bool BenchmarkFamilies::FindBenchmarks(
     std::string spec, std::vector<BenchmarkInstance>* benchmarks,
     std::ostream* ErrStream) {
-  CHECK(ErrStream);
+  BM_CHECK(ErrStream);
   auto& Err = *ErrStream;
   // Make regular expression out of command-line flag
   std::string error_msg;
@@ -211,10 +211,13 @@
       use_real_time_(false),
       use_manual_time_(false),
       complexity_(oNone),
-      complexity_lambda_(nullptr) {
+      complexity_lambda_(nullptr),
+      setup_(nullptr),
+      teardown_(nullptr) {
   ComputeStatistics("mean", StatisticsMean);
   ComputeStatistics("median", StatisticsMedian);
   ComputeStatistics("stddev", StatisticsStdDev);
+  ComputeStatistics("cv", StatisticsCV, kPercentage);
 }
 
 Benchmark::~Benchmark() {}
@@ -225,7 +228,7 @@
 }
 
 Benchmark* Benchmark::Arg(int64_t x) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
   args_.push_back({x});
   return this;
 }
@@ -236,7 +239,7 @@
 }
 
 Benchmark* Benchmark::Range(int64_t start, int64_t limit) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
   std::vector<int64_t> arglist;
   AddRange(&arglist, start, limit, range_multiplier_);
 
@@ -248,7 +251,7 @@
 
 Benchmark* Benchmark::Ranges(
     const std::vector<std::pair<int64_t, int64_t>>& ranges) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
   std::vector<std::vector<int64_t>> arglists(ranges.size());
   for (std::size_t i = 0; i < ranges.size(); i++) {
     AddRange(&arglists[i], ranges[i].first, ranges[i].second,
@@ -262,7 +265,7 @@
 
 Benchmark* Benchmark::ArgsProduct(
     const std::vector<std::vector<int64_t>>& arglists) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(arglists.size()));
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(arglists.size()));
 
   std::vector<std::size_t> indices(arglists.size());
   const std::size_t total = std::accumulate(
@@ -289,20 +292,20 @@
 }
 
 Benchmark* Benchmark::ArgName(const std::string& name) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
   arg_names_ = {name};
   return this;
 }
 
 Benchmark* Benchmark::ArgNames(const std::vector<std::string>& names) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
   arg_names_ = names;
   return this;
 }
 
 Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
-  CHECK_LE(start, limit);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK_LE(start, limit);
   for (int64_t arg = start; arg <= limit; arg += step) {
     args_.push_back({arg});
   }
@@ -310,7 +313,7 @@
 }
 
 Benchmark* Benchmark::Args(const std::vector<int64_t>& args) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
   args_.push_back(args);
   return this;
 }
@@ -320,28 +323,40 @@
   return this;
 }
 
+Benchmark* Benchmark::Setup(void (*setup)(const benchmark::State&)) {
+  BM_CHECK(setup != nullptr);
+  setup_ = setup;
+  return this;
+}
+
+Benchmark* Benchmark::Teardown(void (*teardown)(const benchmark::State&)) {
+  BM_CHECK(teardown != nullptr);
+  teardown_ = teardown;
+  return this;
+}
+
 Benchmark* Benchmark::RangeMultiplier(int multiplier) {
-  CHECK(multiplier > 1);
+  BM_CHECK(multiplier > 1);
   range_multiplier_ = multiplier;
   return this;
 }
 
 Benchmark* Benchmark::MinTime(double t) {
-  CHECK(t > 0.0);
-  CHECK(iterations_ == 0);
+  BM_CHECK(t > 0.0);
+  BM_CHECK(iterations_ == 0);
   min_time_ = t;
   return this;
 }
 
 Benchmark* Benchmark::Iterations(IterationCount n) {
-  CHECK(n > 0);
-  CHECK(IsZero(min_time_));
+  BM_CHECK(n > 0);
+  BM_CHECK(IsZero(min_time_));
   iterations_ = n;
   return this;
 }
 
 Benchmark* Benchmark::Repetitions(int n) {
-  CHECK(n > 0);
+  BM_CHECK(n > 0);
   repetitions_ = n;
   return this;
 }
@@ -374,14 +389,14 @@
 }
 
 Benchmark* Benchmark::UseRealTime() {
-  CHECK(!use_manual_time_)
+  BM_CHECK(!use_manual_time_)
       << "Cannot set UseRealTime and UseManualTime simultaneously.";
   use_real_time_ = true;
   return this;
 }
 
 Benchmark* Benchmark::UseManualTime() {
-  CHECK(!use_real_time_)
+  BM_CHECK(!use_real_time_)
       << "Cannot set UseRealTime and UseManualTime simultaneously.";
   use_manual_time_ = true;
   return this;
@@ -398,21 +413,22 @@
   return this;
 }
 
-Benchmark* Benchmark::ComputeStatistics(std::string name,
-                                        StatisticsFunc* statistics) {
-  statistics_.emplace_back(name, statistics);
+Benchmark* Benchmark::ComputeStatistics(const std::string& name,
+                                        StatisticsFunc* statistics,
+                                        StatisticUnit unit) {
+  statistics_.emplace_back(name, statistics, unit);
   return this;
 }
 
 Benchmark* Benchmark::Threads(int t) {
-  CHECK_GT(t, 0);
+  BM_CHECK_GT(t, 0);
   thread_counts_.push_back(t);
   return this;
 }
 
 Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
-  CHECK_GT(min_threads, 0);
-  CHECK_GE(max_threads, min_threads);
+  BM_CHECK_GT(min_threads, 0);
+  BM_CHECK_GE(max_threads, min_threads);
 
   AddRange(&thread_counts_, min_threads, max_threads, 2);
   return this;
@@ -420,9 +436,9 @@
 
 Benchmark* Benchmark::DenseThreadRange(int min_threads, int max_threads,
                                        int stride) {
-  CHECK_GT(min_threads, 0);
-  CHECK_GE(max_threads, min_threads);
-  CHECK_GE(stride, 1);
+  BM_CHECK_GT(min_threads, 0);
+  BM_CHECK_GE(max_threads, min_threads);
+  BM_CHECK_GE(stride, 1);
 
   for (auto i = min_threads; i < max_threads; i += stride) {
     thread_counts_.push_back(i);
@@ -458,4 +474,19 @@
   internal::BenchmarkFamilies::GetInstance()->ClearBenchmarks();
 }
 
+std::vector<int64_t> CreateRange(int64_t lo, int64_t hi, int multi) {
+  std::vector<int64_t> args;
+  internal::AddRange(&args, lo, hi, multi);
+  return args;
+}
+
+std::vector<int64_t> CreateDenseRange(int64_t start, int64_t limit, int step) {
+  // Reject step < 1: the loop below would otherwise never get past `limit`.
+  BM_CHECK_LE(start, limit);
+  BM_CHECK_GE(step, 1);
+  std::vector<int64_t> args;
+  for (int64_t arg = start; arg <= limit; arg += step) args.push_back(arg);
+  return args;
+}
+
 }  // end namespace benchmark
diff --git a/third-party/benchmark/src/benchmark_runner.h b/third-party/benchmark/src/benchmark_runner.h
--- a/third-party/benchmark/src/benchmark_runner.h
+++ b/third-party/benchmark/src/benchmark_runner.h
@@ -23,18 +23,14 @@
 #include "perf_counters.h"
 #include "thread_manager.h"
 
-DECLARE_double(benchmark_min_time);
-
-DECLARE_int32(benchmark_repetitions);
-
-DECLARE_bool(benchmark_report_aggregates_only);
-
-DECLARE_bool(benchmark_display_aggregates_only);
-
-DECLARE_string(benchmark_perf_counters);
-
 namespace benchmark {
 
+BM_DECLARE_double(benchmark_min_time);
+BM_DECLARE_int32(benchmark_repetitions);
+BM_DECLARE_bool(benchmark_report_aggregates_only);
+BM_DECLARE_bool(benchmark_display_aggregates_only);
+BM_DECLARE_string(benchmark_perf_counters);
+
 namespace internal {
 
 extern MemoryManager* memory_manager;
@@ -64,7 +60,7 @@
 
   BenchmarkReporter::PerFamilyRunReports* GetReportsForFamily() const {
     return reports_for_family;
-  };
+  }
 
  private:
   RunResults run_results;
@@ -80,6 +76,8 @@
 
   std::vector<std::thread> pool;
 
+  std::vector<MemoryManager::Result> memory_results;
+
   IterationCount iters;  // preserved between repetitions!
   // So only the first repetition has to find/calculate it,
   // the other repetitions will just use that precomputed iteration count.
diff --git a/third-party/benchmark/src/benchmark_runner.cc b/third-party/benchmark/src/benchmark_runner.cc
--- a/third-party/benchmark/src/benchmark_runner.cc
+++ b/third-party/benchmark/src/benchmark_runner.cc
@@ -67,7 +67,7 @@
     const benchmark::internal::BenchmarkInstance& b,
     const internal::ThreadManager::Result& results,
     IterationCount memory_iterations,
-    const MemoryManager::Result& memory_result, double seconds,
+    const MemoryManager::Result* memory_result, double seconds,
     int64_t repetition_index, int64_t repeats) {
   // Create report about this benchmark run.
   BenchmarkReporter::Run report;
@@ -99,12 +99,12 @@
     report.counters = results.counters;
 
     if (memory_iterations > 0) {
-      report.has_memory_result = true;
+      assert(memory_result != nullptr);
+      report.memory_result = memory_result;
       report.allocs_per_iter =
-          memory_iterations ? static_cast<double>(memory_result.num_allocs) /
+          memory_iterations ? static_cast<double>(memory_result->num_allocs) /
                                   memory_iterations
                             : 0;
-      report.max_bytes_used = memory_result.max_bytes_used;
     }
 
     internal::Finish(&report.counters, results.iterations, seconds,
@@ -124,7 +124,7 @@
           : internal::ThreadTimer::Create());
   State st =
       b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
-  CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
+  BM_CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
       << "Benchmark returned before State::KeepRunning() returned false!";
   {
     MutexLock l(manager->GetBenchmarkMutex());
@@ -168,14 +168,14 @@
          internal::ARM_DisplayReportAggregatesOnly);
     run_results.file_report_aggregates_only =
         (b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
-    CHECK(FLAGS_benchmark_perf_counters.empty() ||
-          perf_counters_measurement.IsValid())
+    BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
+             perf_counters_measurement.IsValid())
         << "Perf counters were requested but could not be set up.";
   }
 }
 
 BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
-  VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n";
+  BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n";
 
   std::unique_ptr<internal::ThreadManager> manager;
   manager.reset(new internal::ThreadManager(b.threads()));
@@ -210,8 +210,8 @@
   // If we were measuring whole-process CPU usage, adjust the CPU time too.
   if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();
 
-  VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
-          << i.results.real_time_used << "\n";
+  BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
+             << i.results.real_time_used << "\n";
 
   // By using KeepRunningBatch a benchmark can iterate more times than
   // requested, so take the iteration count from i.results.
@@ -239,8 +239,7 @@
   // NOTE: When the last run was at least 10% of the min time the max
   // expansion should be 14x.
   bool is_significant = (i.seconds / min_time) > 0.1;
-  multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
-  if (multiplier <= 1.0) multiplier = 2.0;
+  multiplier = is_significant ? multiplier : 10.0;
 
   // So what seems to be the sufficiently-large iteration count? Round up.
   const IterationCount max_next_iters = static_cast<IterationCount>(
@@ -249,7 +248,7 @@
   // But we do have *some* sanity limits though..
   const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
 
-  VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
+  BM_VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
   return next_iters;  // round up before conversion to integer.
 }
 
@@ -280,7 +279,9 @@
   // is *only* calculated for the *first* repetition, and other repetitions
   // simply use that precomputed iteration count.
   for (;;) {
+    b.Setup();
     i = DoNIterations();
+    b.Teardown();
 
     // Do we consider the results to be significant?
     // If we are doing repetitions, and the first repetition was already done,
@@ -303,24 +304,33 @@
   }
 
   // Oh, one last thing, we need to also produce the 'memory measurements'..
-  MemoryManager::Result memory_result;
+  MemoryManager::Result* memory_result = nullptr;
   IterationCount memory_iterations = 0;
   if (memory_manager != nullptr) {
+    // TODO(vyng): Consider making BenchmarkReporter::Run::memory_result an
+    // optional so we don't have to own the Result here.
+    // Can't do it now due to cxx03.
+    memory_results.push_back(MemoryManager::Result());
+    memory_result = &memory_results.back();
     // Only run a few iterations to reduce the impact of one-time
     // allocations in benchmarks that are not properly managed.
     memory_iterations = std::min<IterationCount>(16, iters);
     memory_manager->Start();
     std::unique_ptr<internal::ThreadManager> manager;
     manager.reset(new internal::ThreadManager(1));
+    b.Setup();
     RunInThread(&b, memory_iterations, 0, manager.get(),
                 perf_counters_measurement_ptr);
     manager->WaitForAllThreads();
     manager.reset();
+    b.Teardown();
 
-    memory_manager->Stop(&memory_result);
+    BENCHMARK_DISABLE_DEPRECATED_WARNING
+    memory_manager->Stop(memory_result);
+    BENCHMARK_RESTORE_DEPRECATED_WARNING
   }
 
-  // Ok, now actualy report.
+  // Ok, now actually report.
   BenchmarkReporter::Run report =
       CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
                       num_repetitions_done, repeats);
diff --git a/third-party/benchmark/src/check.h b/third-party/benchmark/src/check.h
--- a/third-party/benchmark/src/check.h
+++ b/third-party/benchmark/src/check.h
@@ -23,8 +23,9 @@
   std::abort();  // fallback to enforce noreturn
 }
 
-// CheckHandler is the class constructed by failing CHECK macros. CheckHandler
-// will log information about the failures and abort when it is destructed.
+// CheckHandler is the class constructed by failing BM_CHECK macros.
+// CheckHandler will log information about the failures and abort when it is
+// destructed.
 class CheckHandler {
  public:
   CheckHandler(const char* check, const char* file, const char* func, int line)
@@ -51,32 +52,32 @@
 }  // end namespace internal
 }  // end namespace benchmark
 
-// The CHECK macro returns a std::ostream object that can have extra information
-// written to it.
+// The BM_CHECK macro returns a std::ostream object that can have extra
+// information written to it.
 #ifndef NDEBUG
-#define CHECK(b)                                                             \
+#define BM_CHECK(b)                                                          \
   (b ? ::benchmark::internal::GetNullLogInstance()                           \
      : ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \
            .GetLog())
 #else
-#define CHECK(b) ::benchmark::internal::GetNullLogInstance()
+#define BM_CHECK(b) ::benchmark::internal::GetNullLogInstance()
 #endif
 
 // clang-format off
 // preserve whitespacing between operators for alignment
-#define CHECK_EQ(a, b) CHECK((a) == (b))
-#define CHECK_NE(a, b) CHECK((a) != (b))
-#define CHECK_GE(a, b) CHECK((a) >= (b))
-#define CHECK_LE(a, b) CHECK((a) <= (b))
-#define CHECK_GT(a, b) CHECK((a) > (b))
-#define CHECK_LT(a, b) CHECK((a) < (b))
-
-#define CHECK_FLOAT_EQ(a, b, eps) CHECK(std::fabs((a) - (b)) <  (eps))
-#define CHECK_FLOAT_NE(a, b, eps) CHECK(std::fabs((a) - (b)) >= (eps))
-#define CHECK_FLOAT_GE(a, b, eps) CHECK((a) - (b) > -(eps))
-#define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps))
-#define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) >  (eps))
-#define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) >  (eps))
+#define BM_CHECK_EQ(a, b) BM_CHECK((a) == (b))
+#define BM_CHECK_NE(a, b) BM_CHECK((a) != (b))
+#define BM_CHECK_GE(a, b) BM_CHECK((a) >= (b))
+#define BM_CHECK_LE(a, b) BM_CHECK((a) <= (b))
+#define BM_CHECK_GT(a, b) BM_CHECK((a) > (b))
+#define BM_CHECK_LT(a, b) BM_CHECK((a) < (b))
+
+#define BM_CHECK_FLOAT_EQ(a, b, eps) BM_CHECK(std::fabs((a) - (b)) <  (eps))
+#define BM_CHECK_FLOAT_NE(a, b, eps) BM_CHECK(std::fabs((a) - (b)) >= (eps))
+#define BM_CHECK_FLOAT_GE(a, b, eps) BM_CHECK((a) - (b) > -(eps))
+#define BM_CHECK_FLOAT_LE(a, b, eps) BM_CHECK((b) - (a) > -(eps))
+#define BM_CHECK_FLOAT_GT(a, b, eps) BM_CHECK((a) - (b) >  (eps))
+#define BM_CHECK_FLOAT_LT(a, b, eps) BM_CHECK((b) - (a) >  (eps))
 //clang-format on
 
 #endif  // CHECK_H_
diff --git a/third-party/benchmark/src/colorprint.cc b/third-party/benchmark/src/colorprint.cc
--- a/third-party/benchmark/src/colorprint.cc
+++ b/third-party/benchmark/src/colorprint.cc
@@ -25,8 +25,8 @@
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
-#include <windows.h>
 #include <io.h>
+#include <windows.h>
 #else
 #include <unistd.h>
 #endif  // BENCHMARK_OS_WINDOWS
@@ -94,7 +94,7 @@
   va_end(args_cp);
 
   // currently there is no error handling for failure, so this is hack.
-  CHECK(ret >= 0);
+  BM_CHECK(ret >= 0);
 
   if (ret == 0)  // handle empty expansion
     return {};
@@ -102,10 +102,10 @@
     return local_buff;
   else {
     // we did not provide a long enough buffer on our first attempt.
-    size = (size_t)ret + 1;  // + 1 for the null byte
+    size = static_cast<size_t>(ret) + 1;  // + 1 for the null byte
     std::unique_ptr<char[]> buff(new char[size]);
     ret = vsnprintf(buff.get(), size, msg, args);
-    CHECK(ret > 0 && ((size_t)ret) < size);
+    BM_CHECK(ret > 0 && (static_cast<size_t>(ret)) < size);
     return buff.get();
   }
 }
diff --git a/third-party/benchmark/src/commandlineflags.h b/third-party/benchmark/src/commandlineflags.h
--- a/third-party/benchmark/src/commandlineflags.h
+++ b/third-party/benchmark/src/commandlineflags.h
@@ -9,23 +9,23 @@
 #define FLAG(name) FLAGS_##name
 
 // Macros for declaring flags.
-#define DECLARE_bool(name) extern bool FLAG(name)
-#define DECLARE_int32(name) extern int32_t FLAG(name)
-#define DECLARE_double(name) extern double FLAG(name)
-#define DECLARE_string(name) extern std::string FLAG(name)
-#define DECLARE_kvpairs(name) \
+#define BM_DECLARE_bool(name) extern bool FLAG(name)
+#define BM_DECLARE_int32(name) extern int32_t FLAG(name)
+#define BM_DECLARE_double(name) extern double FLAG(name)
+#define BM_DECLARE_string(name) extern std::string FLAG(name)
+#define BM_DECLARE_kvpairs(name) \
   extern std::map<std::string, std::string> FLAG(name)
 
 // Macros for defining flags.
-#define DEFINE_bool(name, default_val) \
+#define BM_DEFINE_bool(name, default_val) \
   bool FLAG(name) = benchmark::BoolFromEnv(#name, default_val)
-#define DEFINE_int32(name, default_val) \
+#define BM_DEFINE_int32(name, default_val) \
   int32_t FLAG(name) = benchmark::Int32FromEnv(#name, default_val)
-#define DEFINE_double(name, default_val) \
+#define BM_DEFINE_double(name, default_val) \
   double FLAG(name) = benchmark::DoubleFromEnv(#name, default_val)
-#define DEFINE_string(name, default_val) \
+#define BM_DEFINE_string(name, default_val) \
   std::string FLAG(name) = benchmark::StringFromEnv(#name, default_val)
-#define DEFINE_kvpairs(name, default_val)         \
+#define BM_DEFINE_kvpairs(name, default_val)      \
   std::map<std::string, std::string> FLAG(name) = \
       benchmark::KvPairsFromEnv(#name, default_val)
 
diff --git a/third-party/benchmark/src/commandlineflags.cc b/third-party/benchmark/src/commandlineflags.cc
--- a/third-party/benchmark/src/commandlineflags.cc
+++ b/third-party/benchmark/src/commandlineflags.cc
@@ -248,9 +248,8 @@
   return true;
 }
 
-bool ParseKeyValueFlag(
-    const char* str, const char* flag,
-    std::map<std::string, std::string>* value) {
+bool ParseKeyValueFlag(const char* str, const char* flag,
+                       std::map<std::string, std::string>* value) {
   const char* const value_str = ParseFlagValue(str, flag, false);
 
   if (value_str == nullptr) return false;
diff --git a/third-party/benchmark/src/complexity.cc b/third-party/benchmark/src/complexity.cc
--- a/third-party/benchmark/src/complexity.cc
+++ b/third-party/benchmark/src/complexity.cc
@@ -15,12 +15,13 @@
 // Source project : https://github.com/ismaelJimenez/cpp.leastsq
 // Adapted to be used with google benchmark
 
-#include "benchmark/benchmark.h"
+#include "complexity.h"
 
 #include <algorithm>
 #include <cmath>
+
+#include "benchmark/benchmark.h"
 #include "check.h"
-#include "complexity.h"
 
 namespace benchmark {
 
@@ -123,10 +124,10 @@
 //                  fitting curve.
 LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
                        const std::vector<double>& time, const BigO complexity) {
-  CHECK_EQ(n.size(), time.size());
-  CHECK_GE(n.size(), 2);  // Do not compute fitting curve is less than two
-                          // benchmark runs are given
-  CHECK_NE(complexity, oNone);
+  BM_CHECK_EQ(n.size(), time.size());
+  BM_CHECK_GE(n.size(), 2);  // Do not compute fitting curve if fewer than two
+                             // benchmark runs are given
+  BM_CHECK_NE(complexity, oNone);
 
   LeastSq best_fit;
 
@@ -167,7 +168,8 @@
 
   // Populate the accumulators.
   for (const Run& run : reports) {
-    CHECK_GT(run.complexity_n, 0) << "Did you forget to call SetComplexityN?";
+    BM_CHECK_GT(run.complexity_n, 0)
+        << "Did you forget to call SetComplexityN?";
     n.push_back(run.complexity_n);
     real_time.push_back(run.real_accumulated_time / run.iterations);
     cpu_time.push_back(run.cpu_accumulated_time / run.iterations);
@@ -198,6 +200,7 @@
   big_o.repetition_index = Run::no_repetition_index;
   big_o.threads = reports[0].threads;
   big_o.aggregate_name = "BigO";
+  big_o.aggregate_unit = StatisticUnit::kTime;
   big_o.report_label = reports[0].report_label;
   big_o.iterations = 0;
   big_o.real_accumulated_time = result_real.coef;
@@ -219,6 +222,7 @@
   rms.per_family_instance_index = reports[0].per_family_instance_index;
   rms.run_type = BenchmarkReporter::Run::RT_Aggregate;
   rms.aggregate_name = "RMS";
+  rms.aggregate_unit = StatisticUnit::kPercentage;
   rms.report_label = big_o.report_label;
   rms.iterations = 0;
   rms.repetition_index = Run::no_repetition_index;
diff --git a/third-party/benchmark/src/console_reporter.cc b/third-party/benchmark/src/console_reporter.cc
--- a/third-party/benchmark/src/console_reporter.cc
+++ b/third-party/benchmark/src/console_reporter.cc
@@ -45,7 +45,7 @@
     GetErrorStream()
         << "Color printing is only supported for stdout on windows."
            " Disabling color printing\n";
-    output_options_ = static_cast< OutputOptions >(output_options_ & ~OO_Color);
+    output_options_ = static_cast<OutputOptions>(output_options_ & ~OO_Color);
   }
 #endif
 
@@ -53,11 +53,12 @@
 }
 
 void ConsoleReporter::PrintHeader(const Run& run) {
-  std::string str = FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
-                                 "Benchmark", "Time", "CPU", "Iterations");
-  if(!run.counters.empty()) {
-    if(output_options_ & OO_Tabular) {
-      for(auto const& c : run.counters) {
+  std::string str =
+      FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
+                   "Benchmark", "Time", "CPU", "Iterations");
+  if (!run.counters.empty()) {
+    if (output_options_ & OO_Tabular) {
+      for (auto const& c : run.counters) {
         str += FormatString(" %10s", c.first.c_str());
       }
     } else {
@@ -97,7 +98,6 @@
   va_end(args);
 }
 
-
 static std::string FormatTime(double time) {
   // Align decimal places...
   if (time < 1.0) {
@@ -115,8 +115,9 @@
 void ConsoleReporter::PrintRunData(const Run& result) {
   typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...);
   auto& Out = GetOutputStream();
-  PrinterFn* printer = (output_options_ & OO_Color) ?
-                         (PrinterFn*)ColorPrintf : IgnoreColorPrint;
+  PrinterFn* printer = (output_options_ & OO_Color)
+                           ? static_cast<PrinterFn*>(ColorPrintf)
+                           : IgnoreColorPrint;
   auto name_color =
       (result.report_big_o || result.report_rms) ? COLOR_BLUE : COLOR_GREEN;
   printer(Out, name_color, "%-*s ", name_field_width_,
@@ -134,18 +135,23 @@
   const std::string real_time_str = FormatTime(real_time);
   const std::string cpu_time_str = FormatTime(cpu_time);
 
-
   if (result.report_big_o) {
     std::string big_o = GetBigOString(result.complexity);
-    printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time, big_o.c_str(),
-            cpu_time, big_o.c_str());
+    printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time,
+            big_o.c_str(), cpu_time, big_o.c_str());
   } else if (result.report_rms) {
     printer(Out, COLOR_YELLOW, "%10.0f %-4s %10.0f %-4s ", real_time * 100, "%",
             cpu_time * 100, "%");
-  } else {
+  } else if (result.run_type != Run::RT_Aggregate ||
+             result.aggregate_unit == StatisticUnit::kTime) {
     const char* timeLabel = GetTimeUnitString(result.time_unit);
-    printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(), timeLabel,
-            cpu_time_str.c_str(), timeLabel);
+    printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(),
+            timeLabel, cpu_time_str.c_str(), timeLabel);
+  } else {
+    assert(result.aggregate_unit == StatisticUnit::kPercentage);
+    printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ",
+            (100. * result.real_accumulated_time), "%",
+            (100. * result.cpu_accumulated_time), "%");
   }
 
   if (!result.report_big_o && !result.report_rms) {
@@ -153,12 +159,19 @@
   }
 
   for (auto& c : result.counters) {
-    const std::size_t cNameLen = std::max(std::string::size_type(10),
-                                          c.first.length());
-    auto const& s = HumanReadableNumber(c.second.value, c.second.oneK);
+    const std::size_t cNameLen =
+        std::max(std::string::size_type(10), c.first.length());
+    std::string s;
     const char* unit = "";
-    if (c.second.flags & Counter::kIsRate)
-      unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
+    if (result.run_type == Run::RT_Aggregate &&
+        result.aggregate_unit == StatisticUnit::kPercentage) {
+      s = StrFormat("%.2f", 100. * c.second.value);
+      unit = "%";
+    } else {
+      s = HumanReadableNumber(c.second.value, c.second.oneK);
+      if (c.second.flags & Counter::kIsRate)
+        unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
+    }
     if (output_options_ & OO_Tabular) {
       printer(Out, COLOR_DEFAULT, " %*s%s", cNameLen - strlen(unit), s.c_str(),
               unit);
diff --git a/third-party/benchmark/src/csv_reporter.cc b/third-party/benchmark/src/csv_reporter.cc
--- a/third-party/benchmark/src/csv_reporter.cc
+++ b/third-party/benchmark/src/csv_reporter.cc
@@ -12,9 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "benchmark/benchmark.h"
-#include "complexity.h"
-
 #include <algorithm>
 #include <cstdint>
 #include <iostream>
@@ -22,7 +19,9 @@
 #include <tuple>
 #include <vector>
 
+#include "benchmark/benchmark.h"
 #include "check.h"
+#include "complexity.h"
 #include "string_util.h"
 #include "timers.h"
 
@@ -37,13 +36,17 @@
     "error_occurred", "error_message"};
 }  // namespace
 
-std::string CsvEscape(const std::string & s) {
+std::string CsvEscape(const std::string& s) {
   std::string tmp;
   tmp.reserve(s.size() + 2);
   for (char c : s) {
     switch (c) {
-    case '"' : tmp += "\"\""; break;
-    default  : tmp += c; break;
+      case '"':
+        tmp += "\"\"";
+        break;
+      default:
+        tmp += c;
+        break;
     }
   }
   return '"' + tmp + '"';
@@ -85,7 +88,8 @@
       for (const auto& cnt : run.counters) {
         if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
           continue;
-        CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end())
+        BM_CHECK(user_counter_names_.find(cnt.first) !=
+                 user_counter_names_.end())
             << "All counters must be present in each run. "
             << "Counter named \"" << cnt.first
             << "\" was not in a run after being added to the header";
diff --git a/third-party/benchmark/src/cycleclock.h b/third-party/benchmark/src/cycleclock.h
--- a/third-party/benchmark/src/cycleclock.h
+++ b/third-party/benchmark/src/cycleclock.h
@@ -115,7 +115,7 @@
   // the code is being compiled with a non-ancient compiler.
   _asm rdtsc
 #elif defined(COMPILER_MSVC) && defined(_M_ARM64)
-  // See https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics?view=vs-2019
+  // See https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics
   // and https://reviews.llvm.org/D53115
   int64_t virtual_timer_value;
   virtual_timer_value = _ReadStatusReg(ARM64_CNTVCT);
@@ -187,7 +187,7 @@
   asm("stck %0" : "=Q"(tsc) : : "cc");
 #endif
   return tsc;
-#elif defined(__riscv) // RISC-V
+#elif defined(__riscv)  // RISC-V
   // Use RDCYCLE (and RDCYCLEH on riscv32)
 #if __riscv_xlen == 32
   uint32_t cycles_lo, cycles_hi0, cycles_hi1;
diff --git a/third-party/benchmark/src/json_reporter.cc b/third-party/benchmark/src/json_reporter.cc
--- a/third-party/benchmark/src/json_reporter.cc
+++ b/third-party/benchmark/src/json_reporter.cc
@@ -12,9 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "benchmark/benchmark.h"
-#include "complexity.h"
-
 #include <algorithm>
 #include <cmath>
 #include <cstdint>
@@ -25,6 +22,8 @@
 #include <tuple>
 #include <vector>
 
+#include "benchmark/benchmark.h"
+#include "complexity.h"
 #include "string_util.h"
 #include "timers.h"
 
@@ -35,34 +34,53 @@
 
 namespace {
 
-std::string StrEscape(const std::string & s) {
+std::string StrEscape(const std::string& s) {
   std::string tmp;
   tmp.reserve(s.size());
   for (char c : s) {
     switch (c) {
-    case '\b': tmp += "\\b"; break;
-    case '\f': tmp += "\\f"; break;
-    case '\n': tmp += "\\n"; break;
-    case '\r': tmp += "\\r"; break;
-    case '\t': tmp += "\\t"; break;
-    case '\\': tmp += "\\\\"; break;
-    case '"' : tmp += "\\\""; break;
-    default  : tmp += c; break;
+      case '\b':
+        tmp += "\\b";
+        break;
+      case '\f':
+        tmp += "\\f";
+        break;
+      case '\n':
+        tmp += "\\n";
+        break;
+      case '\r':
+        tmp += "\\r";
+        break;
+      case '\t':
+        tmp += "\\t";
+        break;
+      case '\\':
+        tmp += "\\\\";
+        break;
+      case '"':
+        tmp += "\\\"";
+        break;
+      default:
+        tmp += c;
+        break;
     }
   }
   return tmp;
 }
 
 std::string FormatKV(std::string const& key, std::string const& value) {
-  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
+                   StrEscape(value).c_str());
 }
 
 std::string FormatKV(std::string const& key, const char* value) {
-  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
+                   StrEscape(value).c_str());
 }
 
 std::string FormatKV(std::string const& key, bool value) {
-  return StrFormat("\"%s\": %s", StrEscape(key).c_str(), value ? "true" : "false");
+  return StrFormat("\"%s\": %s", StrEscape(key).c_str(),
+                   value ? "true" : "false");
 }
 
 std::string FormatKV(std::string const& key, int64_t value) {
@@ -126,7 +144,9 @@
                   RoundDouble(info.cycles_per_second / 1000000.0))
       << ",\n";
   if (CPUInfo::Scaling::UNKNOWN != info.scaling) {
-    out << indent << FormatKV("cpu_scaling_enabled", info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
+    out << indent
+        << FormatKV("cpu_scaling_enabled",
+                    info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
         << ",\n";
   }
 
@@ -139,8 +159,8 @@
     out << cache_indent << FormatKV("type", CI.type) << ",\n";
     out << cache_indent << FormatKV("level", static_cast<int64_t>(CI.level))
         << ",\n";
-    out << cache_indent
-        << FormatKV("size", static_cast<int64_t>(CI.size)) << ",\n";
+    out << cache_indent << FormatKV("size", static_cast<int64_t>(CI.size))
+        << ",\n";
     out << cache_indent
         << FormatKV("num_sharing", static_cast<int64_t>(CI.num_sharing))
         << "\n";
@@ -162,13 +182,15 @@
 #else
   const char build_type[] = "debug";
 #endif
-  out << indent << FormatKV("library_build_type", build_type) << "\n";
+  out << indent << FormatKV("library_build_type", build_type);
 
   if (internal::global_context != nullptr) {
-    for (const auto& kv: *internal::global_context) {
-      out << indent << FormatKV(kv.first, kv.second) << "\n";
+    for (const auto& kv : *internal::global_context) {
+      out << ",\n";
+      out << indent << FormatKV(kv.first, kv.second);
     }
   }
+  out << "\n";
 
   // Close context block and open the list of benchmarks.
   out << inner_indent << "},\n";
@@ -229,6 +251,15 @@
   out << indent << FormatKV("threads", run.threads) << ",\n";
   if (run.run_type == BenchmarkReporter::Run::RT_Aggregate) {
     out << indent << FormatKV("aggregate_name", run.aggregate_name) << ",\n";
+    out << indent << FormatKV("aggregate_unit", [&run]() -> const char* {
+      switch (run.aggregate_unit) {
+        case StatisticUnit::kTime:
+          return "time";
+        case StatisticUnit::kPercentage:
+          return "percentage";
+      }
+      BENCHMARK_UNREACHABLE();
+    }()) << ",\n";
   }
   if (run.error_occurred) {
     out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n";
@@ -236,8 +267,17 @@
   }
   if (!run.report_big_o && !run.report_rms) {
     out << indent << FormatKV("iterations", run.iterations) << ",\n";
-    out << indent << FormatKV("real_time", run.GetAdjustedRealTime()) << ",\n";
-    out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
+    if (run.run_type != Run::RT_Aggregate ||
+        run.aggregate_unit == StatisticUnit::kTime) {
+      out << indent << FormatKV("real_time", run.GetAdjustedRealTime())
+          << ",\n";
+      out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
+    } else {
+      assert(run.aggregate_unit == StatisticUnit::kPercentage);
+      out << indent << FormatKV("real_time", run.real_accumulated_time)
+          << ",\n";
+      out << indent << FormatKV("cpu_time", run.cpu_accumulated_time);
+    }
     out << ",\n"
         << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
   } else if (run.report_big_o) {
@@ -255,9 +295,20 @@
     out << ",\n" << indent << FormatKV(c.first, c.second);
   }
 
-  if (run.has_memory_result) {
+  if (run.memory_result) {
+    const MemoryManager::Result memory_result = *run.memory_result;
     out << ",\n" << indent << FormatKV("allocs_per_iter", run.allocs_per_iter);
-    out << ",\n" << indent << FormatKV("max_bytes_used", run.max_bytes_used);
+    out << ",\n"
+        << indent << FormatKV("max_bytes_used", memory_result.max_bytes_used);
+
+    auto report_if_present = [&out, &indent](const char* label, int64_t val) {
+      if (val != MemoryManager::TombstoneValue)
+        out << ",\n" << indent << FormatKV(label, val);
+    };
+
+    report_if_present("total_allocated_bytes",
+                      memory_result.total_allocated_bytes);
+    report_if_present("net_heap_growth", memory_result.net_heap_growth);
   }
 
   if (!run.report_label.empty()) {
@@ -266,4 +317,7 @@
   out << '\n';
 }
 
+const int64_t MemoryManager::TombstoneValue =
+    std::numeric_limits<int64_t>::max();
+
 }  // end namespace benchmark
diff --git a/third-party/benchmark/src/log.h b/third-party/benchmark/src/log.h
--- a/third-party/benchmark/src/log.h
+++ b/third-party/benchmark/src/log.h
@@ -67,7 +67,7 @@
 }  // end namespace benchmark
 
 // clang-format off
-#define VLOG(x)                                                               \
+#define BM_VLOG(x)                                                               \
   (::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "):" \
                                                                          " ")
 // clang-format on
diff --git a/third-party/benchmark/src/mutex.h b/third-party/benchmark/src/mutex.h
--- a/third-party/benchmark/src/mutex.h
+++ b/third-party/benchmark/src/mutex.h
@@ -130,7 +130,7 @@
   // entered the barrier.  Returns iff this is the last thread to
   // enter the barrier.
   bool createBarrier(MutexLock& ml) REQUIRES(lock_) {
-    CHECK_LT(entered_, running_threads_);
+    BM_CHECK_LT(entered_, running_threads_);
     entered_++;
     if (entered_ < running_threads_) {
       // Wait for all threads to enter
diff --git a/third-party/benchmark/src/perf_counters.h b/third-party/benchmark/src/perf_counters.h
--- a/third-party/benchmark/src/perf_counters.h
+++ b/third-party/benchmark/src/perf_counters.h
@@ -42,7 +42,7 @@
 class PerfCounterValues {
  public:
   explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) {
-    CHECK_LE(nr_counters_, kMaxCounters);
+    BM_CHECK_LE(nr_counters_, kMaxCounters);
   }
 
   uint64_t operator[](size_t pos) const { return values_[kPadding + pos]; }
diff --git a/third-party/benchmark/src/perf_counters.cc b/third-party/benchmark/src/perf_counters.cc
--- a/third-party/benchmark/src/perf_counters.cc
+++ b/third-party/benchmark/src/perf_counters.cc
@@ -49,7 +49,7 @@
   const int mode = PFM_PLM3;  // user mode only
   for (size_t i = 0; i < counter_names.size(); ++i) {
     const bool is_first = i == 0;
-    struct perf_event_attr attr{};
+    struct perf_event_attr attr {};
     attr.size = sizeof(attr);
     const int group_id = !is_first ? counter_ids[0] : -1;
     const auto& name = counter_names[i];
diff --git a/third-party/benchmark/src/re.h b/third-party/benchmark/src/re.h
--- a/third-party/benchmark/src/re.h
+++ b/third-party/benchmark/src/re.h
@@ -126,7 +126,7 @@
 
       // regerror returns the number of bytes necessary to null terminate
       // the string, so we move that when assigning to error.
-      CHECK_NE(needed, 0);
+      BM_CHECK_NE(needed, 0);
       error->assign(errbuf, needed - 1);
 
       delete[] errbuf;
diff --git a/third-party/benchmark/src/reporter.cc b/third-party/benchmark/src/reporter.cc
--- a/third-party/benchmark/src/reporter.cc
+++ b/third-party/benchmark/src/reporter.cc
@@ -12,23 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "benchmark/benchmark.h"
-#include "timers.h"
-
 #include <cstdlib>
-
 #include <iostream>
 #include <map>
 #include <string>
 #include <tuple>
 #include <vector>
 
+#include "benchmark/benchmark.h"
 #include "check.h"
 #include "string_util.h"
+#include "timers.h"
 
 namespace benchmark {
 namespace internal {
-extern std::map<std::string, std::string>* global_context;
+extern std::map<std::string, std::string> *global_context;
 }
 
 BenchmarkReporter::BenchmarkReporter()
@@ -38,7 +36,7 @@
 
 void BenchmarkReporter::PrintBasicContext(std::ostream *out,
                                           Context const &context) {
-  CHECK(out) << "cannot be null";
+  BM_CHECK(out) << "cannot be null";
   auto &Out = *out;
 
   Out << LocalDateTimeString() << "\n";
@@ -70,7 +68,7 @@
   }
 
   if (internal::global_context != nullptr) {
-    for (const auto& kv: *internal::global_context) {
+    for (const auto &kv : *internal::global_context) {
       Out << kv.first << ": " << kv.second << "\n";
     }
   }
diff --git a/third-party/benchmark/src/sleep.cc b/third-party/benchmark/src/sleep.cc
--- a/third-party/benchmark/src/sleep.cc
+++ b/third-party/benchmark/src/sleep.cc
@@ -35,7 +35,7 @@
 void SleepForSeconds(double seconds) {
   SleepForMilliseconds(static_cast<int>(kNumMillisPerSecond * seconds));
 }
-#else   // BENCHMARK_OS_WINDOWS
+#else  // BENCHMARK_OS_WINDOWS
 void SleepForMicroseconds(int microseconds) {
 #ifdef BENCHMARK_OS_ZOS
   // z/OS does not support nanosleep. Instead call sleep() and then usleep() to
@@ -43,8 +43,7 @@
   // argument is greater than 1000000.
   div_t sleepTime = div(microseconds, kNumMicrosPerSecond);
   int seconds = sleepTime.quot;
-  while (seconds != 0)
-    seconds = sleep(seconds);
+  while (seconds != 0) seconds = sleep(seconds);
   while (usleep(sleepTime.rem) == -1 && errno == EINTR)
     ;
 #else
diff --git a/third-party/benchmark/src/statistics.h b/third-party/benchmark/src/statistics.h
--- a/third-party/benchmark/src/statistics.h
+++ b/third-party/benchmark/src/statistics.h
@@ -31,6 +31,7 @@
 double StatisticsMean(const std::vector<double>& v);
 double StatisticsMedian(const std::vector<double>& v);
 double StatisticsStdDev(const std::vector<double>& v);
+double StatisticsCV(const std::vector<double>& v);
 
 }  // end namespace benchmark
 
diff --git a/third-party/benchmark/src/statistics.cc b/third-party/benchmark/src/statistics.cc
--- a/third-party/benchmark/src/statistics.cc
+++ b/third-party/benchmark/src/statistics.cc
@@ -13,15 +13,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "benchmark/benchmark.h"
+#include "statistics.h"
 
 #include <algorithm>
 #include <cmath>
 #include <numeric>
 #include <string>
 #include <vector>
+
+#include "benchmark/benchmark.h"
 #include "check.h"
-#include "statistics.h"
 
 namespace benchmark {
 
@@ -74,6 +75,15 @@
   return Sqrt(v.size() / (v.size() - 1.0) * (avg_squares - Sqr(mean)));
 }
 
+double StatisticsCV(const std::vector<double>& v) {
+  if (v.size() < 2) return 0.0;
+
+  const auto stddev = StatisticsStdDev(v);
+  const auto mean = StatisticsMean(v);
+
+  return stddev / mean;
+}
+
 std::vector<BenchmarkReporter::Run> ComputeStats(
     const std::vector<BenchmarkReporter::Run>& reports) {
   typedef BenchmarkReporter::Run Run;
@@ -112,22 +122,22 @@
         it = counter_stats.find(cnt.first);
         it->second.s.reserve(reports.size());
       } else {
-        CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
+        BM_CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
       }
     }
   }
 
   // Populate the accumulators.
   for (Run const& run : reports) {
-    CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
-    CHECK_EQ(run_iterations, run.iterations);
+    BM_CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
+    BM_CHECK_EQ(run_iterations, run.iterations);
     if (run.error_occurred) continue;
     real_accumulated_time_stat.emplace_back(run.real_accumulated_time);
     cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time);
     // user counters
     for (auto const& cnt : run.counters) {
       auto it = counter_stats.find(cnt.first);
-      CHECK_NE(it, counter_stats.end());
+      BM_CHECK_NE(it, counter_stats.end());
       it->second.s.emplace_back(cnt.second);
     }
   }
@@ -155,6 +165,7 @@
     data.repetitions = reports[0].repetitions;
     data.repetition_index = Run::no_repetition_index;
     data.aggregate_name = Stat.name_;
+    data.aggregate_unit = Stat.unit_;
     data.report_label = report_label;
 
     // It is incorrect to say that an aggregate is computed over
@@ -167,13 +178,15 @@
     data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat);
     data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat);
 
-    // We will divide these times by data.iterations when reporting, but the
-    // data.iterations is not nessesairly the scale of these measurements,
-    // because in each repetition, these timers are sum over all the iterations.
-    // And if we want to say that the stats are over N repetitions and not
-    // M iterations, we need to multiply these by (N/M).
-    data.real_accumulated_time *= iteration_rescale_factor;
-    data.cpu_accumulated_time *= iteration_rescale_factor;
+    if (data.aggregate_unit == StatisticUnit::kTime) {
+      // We will divide these times by data.iterations when reporting, but the
+      // data.iterations is not necessarily the scale of these measurements,
+      // because in each repetition, these timers are sum over all the iters.
+      // And if we want to say that the stats are over N repetitions and not
+      // M iterations, we need to multiply these by (N/M).
+      data.real_accumulated_time *= iteration_rescale_factor;
+      data.cpu_accumulated_time *= iteration_rescale_factor;
+    }
 
     data.time_unit = reports[0].time_unit;
 
diff --git a/third-party/benchmark/src/string_util.h b/third-party/benchmark/src/string_util.h
--- a/third-party/benchmark/src/string_util.h
+++ b/third-party/benchmark/src/string_util.h
@@ -4,6 +4,7 @@
 #include <sstream>
 #include <string>
 #include <utility>
+
 #include "internal_macros.h"
 
 namespace benchmark {
@@ -39,6 +40,8 @@
 
 std::vector<std::string> StrSplit(const std::string& str, char delim);
 
+// Disable lint checking for this block since it re-implements C functions.
+// NOLINTBEGIN
 #ifdef BENCHMARK_STL_ANDROID_GNUSTL
 /*
  * GNU STL in Android NDK lacks support for some C++11 functions, including
@@ -47,14 +50,15 @@
  * namespace, not std:: namespace.
  */
 unsigned long stoul(const std::string& str, size_t* pos = nullptr,
-                           int base = 10);
+                    int base = 10);
 int stoi(const std::string& str, size_t* pos = nullptr, int base = 10);
 double stod(const std::string& str, size_t* pos = nullptr);
 #else
-using std::stoul;
-using std::stoi;
-using std::stod;
+using std::stod;   // NOLINT(misc-unused-using-decls)
+using std::stoi;   // NOLINT(misc-unused-using-decls)
+using std::stoul;  // NOLINT(misc-unused-using-decls)
 #endif
+// NOLINTEND
 
 }  // end namespace benchmark
 
diff --git a/third-party/benchmark/src/string_util.cc b/third-party/benchmark/src/string_util.cc
--- a/third-party/benchmark/src/string_util.cc
+++ b/third-party/benchmark/src/string_util.cc
@@ -151,7 +151,7 @@
   auto buff_ptr = std::unique_ptr<char[]>(new char[size]);
   // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation
   // in the android-ndk
-  ret = vsnprintf(buff_ptr.get(), size, msg, args);
+  vsnprintf(buff_ptr.get(), size, msg, args);
   return std::string(buff_ptr.get());
 }
 
@@ -198,11 +198,10 @@
 
   /* Check for errors and return */
   if (strtoulErrno == ERANGE) {
-    throw std::out_of_range(
-      "stoul failed: " + str + " is outside of range of unsigned long");
+    throw std::out_of_range("stoul failed: " + str +
+                            " is outside of range of unsigned long");
   } else if (strEnd == strStart || strtoulErrno != 0) {
-    throw std::invalid_argument(
-      "stoul failed: " + str + " is not an integer");
+    throw std::invalid_argument("stoul failed: " + str + " is not an integer");
   }
   if (pos != nullptr) {
     *pos = static_cast<size_t>(strEnd - strStart);
@@ -225,11 +224,10 @@
 
   /* Check for errors and return */
   if (strtolErrno == ERANGE || long(int(result)) != result) {
-    throw std::out_of_range(
-      "stoul failed: " + str + " is outside of range of int");
+    throw std::out_of_range("stoul failed: " + str +
+                            " is outside of range of int");
   } else if (strEnd == strStart || strtolErrno != 0) {
-    throw std::invalid_argument(
-      "stoul failed: " + str + " is not an integer");
+    throw std::invalid_argument("stoul failed: " + str + " is not an integer");
   }
   if (pos != nullptr) {
     *pos = static_cast<size_t>(strEnd - strStart);
@@ -252,11 +250,10 @@
 
   /* Check for errors and return */
   if (strtodErrno == ERANGE) {
-    throw std::out_of_range(
-      "stoul failed: " + str + " is outside of range of int");
+    throw std::out_of_range("stoul failed: " + str +
+                            " is outside of range of int");
   } else if (strEnd == strStart || strtodErrno != 0) {
-    throw std::invalid_argument(
-      "stoul failed: " + str + " is not an integer");
+    throw std::invalid_argument("stoul failed: " + str + " is not an integer");
   }
   if (pos != nullptr) {
     *pos = static_cast<size_t>(strEnd - strStart);
diff --git a/third-party/benchmark/src/sysinfo.cc b/third-party/benchmark/src/sysinfo.cc
--- a/third-party/benchmark/src/sysinfo.cc
+++ b/third-party/benchmark/src/sysinfo.cc
@@ -19,6 +19,7 @@
 #undef StrCat  // Don't let StrCat in string_util.h be renamed to lstrcatA
 #include <versionhelpers.h>
 #include <windows.h>
+
 #include <codecvt>
 #else
 #include <fcntl.h>
@@ -55,9 +56,9 @@
 #include <iostream>
 #include <iterator>
 #include <limits>
+#include <locale>
 #include <memory>
 #include <sstream>
-#include <locale>
 #include <utility>
 
 #include "check.h"
@@ -135,7 +136,7 @@
   template <class T, int N>
   std::array<T, N> GetAsArray() {
     const int ArrSize = sizeof(T) * N;
-    CHECK_LE(ArrSize, Size);
+    BM_CHECK_LE(ArrSize, Size);
     std::array<T, N> Arr;
     std::memcpy(Arr.data(), data(), ArrSize);
     return Arr;
@@ -147,7 +148,7 @@
   int mib[2];
 
   mib[0] = CTL_HW;
-  if ((Name == "hw.ncpu") || (Name == "hw.cpuspeed")){
+  if ((Name == "hw.ncpu") || (Name == "hw.cpuspeed")) {
     ValueUnion buff(sizeof(int));
 
     if (Name == "hw.ncpu") {
@@ -214,10 +215,9 @@
 CPUInfo::Scaling CpuScaling(int num_cpus) {
   // We don't have a valid CPU count, so don't even bother.
   if (num_cpus <= 0) return CPUInfo::Scaling::UNKNOWN;
-#ifdef BENCHMARK_OS_QNX
+#if defined(BENCHMARK_OS_QNX)
   return CPUInfo::Scaling::UNKNOWN;
-#endif
-#ifndef BENCHMARK_OS_WINDOWS
+#elif !defined(BENCHMARK_OS_WINDOWS)
   // On Linux, the CPUfreq subsystem exposes CPU information as files on the
   // local file system. If reading the exported files fails, then we may not be
   // running on Linux, so we silently ignore all the read errors.
@@ -225,11 +225,13 @@
   for (int cpu = 0; cpu < num_cpus; ++cpu) {
     std::string governor_file =
         StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor");
-    if (ReadFromFile(governor_file, &res) && res != "performance") return CPUInfo::Scaling::ENABLED;
+    if (ReadFromFile(governor_file, &res) && res != "performance")
+      return CPUInfo::Scaling::ENABLED;
   }
   return CPUInfo::Scaling::DISABLED;
-#endif
+#else
   return CPUInfo::Scaling::UNKNOWN;
+#endif
 }
 
 int CountSetBitsInCPUMap(std::string Val) {
@@ -366,29 +368,29 @@
 #elif BENCHMARK_OS_QNX
 std::vector<CPUInfo::CacheInfo> GetCacheSizesQNX() {
   std::vector<CPUInfo::CacheInfo> res;
-  struct cacheattr_entry *cache = SYSPAGE_ENTRY(cacheattr);
+  struct cacheattr_entry* cache = SYSPAGE_ENTRY(cacheattr);
   uint32_t const elsize = SYSPAGE_ELEMENT_SIZE(cacheattr);
-  int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize ;
-  for(int i = 0; i < num; ++i ) {
+  int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize;
+  for (int i = 0; i < num; ++i) {
     CPUInfo::CacheInfo info;
-    switch (cache->flags){
-      case CACHE_FLAG_INSTR :
+    switch (cache->flags) {
+      case CACHE_FLAG_INSTR:
         info.type = "Instruction";
         info.level = 1;
         break;
-      case CACHE_FLAG_DATA :
+      case CACHE_FLAG_DATA:
         info.type = "Data";
         info.level = 1;
         break;
-      case CACHE_FLAG_UNIFIED :
+      case CACHE_FLAG_UNIFIED:
         info.type = "Unified";
         info.level = 2;
         break;
-      case CACHE_FLAG_SHARED :
+      case CACHE_FLAG_SHARED:
         info.type = "Shared";
         info.level = 3;
         break;
-      default :
+      default:
         continue;
         break;
     }
@@ -416,24 +418,23 @@
 std::string GetSystemName() {
 #if defined(BENCHMARK_OS_WINDOWS)
   std::string str;
-  const unsigned COUNT = MAX_COMPUTERNAME_LENGTH+1;
-  TCHAR  hostname[COUNT] = {'\0'};
+  const unsigned COUNT = MAX_COMPUTERNAME_LENGTH + 1;
+  TCHAR hostname[COUNT] = {'\0'};
   DWORD DWCOUNT = COUNT;
-  if (!GetComputerName(hostname, &DWCOUNT))
-    return std::string("");
+  if (!GetComputerName(hostname, &DWCOUNT)) return std::string("");
 #ifndef UNICODE
   str = std::string(hostname, DWCOUNT);
 #else
-  //Using wstring_convert, Is deprecated in C++17
+  // Using wstring_convert, Is deprecated in C++17
   using convert_type = std::codecvt_utf8<wchar_t>;
   std::wstring_convert<convert_type, wchar_t> converter;
   std::wstring wStr(hostname, DWCOUNT);
   str = converter.to_bytes(wStr);
 #endif
   return str;
-#else // defined(BENCHMARK_OS_WINDOWS)
+#else  // defined(BENCHMARK_OS_WINDOWS)
 #ifndef HOST_NAME_MAX
-#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac Doesnt have HOST_NAME_MAX defined
+#ifdef BENCHMARK_HAS_SYSCTL  // BSD/Mac Doesnt have HOST_NAME_MAX defined
 #define HOST_NAME_MAX 64
 #elif defined(BENCHMARK_OS_NACL)
 #define HOST_NAME_MAX 64
@@ -442,15 +443,15 @@
 #elif defined(BENCHMARK_OS_RTEMS)
 #define HOST_NAME_MAX 256
 #else
-#warning "HOST_NAME_MAX not defined. using 64"
+#pragma message("HOST_NAME_MAX not defined. using 64")
 #define HOST_NAME_MAX 64
 #endif
-#endif // def HOST_NAME_MAX
+#endif  // def HOST_NAME_MAX
   char hostname[HOST_NAME_MAX];
   int retVal = gethostname(hostname, HOST_NAME_MAX);
   if (retVal != 0) return std::string("");
   return std::string(hostname);
-#endif // Catch-all POSIX block.
+#endif  // Catch-all POSIX block.
 }
 
 int GetNumCPUs() {
@@ -472,8 +473,7 @@
   // Returns -1 in case of a failure.
   int NumCPU = sysconf(_SC_NPROCESSORS_ONLN);
   if (NumCPU < 0) {
-    fprintf(stderr,
-            "sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n",
+    fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n",
             strerror(errno));
   }
   return NumCPU;
@@ -496,7 +496,8 @@
 #if defined(__s390__)
     // s390 has another format in /proc/cpuinfo
     // it needs to be parsed differently
-    if (SplitIdx != std::string::npos) value = ln.substr(Key.size()+1,SplitIdx-Key.size()-1);
+    if (SplitIdx != std::string::npos)
+      value = ln.substr(Key.size() + 1, SplitIdx - Key.size() - 1);
 #else
     if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1);
 #endif
@@ -543,7 +544,7 @@
   // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as
   // well.
   if (ReadFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)
-      // If CPU scaling is disabled, use the the *current* frequency.
+      // If CPU scaling is disabled, use the *current* frequency.
       // Note that we specifically don't want to read cpuinfo_cur_freq,
       // because it is only readable by root.
       || (scaling == CPUInfo::Scaling::DISABLED &&
@@ -642,13 +643,13 @@
                       "~MHz", nullptr, &data, &data_size)))
     return static_cast<double>((int64_t)data *
                                (int64_t)(1000 * 1000));  // was mhz
-#elif defined (BENCHMARK_OS_SOLARIS)
-  kstat_ctl_t *kc = kstat_open();
+#elif defined(BENCHMARK_OS_SOLARIS)
+  kstat_ctl_t* kc = kstat_open();
   if (!kc) {
     std::cerr << "failed to open /dev/kstat\n";
     return -1;
   }
-  kstat_t *ksp = kstat_lookup(kc, (char*)"cpu_info", -1, (char*)"cpu_info0");
+  kstat_t* ksp = kstat_lookup(kc, (char*)"cpu_info", -1, (char*)"cpu_info0");
   if (!ksp) {
     std::cerr << "failed to lookup in /dev/kstat\n";
     return -1;
@@ -657,7 +658,7 @@
     std::cerr << "failed to read from /dev/kstat\n";
     return -1;
   }
-  kstat_named_t *knp =
+  kstat_named_t* knp =
       (kstat_named_t*)kstat_data_lookup(ksp, (char*)"current_clock_Hz");
   if (!knp) {
     std::cerr << "failed to lookup data in /dev/kstat\n";
@@ -671,7 +672,7 @@
   double clock_hz = knp->value.ui64;
   kstat_close(kc);
   return clock_hz;
-#elif defined (BENCHMARK_OS_QNX)
+#elif defined(BENCHMARK_OS_QNX)
   return static_cast<double>((int64_t)(SYSPAGE_ENTRY(cpuinfo)->speed) *
                              (int64_t)(1000 * 1000));
 #endif
diff --git a/third-party/benchmark/src/thread_manager.h b/third-party/benchmark/src/thread_manager.h
--- a/third-party/benchmark/src/thread_manager.h
+++ b/third-party/benchmark/src/thread_manager.h
@@ -36,7 +36,6 @@
                         [this]() { return alive_threads_ == 0; });
   }
 
- public:
   struct Result {
     IterationCount iterations = 0;
     double real_time_used = 0;
diff --git a/third-party/benchmark/src/thread_timer.h b/third-party/benchmark/src/thread_timer.h
--- a/third-party/benchmark/src/thread_timer.h
+++ b/third-party/benchmark/src/thread_timer.h
@@ -28,7 +28,7 @@
 
   // Called by each thread
   void StopTimer() {
-    CHECK(running_);
+    BM_CHECK(running_);
     running_ = false;
     real_time_used_ += ChronoClockNow() - start_real_time_;
     // Floating point error can result in the subtraction producing a negative
@@ -44,19 +44,19 @@
 
   // REQUIRES: timer is not running
   double real_time_used() const {
-    CHECK(!running_);
+    BM_CHECK(!running_);
     return real_time_used_;
   }
 
   // REQUIRES: timer is not running
   double cpu_time_used() const {
-    CHECK(!running_);
+    BM_CHECK(!running_);
     return cpu_time_used_;
   }
 
   // REQUIRES: timer is not running
   double manual_time_used() const {
-    CHECK(!running_);
+    BM_CHECK(!running_);
     return manual_time_used_;
   }
 
diff --git a/third-party/benchmark/src/timers.cc b/third-party/benchmark/src/timers.cc
--- a/third-party/benchmark/src/timers.cc
+++ b/third-party/benchmark/src/timers.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "timers.h"
+
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
@@ -125,8 +126,8 @@
   // syncronous system calls in Emscripten.
   return emscripten_get_now() * 1e-3;
 #elif defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
-  // FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
-  // https://github.com/google/benchmark/pull/292
+  // FIXME We want to use clock_gettime, but its not available in MacOS 10.11.
+  // See https://github.com/google/benchmark/pull/292
   struct timespec spec;
   if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
     return MakeTime(spec);
@@ -149,13 +150,14 @@
                  &user_time);
   return MakeTime(kernel_time, user_time);
 #elif defined(BENCHMARK_OS_MACOSX)
-  // FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
-  // https://github.com/google/benchmark/pull/292
+  // FIXME We want to use clock_gettime, but its not available in MacOS 10.11.
+  // See https://github.com/google/benchmark/pull/292
   mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
   thread_basic_info_data_t info;
   mach_port_t thread = pthread_mach_thread_np(pthread_self());
-  if (thread_info(thread, THREAD_BASIC_INFO, (thread_info_t)&info, &count) ==
-      KERN_SUCCESS) {
+  if (thread_info(thread, THREAD_BASIC_INFO,
+                  reinterpret_cast<thread_info_t>(&info),
+                  &count) == KERN_SUCCESS) {
     return MakeTime(info);
   }
   DiagnoseAndExit("ThreadCPUUsage() failed when evaluating thread_info");
@@ -191,11 +193,14 @@
   long int offset_minutes;
   char tz_offset_sign = '+';
   // tz_offset is set in one of three ways:
-  // * strftime with %z - This either returns empty or the ISO 8601 time.  The maximum length an
+  // * strftime with %z - This either returns empty or the ISO 8601 time.  The
+  // maximum length an
   //   ISO 8601 string can be is 7 (e.g. -03:30, plus trailing zero).
-  // * snprintf with %c%02li:%02li - The maximum length is 41 (one for %c, up to 19 for %02li,
+  // * snprintf with %c%02li:%02li - The maximum length is 41 (one for %c, up to
+  // 19 for %02li,
   //   one for :, up to 19 %02li, plus trailing zero).
-  // * A fixed string of "-00:00".  The maximum length is 7 (-00:00, plus trailing zero).
+  // * A fixed string of "-00:00".  The maximum length is 7 (-00:00, plus
+  // trailing zero).
   //
   // Thus, the maximum size this needs to be is 41.
   char tz_offset[41];
@@ -203,10 +208,10 @@
   char storage[128];
 
 #if defined(BENCHMARK_OS_WINDOWS)
-  std::tm *timeinfo_p = ::localtime(&now);
+  std::tm* timeinfo_p = ::localtime(&now);
 #else
   std::tm timeinfo;
-  std::tm *timeinfo_p = &timeinfo;
+  std::tm* timeinfo_p = &timeinfo;
   ::localtime_r(&now, &timeinfo);
 #endif
 
@@ -223,10 +228,11 @@
       tz_offset_sign = '-';
     }
 
-    tz_len = ::snprintf(tz_offset, sizeof(tz_offset), "%c%02li:%02li",
-        tz_offset_sign, offset_minutes / 100, offset_minutes % 100);
-    CHECK(tz_len == kTzOffsetLen);
-    ((void)tz_len); // Prevent unused variable warning in optimized build.
+    tz_len =
+        ::snprintf(tz_offset, sizeof(tz_offset), "%c%02li:%02li",
+                   tz_offset_sign, offset_minutes / 100, offset_minutes % 100);
+    BM_CHECK(tz_len == kTzOffsetLen);
+    ((void)tz_len);  // Prevent unused variable warning in optimized build.
   } else {
     // Unknown offset. RFC3339 specifies that unknown local offsets should be
     // written as UTC time with -00:00 timezone.
@@ -240,9 +246,9 @@
     strncpy(tz_offset, "-00:00", kTzOffsetLen + 1);
   }
 
-  timestamp_len = std::strftime(storage, sizeof(storage), "%Y-%m-%dT%H:%M:%S",
-      timeinfo_p);
-  CHECK(timestamp_len == kTimestampLen);
+  timestamp_len =
+      std::strftime(storage, sizeof(storage), "%Y-%m-%dT%H:%M:%S", timeinfo_p);
+  BM_CHECK(timestamp_len == kTimestampLen);
   // Prevent unused variable warning in optimized build.
   ((void)kTimestampLen);
 
diff --git a/third-party/benchmark/test/BUILD b/third-party/benchmark/test/BUILD
--- a/third-party/benchmark/test/BUILD
+++ b/third-party/benchmark/test/BUILD
@@ -21,6 +21,7 @@
 PER_SRC_TEST_ARGS = ({
     "user_counters_tabular_test.cc": ["--benchmark_counters_tabular=true"],
     "repetitions_test.cc": [" --benchmark_repetitions=3"],
+    "spec_arg_test.cc" : ["--benchmark_filter=BM_NotChosen"],
 })
 
 load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
diff --git a/third-party/benchmark/test/CMakeLists.txt b/third-party/benchmark/test/CMakeLists.txt
--- a/third-party/benchmark/test/CMakeLists.txt
+++ b/third-party/benchmark/test/CMakeLists.txt
@@ -56,6 +56,12 @@
 compile_benchmark_test(benchmark_test)
 add_test(NAME benchmark COMMAND benchmark_test --benchmark_min_time=0.01)
 
+compile_benchmark_test(spec_arg_test)
+add_test(NAME spec_arg COMMAND spec_arg_test --benchmark_filter=BM_NotChosen)
+
+compile_benchmark_test(benchmark_setup_teardown_test)
+add_test(NAME benchmark_setup_teardown COMMAND benchmark_setup_teardown_test)
+
 compile_benchmark_test(filter_test)
 macro(add_filter_test name filter expect)
   add_test(NAME ${name} COMMAND filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect})
diff --git a/third-party/benchmark/test/args_product_test.cc b/third-party/benchmark/test/args_product_test.cc
--- a/third-party/benchmark/test/args_product_test.cc
+++ b/third-party/benchmark/test/args_product_test.cc
@@ -1,10 +1,10 @@
-#include "benchmark/benchmark.h"
-
 #include <cassert>
 #include <iostream>
 #include <set>
 #include <vector>
 
+#include "benchmark/benchmark.h"
+
 class ArgsProductFixture : public ::benchmark::Fixture {
  public:
   ArgsProductFixture()
@@ -37,7 +37,7 @@
   virtual ~ArgsProductFixture() {
     if (actualValues != expectedValues) {
       std::cout << "EXPECTED\n";
-      for (auto v : expectedValues) {
+      for (const auto& v : expectedValues) {
         std::cout << "{";
         for (int64_t iv : v) {
           std::cout << iv << ", ";
@@ -45,7 +45,7 @@
         std::cout << "}\n";
       }
       std::cout << "ACTUAL\n";
-      for (auto v : actualValues) {
+      for (const auto& v : actualValues) {
         std::cout << "{";
         for (int64_t iv : v) {
           std::cout << iv << ", ";
diff --git a/third-party/benchmark/test/basic_test.cc b/third-party/benchmark/test/basic_test.cc
--- a/third-party/benchmark/test/basic_test.cc
+++ b/third-party/benchmark/test/basic_test.cc
@@ -13,7 +13,7 @@
 
 void BM_spin_empty(benchmark::State& state) {
   for (auto _ : state) {
-    for (int x = 0; x < state.range(0); ++x) {
+    for (auto x = 0; x < state.range(0); ++x) {
       benchmark::DoNotOptimize(x);
     }
   }
@@ -22,11 +22,11 @@
 BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu();
 
 void BM_spin_pause_before(benchmark::State& state) {
-  for (int i = 0; i < state.range(0); ++i) {
+  for (auto i = 0; i < state.range(0); ++i) {
     benchmark::DoNotOptimize(i);
   }
   for (auto _ : state) {
-    for (int i = 0; i < state.range(0); ++i) {
+    for (auto i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
   }
@@ -37,11 +37,11 @@
 void BM_spin_pause_during(benchmark::State& state) {
   for (auto _ : state) {
     state.PauseTiming();
-    for (int i = 0; i < state.range(0); ++i) {
+    for (auto i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
     state.ResumeTiming();
-    for (int i = 0; i < state.range(0); ++i) {
+    for (auto i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
   }
@@ -62,11 +62,11 @@
 
 void BM_spin_pause_after(benchmark::State& state) {
   for (auto _ : state) {
-    for (int i = 0; i < state.range(0); ++i) {
+    for (auto i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
   }
-  for (int i = 0; i < state.range(0); ++i) {
+  for (auto i = 0; i < state.range(0); ++i) {
     benchmark::DoNotOptimize(i);
   }
 }
@@ -74,15 +74,15 @@
 BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu();
 
 void BM_spin_pause_before_and_after(benchmark::State& state) {
-  for (int i = 0; i < state.range(0); ++i) {
+  for (auto i = 0; i < state.range(0); ++i) {
     benchmark::DoNotOptimize(i);
   }
   for (auto _ : state) {
-    for (int i = 0; i < state.range(0); ++i) {
+    for (auto i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
   }
-  for (int i = 0; i < state.range(0); ++i) {
+  for (auto i = 0; i < state.range(0); ++i) {
     benchmark::DoNotOptimize(i);
   }
 }
@@ -96,7 +96,6 @@
 BENCHMARK(BM_empty_stop_start);
 BENCHMARK(BM_empty_stop_start)->ThreadPerCpu();
 
-
 void BM_KeepRunning(benchmark::State& state) {
   benchmark::IterationCount iter_count = 0;
   assert(iter_count == state.iterations());
@@ -142,10 +141,39 @@
 }
 BENCHMARK(BM_RangedFor);
 
+#ifdef BENCHMARK_HAS_CXX11
+template <typename T>
+void BM_OneTemplateFunc(benchmark::State& state) {
+  auto arg = state.range(0);
+  T sum = 0;
+  for (auto _ : state) {
+    sum += arg;
+  }
+}
+BENCHMARK(BM_OneTemplateFunc<int>)->Arg(1);
+BENCHMARK(BM_OneTemplateFunc<double>)->Arg(1);
+
+template <typename A, typename B>
+void BM_TwoTemplateFunc(benchmark::State& state) {
+  auto arg = state.range(0);
+  A sum = 0;
+  B prod = 1;
+  for (auto _ : state) {
+    sum += arg;
+    prod *= arg;
+  }
+}
+BENCHMARK(BM_TwoTemplateFunc<int, double>)->Arg(1);
+BENCHMARK(BM_TwoTemplateFunc<double, int>)->Arg(1);
+
+#endif  // BENCHMARK_HAS_CXX11
+
 // Ensure that StateIterator provides all the necessary typedefs required to
 // instantiate std::iterator_traits.
-static_assert(std::is_same<
-  typename std::iterator_traits<benchmark::State::StateIterator>::value_type,
-  typename benchmark::State::StateIterator::value_type>::value, "");
+static_assert(
+    std::is_same<typename std::iterator_traits<
+                     benchmark::State::StateIterator>::value_type,
+                 typename benchmark::State::StateIterator::value_type>::value,
+    "");
 
 BENCHMARK_MAIN();
diff --git a/third-party/benchmark/test/benchmark_random_interleaving_gtest.cc b/third-party/benchmark/test/benchmark_random_interleaving_gtest.cc
--- a/third-party/benchmark/test/benchmark_random_interleaving_gtest.cc
+++ b/third-party/benchmark/test/benchmark_random_interleaving_gtest.cc
@@ -8,11 +8,12 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
-DECLARE_bool(benchmark_enable_random_interleaving);
-DECLARE_string(benchmark_filter);
-DECLARE_int32(benchmark_repetitions);
-
 namespace benchmark {
+
+BM_DECLARE_bool(benchmark_enable_random_interleaving);
+BM_DECLARE_string(benchmark_filter);
+BM_DECLARE_int32(benchmark_repetitions);
+
 namespace internal {
 namespace {
 
@@ -33,7 +34,7 @@
   }
 };
 
-static EventQueue* queue = new EventQueue;
+EventQueue* queue = new EventQueue();
 
 class NullReporter : public BenchmarkReporter {
  public:
@@ -59,7 +60,7 @@
   }
 };
 
-static void BM_Match1(benchmark::State& state) {
+void BM_Match1(benchmark::State& state) {
   const int64_t arg = state.range(0);
 
   for (auto _ : state) {
@@ -110,8 +111,8 @@
     std::vector<std::string> interleaving;
     interleaving.push_back(queue->Get());
     interleaving.push_back(queue->Get());
-    element_count[interleaving[0].c_str()]++;
-    element_count[interleaving[1].c_str()]++;
+    element_count[interleaving[0]]++;
+    element_count[interleaving[1]]++;
     interleaving_count[StrFormat("%s,%s", interleaving[0].c_str(),
                                  interleaving[1].c_str())]++;
   }
diff --git a/third-party/benchmark/test/benchmark_setup_teardown_test.cc b/third-party/benchmark/test/benchmark_setup_teardown_test.cc
new file mode 100644
--- /dev/null
+++ b/third-party/benchmark/test/benchmark_setup_teardown_test.cc
@@ -0,0 +1,157 @@
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <string>
+
+#include "benchmark/benchmark.h"
+
+// Test that Setup() and Teardown() are called exactly once
+// for each benchmark run (single-threaded).
+namespace single {
+static int setup_call = 0;
+static int teardown_call = 0;
+}  // namespace single
+static void DoSetup1(const benchmark::State& state) {
+  ++single::setup_call;
+
+  // Setup/Teardown should never be called with any thread_idx != 0.
+  assert(state.thread_index() == 0);
+}
+
+static void DoTeardown1(const benchmark::State& state) {
+  ++single::teardown_call;
+  assert(state.thread_index() == 0);
+}
+
+static void BM_with_setup(benchmark::State& state) {
+  for (auto s : state) {
+  }
+}
+BENCHMARK(BM_with_setup)
+    ->Arg(1)
+    ->Arg(3)
+    ->Arg(5)
+    ->Arg(7)
+    ->Iterations(100)
+    ->Setup(DoSetup1)
+    ->Teardown(DoTeardown1);
+
+// Test that Setup() and Teardown() are called once for each group of threads.
+namespace concurrent {
+static std::atomic<int> setup_call(0);
+static std::atomic<int> teardown_call(0);
+static std::atomic<int> func_call(0);
+}  // namespace concurrent
+
+static void DoSetup2(const benchmark::State& state) {
+  concurrent::setup_call.fetch_add(1, std::memory_order_acquire);
+  assert(state.thread_index() == 0);
+}
+
+static void DoTeardown2(const benchmark::State& state) {
+  concurrent::teardown_call.fetch_add(1, std::memory_order_acquire);
+  assert(state.thread_index() == 0);
+}
+
+static void BM_concurrent(benchmark::State& state) {
+  for (auto s : state) {
+  }
+  concurrent::func_call.fetch_add(1, std::memory_order_acquire);
+}
+
+BENCHMARK(BM_concurrent)
+    ->Setup(DoSetup2)
+    ->Teardown(DoTeardown2)
+    ->Iterations(100)
+    ->Threads(5)
+    ->Threads(10)
+    ->Threads(15);
+
+// Testing interaction with Fixture::Setup/Teardown
+namespace fixture_interaction {
+int setup = 0;
+int fixture_setup = 0;
+}  // namespace fixture_interaction
+
+#define FIXTURE_BECHMARK_NAME MyFixture
+
+class FIXTURE_BECHMARK_NAME : public ::benchmark::Fixture {
+ public:
+  void SetUp(const ::benchmark::State&) BENCHMARK_OVERRIDE {
+    fixture_interaction::fixture_setup++;
+  }
+
+  ~FIXTURE_BECHMARK_NAME() {}
+};
+
+BENCHMARK_F(FIXTURE_BECHMARK_NAME, BM_WithFixture)(benchmark::State& st) {
+  for (auto _ : st) {
+  }
+}
+
+static void DoSetupWithFixture(const benchmark::State&) {
+  fixture_interaction::setup++;
+}
+
+BENCHMARK_REGISTER_F(FIXTURE_BECHMARK_NAME, BM_WithFixture)
+    ->Arg(1)
+    ->Arg(3)
+    ->Arg(5)
+    ->Arg(7)
+    ->Setup(DoSetupWithFixture)
+    ->Repetitions(1)
+    ->Iterations(100);
+
+// Testing repetitions.
+namespace repetitions {
+int setup = 0;
+}
+
+static void DoSetupWithRepetitions(const benchmark::State&) {
+  repetitions::setup++;
+}
+static void BM_WithRep(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+
+BENCHMARK(BM_WithRep)
+    ->Arg(1)
+    ->Arg(3)
+    ->Arg(5)
+    ->Arg(7)
+    ->Setup(DoSetupWithRepetitions)
+    ->Iterations(100)
+    ->Repetitions(4);
+
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+
+  size_t ret = benchmark::RunSpecifiedBenchmarks(".");
+  assert(ret > 0);
+  (void)ret;  // Unused when NDEBUG compiles out the assert above.
+
+  // Setup/Teardown is called once for each arg group (1,3,5,7).
+  assert(single::setup_call == 4);
+  assert(single::teardown_call == 4);
+
+  // 3 groups of threads call this function (5,10,15).
+  assert(concurrent::setup_call.load(std::memory_order_relaxed) == 3);
+  assert(concurrent::teardown_call.load(std::memory_order_relaxed) == 3);
+  assert((5 + 10 + 15) ==
+         concurrent::func_call.load(std::memory_order_relaxed));
+
+  // Setup is called 4 times, once for each arg group (1,3,5,7)
+  assert(fixture_interaction::setup == 4);
+  // Fixture::SetUp runs every time the benchmark routine runs; the exact
+  // count is nondeterministic, so only assert that it exceeds setup.
+  assert(fixture_interaction::fixture_setup > fixture_interaction::setup);
+
+  // Setup is called once for each repetition * num_arg = 4 * 4 = 16.
+  assert(repetitions::setup == 16);
+
+  return 0;
+}
diff --git a/third-party/benchmark/test/benchmark_test.cc b/third-party/benchmark/test/benchmark_test.cc
--- a/third-party/benchmark/test/benchmark_test.cc
+++ b/third-party/benchmark/test/benchmark_test.cc
@@ -93,8 +93,9 @@
   state.SetBytesProcessed(state.iterations() * state.range(1) * sizeof(int));
 }
 
-// Test many inserts at once to reduce the total iterations needed. Otherwise, the slower,
-// non-timed part of each iteration will make the benchmark take forever.
+// Test many inserts at once to reduce the total iterations needed. Otherwise,
+// the slower, non-timed part of each iteration will make the benchmark take
+// forever.
 BENCHMARK(BM_SetInsert)->Ranges({{1 << 10, 8 << 10}, {128, 512}});
 
 template <typename Container,
@@ -126,7 +127,7 @@
 BENCHMARK(BM_StringCompare)->Range(1, 1 << 20);
 
 static void BM_SetupTeardown(benchmark::State& state) {
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     // No need to lock test_vector_mu here as this is running single-threaded.
     test_vector = new std::vector<int>();
   }
@@ -139,7 +140,7 @@
       test_vector->pop_back();
     ++i;
   }
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     delete test_vector;
   }
 }
@@ -156,11 +157,11 @@
 
 static void BM_ParallelMemset(benchmark::State& state) {
   int64_t size = state.range(0) / static_cast<int64_t>(sizeof(int));
-  int thread_size = static_cast<int>(size) / state.threads;
-  int from = thread_size * state.thread_index;
+  int thread_size = static_cast<int>(size) / state.threads();
+  int from = thread_size * state.thread_index();
   int to = from + thread_size;
 
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     test_vector = new std::vector<int>(static_cast<size_t>(size));
   }
 
@@ -172,7 +173,7 @@
     }
   }
 
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     delete test_vector;
   }
 }
@@ -214,7 +215,8 @@
                   std::pair<int, double>(42, 3.8));
 
 void BM_non_template_args(benchmark::State& state, int, double) {
-  while(state.KeepRunning()) {}
+  while (state.KeepRunning()) {
+  }
 }
 BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);
 
@@ -223,14 +225,14 @@
 static void BM_DenseThreadRanges(benchmark::State& st) {
   switch (st.range(0)) {
     case 1:
-      assert(st.threads == 1 || st.threads == 2 || st.threads == 3);
+      assert(st.threads() == 1 || st.threads() == 2 || st.threads() == 3);
       break;
     case 2:
-      assert(st.threads == 1 || st.threads == 3 || st.threads == 4);
+      assert(st.threads() == 1 || st.threads() == 3 || st.threads() == 4);
       break;
     case 3:
-      assert(st.threads == 5 || st.threads == 8 || st.threads == 11 ||
-             st.threads == 14);
+      assert(st.threads() == 5 || st.threads() == 8 || st.threads() == 11 ||
+             st.threads() == 14);
       break;
     default:
       assert(false && "Invalid test case number");
diff --git a/third-party/benchmark/test/clobber_memory_assembly_test.cc b/third-party/benchmark/test/clobber_memory_assembly_test.cc
--- a/third-party/benchmark/test/clobber_memory_assembly_test.cc
+++ b/third-party/benchmark/test/clobber_memory_assembly_test.cc
@@ -9,7 +9,6 @@
 extern int ExternInt;
 extern int ExternInt2;
 extern int ExternInt3;
-
 }
 
 // CHECK-LABEL: test_basic:
diff --git a/third-party/benchmark/test/complexity_test.cc b/third-party/benchmark/test/complexity_test.cc
--- a/third-party/benchmark/test/complexity_test.cc
+++ b/third-party/benchmark/test/complexity_test.cc
@@ -4,6 +4,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <vector>
+
 #include "benchmark/benchmark.h"
 #include "output_test.h"
 
@@ -12,9 +13,10 @@
 #define ADD_COMPLEXITY_CASES(...) \
   int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
 
-int AddComplexityTest(std::string test_name, std::string big_o_test_name,
-                      std::string rms_test_name, std::string big_o,
-                      int family_index) {
+int AddComplexityTest(const std::string &test_name,
+                      const std::string &big_o_test_name,
+                      const std::string &rms_test_name,
+                      const std::string &big_o, int family_index) {
   SetSubstitutions({{"%name", test_name},
                     {"%bigo_name", big_o_test_name},
                     {"%rms_name", rms_test_name},
@@ -36,6 +38,7 @@
        {"\"repetitions\": %int,$", MR_Next},
        {"\"threads\": 1,$", MR_Next},
        {"\"aggregate_name\": \"BigO\",$", MR_Next},
+       {"\"aggregate_unit\": \"time\",$", MR_Next},
        {"\"cpu_coefficient\": %float,$", MR_Next},
        {"\"real_coefficient\": %float,$", MR_Next},
        {"\"big_o\": \"%bigo\",$", MR_Next},
@@ -49,6 +52,7 @@
        {"\"repetitions\": %int,$", MR_Next},
        {"\"threads\": 1,$", MR_Next},
        {"\"aggregate_name\": \"RMS\",$", MR_Next},
+       {"\"aggregate_unit\": \"percentage\",$", MR_Next},
        {"\"rms\": %float$", MR_Next},
        {"}", MR_Next}});
   AddCases(TC_CSVOut, {{"^\"%bigo_name\",,%float,%float,%bigo,,,,,$"},
@@ -63,7 +67,7 @@
 // --------------------------- Testing BigO O(1) --------------------------- //
 // ========================================================================= //
 
-void BM_Complexity_O1(benchmark::State& state) {
+void BM_Complexity_O1(benchmark::State &state) {
   for (auto _ : state) {
     for (int i = 0; i < 1024; ++i) {
       benchmark::DoNotOptimize(&i);
@@ -112,7 +116,7 @@
   return v;
 }
 
-void BM_Complexity_O_N(benchmark::State& state) {
+void BM_Complexity_O_N(benchmark::State &state) {
   auto v = ConstructRandomVector(state.range(0));
   // Test worst case scenario (item not in vector)
   const int64_t item_not_in_vector = state.range(0) * 2;
@@ -154,7 +158,7 @@
 // ------------------------- Testing BigO O(N*lgN) ------------------------- //
 // ========================================================================= //
 
-static void BM_Complexity_O_N_log_N(benchmark::State& state) {
+static void BM_Complexity_O_N_log_N(benchmark::State &state) {
   auto v = ConstructRandomVector(state.range(0));
   for (auto _ : state) {
     std::sort(v.begin(), v.end());
@@ -197,7 +201,7 @@
 // -------- Testing formatting of Complexity with captured args ------------ //
 // ========================================================================= //
 
-void BM_ComplexityCaptureArgs(benchmark::State& state, int n) {
+void BM_ComplexityCaptureArgs(benchmark::State &state, int n) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
     benchmark::DoNotOptimize(state.iterations());
diff --git a/third-party/benchmark/test/cxx03_test.cc b/third-party/benchmark/test/cxx03_test.cc
--- a/third-party/benchmark/test/cxx03_test.cc
+++ b/third-party/benchmark/test/cxx03_test.cc
@@ -44,8 +44,7 @@
 BENCHMARK_TEMPLATE1(BM_template1, int);
 
 template <class T>
-struct BM_Fixture : public ::benchmark::Fixture {
-};
+struct BM_Fixture : public ::benchmark::Fixture {};
 
 BENCHMARK_TEMPLATE_F(BM_Fixture, BM_template1, long)(benchmark::State& state) {
   BM_empty(state);
@@ -55,8 +54,8 @@
 }
 
 void BM_counters(benchmark::State& state) {
-    BM_empty(state);
-    state.counters["Foo"] = 2;
+  BM_empty(state);
+  state.counters["Foo"] = 2;
 }
 BENCHMARK(BM_counters);
 
diff --git a/third-party/benchmark/test/diagnostics_test.cc b/third-party/benchmark/test/diagnostics_test.cc
--- a/third-party/benchmark/test/diagnostics_test.cc
+++ b/third-party/benchmark/test/diagnostics_test.cc
@@ -26,7 +26,8 @@
 }
 
 void try_invalid_pause_resume(benchmark::State& state) {
-#if !defined(TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) && !defined(TEST_HAS_NO_EXCEPTIONS)
+#if !defined(TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) && \
+    !defined(TEST_HAS_NO_EXCEPTIONS)
   try {
     state.PauseTiming();
     std::abort();
@@ -57,13 +58,12 @@
 }
 BENCHMARK(BM_diagnostic_test);
 
-
 void BM_diagnostic_test_keep_running(benchmark::State& state) {
   static bool called_once = false;
 
   if (called_once == false) try_invalid_pause_resume(state);
 
-  while(state.KeepRunning()) {
+  while (state.KeepRunning()) {
     benchmark::DoNotOptimize(state.iterations());
   }
 
diff --git a/third-party/benchmark/test/display_aggregates_only_test.cc b/third-party/benchmark/test/display_aggregates_only_test.cc
--- a/third-party/benchmark/test/display_aggregates_only_test.cc
+++ b/third-party/benchmark/test/display_aggregates_only_test.cc
@@ -19,21 +19,23 @@
 int main(int argc, char* argv[]) {
   const std::string output = GetFileReporterOutput(argc, argv);
 
-  if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 6 ||
+  if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 7 ||
       SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3\"") != 3 ||
       SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 ||
       SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") !=
           1 ||
       SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") !=
-          1) {
-    std::cout << "Precondition mismatch. Expected to only find 6 "
+          1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_cv\"") != 1) {
+    std::cout << "Precondition mismatch. Expected to only find 7 "
                  "occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n"
                  "\"name\": \"BM_SummaryRepeat/repeats:3\", "
                  "\"name\": \"BM_SummaryRepeat/repeats:3\", "
                  "\"name\": \"BM_SummaryRepeat/repeats:3\", "
                  "\"name\": \"BM_SummaryRepeat/repeats:3_mean\", "
                  "\"name\": \"BM_SummaryRepeat/repeats:3_median\", "
-                 "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_cv\"\nThe entire "
                  "output:\n";
     std::cout << output;
     return 1;
diff --git a/third-party/benchmark/test/donotoptimize_assembly_test.cc b/third-party/benchmark/test/donotoptimize_assembly_test.cc
--- a/third-party/benchmark/test/donotoptimize_assembly_test.cc
+++ b/third-party/benchmark/test/donotoptimize_assembly_test.cc
@@ -15,7 +15,7 @@
 struct NotTriviallyCopyable {
   NotTriviallyCopyable();
   explicit NotTriviallyCopyable(int x) : value(x) {}
-  NotTriviallyCopyable(NotTriviallyCopyable const&);
+  NotTriviallyCopyable(NotTriviallyCopyable const &);
   int value;
 };
 
@@ -23,7 +23,6 @@
   int value;
   int data[2];
 };
-
 }
 // CHECK-LABEL: test_with_rvalue:
 extern "C" void test_with_rvalue() {
@@ -118,8 +117,7 @@
 // CHECK-LABEL: test_inc_integer:
 extern "C" int test_inc_integer() {
   int x = 0;
-  for (int i=0; i < 5; ++i)
-    benchmark::DoNotOptimize(++x);
+  for (int i = 0; i < 5; ++i) benchmark::DoNotOptimize(++x);
   // CHECK: movl $1, [[DEST:.*]]
   // CHECK: {{(addl \$1,|incl)}} [[DEST]]
   // CHECK: {{(addl \$1,|incl)}} [[DEST]]
@@ -147,7 +145,7 @@
   // CHECK-CLANG: movq %rax, -{{[0-9]+}}(%[[REG:[a-z]+]])
   // CHECK: ret
   int x = 42;
-  int * const xp = &x;
+  int *const xp = &x;
   benchmark::DoNotOptimize(xp);
 }
 
diff --git a/third-party/benchmark/test/donotoptimize_test.cc b/third-party/benchmark/test/donotoptimize_test.cc
--- a/third-party/benchmark/test/donotoptimize_test.cc
+++ b/third-party/benchmark/test/donotoptimize_test.cc
@@ -1,27 +1,28 @@
-#include "benchmark/benchmark.h"
-
 #include <cstdint>
 
+#include "benchmark/benchmark.h"
+
 namespace {
 #if defined(__GNUC__)
 std::uint64_t double_up(const std::uint64_t x) __attribute__((const));
 #endif
 std::uint64_t double_up(const std::uint64_t x) { return x * 2; }
-}
+}  // namespace
 
 // Using DoNotOptimize on types like BitRef seem to cause a lot of problems
 // with the inline assembly on both GCC and Clang.
 struct BitRef {
   int index;
-  unsigned char &byte;
+  unsigned char& byte;
 
-public:
+ public:
   static BitRef Make() {
     static unsigned char arr[2] = {};
     BitRef b(1, arr[0]);
     return b;
   }
-private:
+
+ private:
   BitRef(int i, unsigned char& b) : index(i), byte(b) {}
 };
 
diff --git a/third-party/benchmark/test/filter_test.cc b/third-party/benchmark/test/filter_test.cc
--- a/third-party/benchmark/test/filter_test.cc
+++ b/third-party/benchmark/test/filter_test.cc
@@ -70,7 +70,7 @@
 }
 BENCHMARK(BM_FooBa);
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   bool list_only = false;
   for (int i = 0; i < argc; ++i)
     list_only |= std::string(argv[i]).find("--benchmark_list_tests") !=
diff --git a/third-party/benchmark/test/fixture_test.cc b/third-party/benchmark/test/fixture_test.cc
--- a/third-party/benchmark/test/fixture_test.cc
+++ b/third-party/benchmark/test/fixture_test.cc
@@ -1,22 +1,22 @@
 
-#include "benchmark/benchmark.h"
-
 #include <cassert>
 #include <memory>
 
+#include "benchmark/benchmark.h"
+
 #define FIXTURE_BECHMARK_NAME MyFixture
 
 class FIXTURE_BECHMARK_NAME : public ::benchmark::Fixture {
  public:
   void SetUp(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
-    if (state.thread_index == 0) {
+    if (state.thread_index() == 0) {
       assert(data.get() == nullptr);
       data.reset(new int(42));
     }
   }
 
   void TearDown(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
-    if (state.thread_index == 0) {
+    if (state.thread_index() == 0) {
       assert(data.get() != nullptr);
       data.reset();
     }
@@ -27,7 +27,7 @@
   std::unique_ptr<int> data;
 };
 
-BENCHMARK_F(FIXTURE_BECHMARK_NAME, Foo)(benchmark::State &st) {
+BENCHMARK_F(FIXTURE_BECHMARK_NAME, Foo)(benchmark::State& st) {
   assert(data.get() != nullptr);
   assert(*data == 42);
   for (auto _ : st) {
@@ -35,7 +35,7 @@
 }
 
 BENCHMARK_DEFINE_F(FIXTURE_BECHMARK_NAME, Bar)(benchmark::State& st) {
-  if (st.thread_index == 0) {
+  if (st.thread_index() == 0) {
     assert(data.get() != nullptr);
     assert(*data == 42);
   }
diff --git a/third-party/benchmark/test/internal_threading_test.cc b/third-party/benchmark/test/internal_threading_test.cc
--- a/third-party/benchmark/test/internal_threading_test.cc
+++ b/third-party/benchmark/test/internal_threading_test.cc
@@ -3,6 +3,7 @@
 
 #include <chrono>
 #include <thread>
+
 #include "../src/timers.h"
 #include "benchmark/benchmark.h"
 #include "output_test.h"
diff --git a/third-party/benchmark/test/map_test.cc b/third-party/benchmark/test/map_test.cc
--- a/third-party/benchmark/test/map_test.cc
+++ b/third-party/benchmark/test/map_test.cc
@@ -1,8 +1,8 @@
-#include "benchmark/benchmark.h"
-
 #include <cstdlib>
 #include <map>
 
+#include "benchmark/benchmark.h"
+
 namespace {
 
 std::map<int, int> ConstructRandomMap(int size) {
diff --git a/third-party/benchmark/test/multiple_ranges_test.cc b/third-party/benchmark/test/multiple_ranges_test.cc
--- a/third-party/benchmark/test/multiple_ranges_test.cc
+++ b/third-party/benchmark/test/multiple_ranges_test.cc
@@ -1,10 +1,10 @@
-#include "benchmark/benchmark.h"
-
 #include <cassert>
 #include <iostream>
 #include <set>
 #include <vector>
 
+#include "benchmark/benchmark.h"
+
 class MultipleRangesFixture : public ::benchmark::Fixture {
  public:
   MultipleRangesFixture()
@@ -42,7 +42,7 @@
   virtual ~MultipleRangesFixture() {
     if (actualValues != expectedValues) {
       std::cout << "EXPECTED\n";
-      for (auto v : expectedValues) {
+      for (const auto& v : expectedValues) {
         std::cout << "{";
         for (int64_t iv : v) {
           std::cout << iv << ", ";
@@ -50,7 +50,7 @@
         std::cout << "}\n";
       }
       std::cout << "ACTUAL\n";
-      for (auto v : actualValues) {
+      for (const auto& v : actualValues) {
         std::cout << "{";
         for (int64_t iv : v) {
           std::cout << iv << ", ";
diff --git a/third-party/benchmark/test/options_test.cc b/third-party/benchmark/test/options_test.cc
--- a/third-party/benchmark/test/options_test.cc
+++ b/third-party/benchmark/test/options_test.cc
@@ -1,7 +1,8 @@
-#include "benchmark/benchmark.h"
 #include <chrono>
 #include <thread>
 
+#include "benchmark/benchmark.h"
+
 #if defined(NDEBUG)
 #undef NDEBUG
 #endif
@@ -65,11 +66,9 @@
   // Test that the requested iteration count is respected.
   assert(state.max_iterations == 42);
   size_t actual_iterations = 0;
-  for (auto _ : state)
-    ++actual_iterations;
+  for (auto _ : state) ++actual_iterations;
   assert(state.iterations() == state.max_iterations);
   assert(state.iterations() == 42);
-
 }
 BENCHMARK(BM_explicit_iteration_count)->Iterations(42);
 
diff --git a/third-party/benchmark/test/output_test.h b/third-party/benchmark/test/output_test.h
--- a/third-party/benchmark/test/output_test.h
+++ b/third-party/benchmark/test/output_test.h
@@ -85,7 +85,7 @@
 struct Results;
 typedef std::function<void(Results const&)> ResultsCheckFn;
 
-size_t AddChecker(const char* bm_name_pattern, ResultsCheckFn fn);
+size_t AddChecker(const char* bm_name_pattern, const ResultsCheckFn& fn);
 
 // Class holding the results of a benchmark.
 // It is passed in calls to checker functions.
@@ -113,9 +113,7 @@
     return NumIterations() * GetTime(kRealTime);
   }
   // get the cpu_time duration of the benchmark in seconds
-  double DurationCPUTime() const {
-    return NumIterations() * GetTime(kCpuTime);
-  }
+  double DurationCPUTime() const { return NumIterations() * GetTime(kCpuTime); }
 
   // get the string for a result by name, or nullptr if the name
   // is not found
@@ -143,12 +141,12 @@
 template <class T>
 T Results::GetAs(const char* entry_name) const {
   auto* sv = Get(entry_name);
-  CHECK(sv != nullptr && !sv->empty());
+  BM_CHECK(sv != nullptr && !sv->empty());
   std::stringstream ss;
   ss << *sv;
   T out;
   ss >> out;
-  CHECK(!ss.fail());
+  BM_CHECK(!ss.fail());
   return out;
 }
 
@@ -159,7 +157,7 @@
 // clang-format off
 
 #define CHECK_RESULT_VALUE_IMPL(entry, getfn, var_type, var_name, relationship, value) \
-    CONCAT(CHECK_, relationship)                                        \
+    CONCAT(BM_CHECK_, relationship)                                        \
     (entry.getfn< var_type >(var_name), (value)) << "\n"                \
     << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n"     \
     << __FILE__ << ":" << __LINE__ << ": "                              \
@@ -170,7 +168,7 @@
 // check with tolerance. eps_factor is the tolerance window, which is
 // interpreted relative to value (eg, 0.1 means 10% of value).
 #define CHECK_FLOAT_RESULT_VALUE_IMPL(entry, getfn, var_type, var_name, relationship, value, eps_factor) \
-    CONCAT(CHECK_FLOAT_, relationship)                                  \
+    CONCAT(BM_CHECK_FLOAT_, relationship)                                  \
     (entry.getfn< var_type >(var_name), (value), (eps_factor) * (value)) << "\n" \
     << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n"     \
     << __FILE__ << ":" << __LINE__ << ": "                              \
diff --git a/third-party/benchmark/test/output_test_helper.cc b/third-party/benchmark/test/output_test_helper.cc
--- a/third-party/benchmark/test/output_test_helper.cc
+++ b/third-party/benchmark/test/output_test_helper.cc
@@ -10,6 +10,7 @@
 
 #include "../src/benchmark_api_internal.h"
 #include "../src/check.h"  // NOTE: check.h is for internal use only!
+#include "../src/log.h"    // NOTE: log.h is for internal use only
 #include "../src/re.h"     // NOTE: re.h is for internal use only
 #include "output_test.h"
 
@@ -40,14 +41,17 @@
   // clang-format off
   static std::string safe_dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?";
   static std::string time_re = "([0-9]+[.])?[0-9]+";
+  static std::string percentage_re = "[0-9]+[.][0-9]{2}";
   static SubMap map = {
       {"%float", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"},
       // human-readable float
       {"%hrfloat", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?[kMGTPEZYmunpfazy]?"},
+      {"%percentage", percentage_re},
       {"%int", "[ ]*[0-9]+"},
       {" %s ", "[ ]+"},
       {"%time", "[ ]*" + time_re + "[ ]+ns"},
       {"%console_report", "[ ]*" + time_re + "[ ]+ns [ ]*" + time_re + "[ ]+ns [ ]*[0-9]+"},
+      {"%console_percentage_report", "[ ]*" + percentage_re + "[ ]+% [ ]*" + percentage_re + "[ ]+% [ ]*[0-9]+"},
       {"%console_us_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us [ ]*[0-9]+"},
       {"%console_ms_report", "[ ]*" + time_re + "[ ]+ms [ ]*" + time_re + "[ ]+ms [ ]*[0-9]+"},
       {"%console_s_report", "[ ]*" + time_re + "[ ]+s [ ]*" + time_re + "[ ]+s [ ]*[0-9]+"},
@@ -94,27 +98,27 @@
   bool on_first = true;
   std::string line;
   while (remaining_output.eof() == false) {
-    CHECK(remaining_output.good());
+    BM_CHECK(remaining_output.good());
     std::getline(remaining_output, line);
     if (on_first) {
       first_line = line;
       on_first = false;
     }
     for (const auto& NC : not_checks) {
-      CHECK(!NC.regex->Match(line))
+      BM_CHECK(!NC.regex->Match(line))
           << "Unexpected match for line \"" << line << "\" for MR_Not regex \""
           << NC.regex_str << "\""
           << "\n    actual regex string \"" << TC.substituted_regex << "\""
           << "\n    started matching near: " << first_line;
     }
     if (TC.regex->Match(line)) return;
-    CHECK(TC.match_rule != MR_Next)
+    BM_CHECK(TC.match_rule != MR_Next)
         << "Expected line \"" << line << "\" to match regex \"" << TC.regex_str
         << "\""
         << "\n    actual regex string \"" << TC.substituted_regex << "\""
         << "\n    started matching near: " << first_line;
   }
-  CHECK(remaining_output.eof() == false)
+  BM_CHECK(remaining_output.eof() == false)
       << "End of output reached before match for regex \"" << TC.regex_str
       << "\" was found"
       << "\n    actual regex string \"" << TC.substituted_regex << "\""
@@ -137,14 +141,14 @@
 class TestReporter : public benchmark::BenchmarkReporter {
  public:
   TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
-      : reporters_(reps) {}
+      : reporters_(std::move(reps)) {}
 
   virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
     bool last_ret = false;
     bool first = true;
     for (auto rep : reporters_) {
       bool new_ret = rep->ReportContext(context);
-      CHECK(first || new_ret == last_ret)
+      BM_CHECK(first || new_ret == last_ret)
           << "Reports return different values for ReportContext";
       first = false;
       last_ret = new_ret;
@@ -179,7 +183,7 @@
  public:
   struct PatternAndFn : public TestCase {  // reusing TestCase for its regexes
     PatternAndFn(const std::string& rx, ResultsCheckFn fn_)
-        : TestCase(rx), fn(fn_) {}
+        : TestCase(rx), fn(std::move(fn_)) {}
     ResultsCheckFn fn;
   };
 
@@ -187,7 +191,7 @@
   std::vector<Results> results;
   std::vector<std::string> field_names;
 
-  void Add(const std::string& entry_pattern, ResultsCheckFn fn);
+  void Add(const std::string& entry_pattern, const ResultsCheckFn& fn);
 
   void CheckResults(std::stringstream& output);
 
@@ -206,7 +210,8 @@
 }
 
 // add a results checker for a benchmark
-void ResultsChecker::Add(const std::string& entry_pattern, ResultsCheckFn fn) {
+void ResultsChecker::Add(const std::string& entry_pattern,
+                         const ResultsCheckFn& fn) {
   check_patterns.emplace_back(entry_pattern, fn);
 }
 
@@ -226,7 +231,7 @@
   std::string line;
   bool on_first = true;
   while (output.eof() == false) {
-    CHECK(output.good());
+    BM_CHECK(output.good());
     std::getline(output, line);
     if (on_first) {
       SetHeader_(line);  // this is important
@@ -237,18 +242,18 @@
   }
   // finally we can call the subscribed check functions
   for (const auto& p : check_patterns) {
-    VLOG(2) << "--------------------------------\n";
-    VLOG(2) << "checking for benchmarks matching " << p.regex_str << "...\n";
+    BM_VLOG(2) << "--------------------------------\n";
+    BM_VLOG(2) << "checking for benchmarks matching " << p.regex_str << "...\n";
     for (const auto& r : results) {
       if (!p.regex->Match(r.name)) {
-        VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n";
+        BM_VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n";
         continue;
       } else {
-        VLOG(2) << p.regex_str << " is matched by " << r.name << "\n";
+        BM_VLOG(2) << p.regex_str << " is matched by " << r.name << "\n";
       }
-      VLOG(1) << "Checking results of " << r.name << ": ... \n";
+      BM_VLOG(1) << "Checking results of " << r.name << ": ... \n";
       p.fn(r);
-      VLOG(1) << "Checking results of " << r.name << ": OK.\n";
+      BM_VLOG(1) << "Checking results of " << r.name << ": OK.\n";
     }
   }
 }
@@ -261,9 +266,9 @@
 // set the values for a benchmark
 void ResultsChecker::SetValues_(const std::string& entry_csv_line) {
   if (entry_csv_line.empty()) return;  // some lines are empty
-  CHECK(!field_names.empty());
+  BM_CHECK(!field_names.empty());
   auto vals = SplitCsv_(entry_csv_line);
-  CHECK_EQ(vals.size(), field_names.size());
+  BM_CHECK_EQ(vals.size(), field_names.size());
   results.emplace_back(vals[0]);  // vals[0] is the benchmark name
   auto& entry = results.back();
   for (size_t i = 1, e = vals.size(); i < e; ++i) {
@@ -278,7 +283,7 @@
   if (!field_names.empty()) out.reserve(field_names.size());
   size_t prev = 0, pos = line.find_first_of(','), curr = pos;
   while (pos != line.npos) {
-    CHECK(curr > 0);
+    BM_CHECK(curr > 0);
     if (line[prev] == '"') ++prev;
     if (line[curr - 1] == '"') --curr;
     out.push_back(line.substr(prev, curr - prev));
@@ -295,7 +300,7 @@
 
 }  // end namespace internal
 
-size_t AddChecker(const char* bm_name, ResultsCheckFn fn) {
+size_t AddChecker(const char* bm_name, const ResultsCheckFn& fn) {
   auto& rc = internal::GetResultsChecker();
   rc.Add(bm_name, fn);
   return rc.results.size();
@@ -309,20 +314,18 @@
   ss << name.substr(pos + 9, end);
   int num = 1;
   ss >> num;
-  CHECK(!ss.fail());
+  BM_CHECK(!ss.fail());
   return num;
 }
 
-double Results::NumIterations() const {
-  return GetAs<double>("iterations");
-}
+double Results::NumIterations() const { return GetAs<double>("iterations"); }
 
 double Results::GetTime(BenchmarkTime which) const {
-  CHECK(which == kCpuTime || which == kRealTime);
+  BM_CHECK(which == kCpuTime || which == kRealTime);
   const char* which_str = which == kCpuTime ? "cpu_time" : "real_time";
   double val = GetAs<double>(which_str);
   auto unit = Get("time_unit");
-  CHECK(unit);
+  BM_CHECK(unit);
   if (*unit == "ns") {
     return val * 1.e-9;
   } else if (*unit == "us") {
@@ -332,7 +335,7 @@
   } else if (*unit == "s") {
     return val;
   } else {
-    CHECK(1 == 0) << "unknown time unit: " << *unit;
+    BM_CHECK(1 == 0) << "unknown time unit: " << *unit;
     return 0;
   }
 }
@@ -348,10 +351,10 @@
       regex(std::make_shared<benchmark::Regex>()) {
   std::string err_str;
   regex->Init(substituted_regex, &err_str);
-  CHECK(err_str.empty()) << "Could not construct regex \"" << substituted_regex
-                         << "\""
-                         << "\n    originally \"" << regex_str << "\""
-                         << "\n    got error: " << err_str;
+  BM_CHECK(err_str.empty())
+      << "Could not construct regex \"" << substituted_regex << "\""
+      << "\n    originally \"" << regex_str << "\""
+      << "\n    got error: " << err_str;
 }
 
 int AddCases(TestCaseID ID, std::initializer_list<TestCase> il) {
@@ -380,10 +383,8 @@
 
 // Disable deprecated warnings temporarily because we need to reference
 // CSVReporter but don't want to trigger -Werror=-Wdeprecated-declarations
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
+BENCHMARK_DISABLE_DEPRECATED_WARNING
+
 void RunOutputTests(int argc, char* argv[]) {
   using internal::GetTestCaseList;
   benchmark::Initialize(&argc, argv);
@@ -438,13 +439,11 @@
   // the checks to subscribees.
   auto& csv = TestCases[2];
   // would use == but gcc spits a warning
-  CHECK(std::strcmp(csv.name, "CSVReporter") == 0);
+  BM_CHECK(std::strcmp(csv.name, "CSVReporter") == 0);
   internal::GetResultsChecker().CheckResults(csv.out_stream);
 }
 
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
+BENCHMARK_RESTORE_DEPRECATED_WARNING
 
 int SubstrCnt(const std::string& haystack, const std::string& pat) {
   if (pat.length() == 0) return 0;
@@ -468,9 +467,8 @@
 
 static std::string GetRandomFileName() {
   std::string model = "test.%%%%%%";
-  for (auto & ch :  model) {
-    if (ch == '%')
-      ch = RandomHexChar();
+  for (auto& ch : model) {
+    if (ch == '%') ch = RandomHexChar();
   }
   return model;
 }
@@ -487,8 +485,7 @@
   int retries = 3;
   while (--retries) {
     std::string name = GetRandomFileName();
-    if (!FileExists(name))
-      return name;
+    if (!FileExists(name)) return name;
   }
   std::cerr << "Failed to create unique temporary file name" << std::endl;
   std::abort();
diff --git a/third-party/benchmark/test/perf_counters_gtest.cc b/third-party/benchmark/test/perf_counters_gtest.cc
--- a/third-party/benchmark/test/perf_counters_gtest.cc
+++ b/third-party/benchmark/test/perf_counters_gtest.cc
@@ -5,7 +5,7 @@
 
 #ifndef GTEST_SKIP
 struct MsgHandler {
-  void operator=(std::ostream&){}
+  void operator=(std::ostream&) {}
 };
 #define GTEST_SKIP() return MsgHandler() = std::cout
 #endif
@@ -103,10 +103,10 @@
 
 void measure(size_t threadcount, PerfCounterValues* values1,
              PerfCounterValues* values2) {
-  CHECK_NE(values1, nullptr);
-  CHECK_NE(values2, nullptr);
+  BM_CHECK_NE(values1, nullptr);
+  BM_CHECK_NE(values2, nullptr);
   std::vector<std::thread> threads(threadcount);
-  auto work = [&]() { CHECK(do_work() > 1000); };
+  auto work = [&]() { BM_CHECK(do_work() > 1000); };
 
   // We need to first set up the counters, then start the threads, so the
   // threads would inherit the counters. But later, we need to first destroy the
diff --git a/third-party/benchmark/test/perf_counters_test.cc b/third-party/benchmark/test/perf_counters_test.cc
--- a/third-party/benchmark/test/perf_counters_test.cc
+++ b/third-party/benchmark/test/perf_counters_test.cc
@@ -5,7 +5,7 @@
 #include "benchmark/benchmark.h"
 #include "output_test.h"
 
-void BM_Simple(benchmark::State& state) {
+static void BM_Simple(benchmark::State& state) {
   for (auto _ : state) {
     benchmark::DoNotOptimize(state.iterations());
   }
@@ -13,7 +13,7 @@
 BENCHMARK(BM_Simple);
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Simple\",$"}});
 
-void CheckSimple(Results const& e) {
+static void CheckSimple(Results const& e) {
   CHECK_COUNTER_VALUE(e, double, "CYCLES", GT, 0);
   CHECK_COUNTER_VALUE(e, double, "BRANCHES", GT, 0.0);
 }
diff --git a/third-party/benchmark/test/register_benchmark_test.cc b/third-party/benchmark/test/register_benchmark_test.cc
--- a/third-party/benchmark/test/register_benchmark_test.cc
+++ b/third-party/benchmark/test/register_benchmark_test.cc
@@ -30,13 +30,13 @@
 
   void CheckRun(Run const& run) const {
     // clang-format off
-    CHECK(name == run.benchmark_name()) << "expected " << name << " got "
+    BM_CHECK(name == run.benchmark_name()) << "expected " << name << " got "
                                       << run.benchmark_name();
     if (label) {
-      CHECK(run.report_label == label) << "expected " << label << " got "
+      BM_CHECK(run.report_label == label) << "expected " << label << " got "
                                        << run.report_label;
     } else {
-      CHECK(run.report_label == "");
+      BM_CHECK(run.report_label.empty());
     }
     // clang-format on
   }
@@ -45,7 +45,7 @@
 std::vector<TestCase> ExpectedResults;
 
 int AddCases(std::initializer_list<TestCase> const& v) {
-  for (auto N : v) {
+  for (const auto& N : v) {
     ExpectedResults.push_back(N);
   }
   return 0;
diff --git a/third-party/benchmark/test/repetitions_test.cc b/third-party/benchmark/test/repetitions_test.cc
--- a/third-party/benchmark/test/repetitions_test.cc
+++ b/third-party/benchmark/test/repetitions_test.cc
@@ -6,7 +6,7 @@
 // ------------------------ Testing Basic Output --------------------------- //
 // ========================================================================= //
 
-void BM_ExplicitRepetitions(benchmark::State& state) {
+static void BM_ExplicitRepetitions(benchmark::State& state) {
   for (auto _ : state) {
   }
 }
@@ -59,6 +59,7 @@
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -73,6 +74,7 @@
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -87,6 +89,7 @@
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -105,7 +108,7 @@
 // ------------------------ Testing Basic Output --------------------------- //
 // ========================================================================= //
 
-void BM_ImplicitRepetitions(benchmark::State& state) {
+static void BM_ImplicitRepetitions(benchmark::State& state) {
   for (auto _ : state) {
   }
 }
@@ -164,6 +167,7 @@
                        {"\"repetitions\": 3,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -177,6 +181,7 @@
                        {"\"repetitions\": 3,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -190,6 +195,7 @@
                        {"\"repetitions\": 3,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
diff --git a/third-party/benchmark/test/report_aggregates_only_test.cc b/third-party/benchmark/test/report_aggregates_only_test.cc
--- a/third-party/benchmark/test/report_aggregates_only_test.cc
+++ b/third-party/benchmark/test/report_aggregates_only_test.cc
@@ -19,17 +19,19 @@
 int main(int argc, char* argv[]) {
   const std::string output = GetFileReporterOutput(argc, argv);
 
-  if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 3 ||
+  if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 4 ||
       SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 ||
       SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") !=
           1 ||
       SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") !=
-          1) {
-    std::cout << "Precondition mismatch. Expected to only find three "
+          1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_cv\"") != 1) {
+    std::cout << "Precondition mismatch. Expected to only find four "
                  "occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n"
                  "\"name\": \"BM_SummaryRepeat/repeats:3_mean\", "
                  "\"name\": \"BM_SummaryRepeat/repeats:3_median\", "
-                 "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_cv\"\nThe entire "
                  "output:\n";
     std::cout << output;
     return 1;
diff --git a/third-party/benchmark/test/reporter_output_test.cc b/third-party/benchmark/test/reporter_output_test.cc
--- a/third-party/benchmark/test/reporter_output_test.cc
+++ b/third-party/benchmark/test/reporter_output_test.cc
@@ -1,5 +1,6 @@
 
 #undef NDEBUG
+#include <numeric>
 #include <utility>
 
 #include "benchmark/benchmark.h"
@@ -454,6 +455,7 @@
                        {"\"repetitions\": 2,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 2,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2_median\",$"},
                        {"\"family_index\": 15,$", MR_Next},
@@ -463,6 +465,7 @@
                        {"\"repetitions\": 2,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 2,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2_stddev\",$"},
                        {"\"family_index\": 15,$", MR_Next},
@@ -472,6 +475,7 @@
                        {"\"repetitions\": 2,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 2,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:2\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:2\",%csv_report$"},
@@ -519,6 +523,7 @@
                        {"\"repetitions\": 3,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 3,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3_median\",$"},
                        {"\"family_index\": 16,$", MR_Next},
@@ -528,6 +533,7 @@
                        {"\"repetitions\": 3,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 3,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3_stddev\",$"},
                        {"\"family_index\": 16,$", MR_Next},
@@ -537,6 +543,7 @@
                        {"\"repetitions\": 3,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 3,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:3\",%csv_report$"},
@@ -594,6 +601,7 @@
                        {"\"repetitions\": 4,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 4,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4_median\",$"},
                        {"\"family_index\": 17,$", MR_Next},
@@ -603,6 +611,7 @@
                        {"\"repetitions\": 4,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 4,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4_stddev\",$"},
                        {"\"family_index\": 17,$", MR_Next},
@@ -612,6 +621,7 @@
                        {"\"repetitions\": 4,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 4,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:4\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:4\",%csv_report$"},
@@ -661,6 +671,7 @@
            {"\"repetitions\": 3,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next},
            {"\"name\": \"BM_SummaryRepeat/repeats:3_median\",$"},
            {"\"family_index\": 19,$", MR_Next},
@@ -670,6 +681,7 @@
            {"\"repetitions\": 3,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next},
            {"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"},
            {"\"family_index\": 19,$", MR_Next},
@@ -679,6 +691,7 @@
            {"\"repetitions\": 3,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
                       {"^\"BM_SummaryRepeat/repeats:3_mean\",%csv_report$"},
@@ -709,6 +722,7 @@
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next},
            {"\"name\": \"BM_SummaryDisplay/repeats:2_median\",$"},
            {"\"family_index\": 20,$", MR_Next},
@@ -718,6 +732,7 @@
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next},
            {"\"name\": \"BM_SummaryDisplay/repeats:2_stddev\",$"},
            {"\"family_index\": 20,$", MR_Next},
@@ -727,6 +742,7 @@
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next}});
 ADD_CASES(TC_CSVOut,
           {{".*BM_SummaryDisplay/repeats:2 ", MR_Not},
@@ -761,6 +777,7 @@
            {"\"repetitions\": 3,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next},
            {"\"time_unit\": \"us\",?$"},
            {"\"name\": \"BM_RepeatTimeUnit/repeats:3_median\",$"},
@@ -771,6 +788,7 @@
            {"\"repetitions\": 3,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next},
            {"\"time_unit\": \"us\",?$"},
            {"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"},
@@ -781,6 +799,7 @@
            {"\"repetitions\": 3,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next},
            {"\"time_unit\": \"us\",?$"}});
 ADD_CASES(TC_CSVOut,
@@ -869,6 +888,7 @@
      {"\"repetitions\": 3,$", MR_Next},
      {"\"threads\": 1,$", MR_Next},
      {"\"aggregate_name\": \"mean\",$", MR_Next},
+     {"\"aggregate_unit\": \"time\",$", MR_Next},
      {"\"iterations\": 3,$", MR_Next},
      {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
      {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_median\",$"},
@@ -880,6 +900,7 @@
      {"\"repetitions\": 3,$", MR_Next},
      {"\"threads\": 1,$", MR_Next},
      {"\"aggregate_name\": \"median\",$", MR_Next},
+     {"\"aggregate_unit\": \"time\",$", MR_Next},
      {"\"iterations\": 3,$", MR_Next},
      {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
      {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_stddev\",$"},
@@ -891,6 +912,7 @@
      {"\"repetitions\": 3,$", MR_Next},
      {"\"threads\": 1,$", MR_Next},
      {"\"aggregate_name\": \"stddev\",$", MR_Next},
+     {"\"aggregate_unit\": \"time\",$", MR_Next},
      {"\"iterations\": 3,$", MR_Next},
      {"\"real_time\": %float,$", MR_Next},
      {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_\",$"},
@@ -902,6 +924,7 @@
      {"\"repetitions\": 3,$", MR_Next},
      {"\"threads\": 1,$", MR_Next},
      {"\"aggregate_name\": \"\",$", MR_Next},
+     {"\"aggregate_unit\": \"time\",$", MR_Next},
      {"\"iterations\": 3,$", MR_Next},
      {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}});
 ADD_CASES(
@@ -916,6 +939,154 @@
       "manual_time_stddev\",%csv_report$"},
      {"^\"BM_UserStats/iterations:5/repeats:3/manual_time_\",%csv_report$"}});
 
+// ========================================================================= //
+// ------------- Testing relative standard deviation statistics ------------ //
+// ========================================================================= //
+
+const auto UserPercentStatistics = [](const std::vector<double>&) {
+  return 1. / 100.;
+};
+void BM_UserPercentStats(benchmark::State& state) {
+  for (auto _ : state) {
+    state.SetIterationTime(150 / 10e8);
+  }
+}
+// clang-format off
+BENCHMARK(BM_UserPercentStats)
+  ->Repetitions(3)
+  ->Iterations(5)
+  ->UseManualTime()
+  ->Unit(benchmark::TimeUnit::kNanosecond)
+  ->ComputeStatistics("", UserPercentStatistics, benchmark::StatisticUnit::kPercentage);
+// clang-format on
+
+// Check that the user-provided percentage statistic is calculated and is
+// reported after the default ones. The empty string as its name is
+// intentional: it would sort before anything else.
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_UserPercentStats/iterations:5/repeats:3/manual_time [ "
+            "]* 150 ns %time [ ]*5$"},
+           {"^BM_UserPercentStats/iterations:5/repeats:3/manual_time [ "
+            "]* 150 ns %time [ ]*5$"},
+           {"^BM_UserPercentStats/iterations:5/repeats:3/manual_time [ "
+            "]* 150 ns %time [ ]*5$"},
+           {"^BM_UserPercentStats/iterations:5/repeats:3/"
+            "manual_time_mean [ ]* 150 ns %time [ ]*3$"},
+           {"^BM_UserPercentStats/iterations:5/repeats:3/"
+            "manual_time_median [ ]* 150 ns %time [ ]*3$"},
+           {"^BM_UserPercentStats/iterations:5/repeats:3/"
+            "manual_time_stddev [ ]* 0.000 ns %time [ ]*3$"},
+           {"^BM_UserPercentStats/iterations:5/repeats:3/manual_time_ "
+            "[ ]* 1.00 % [ ]* 1.00 %[ ]*3$"}});
+ADD_CASES(
+    TC_JSONOut,
+    {{"\"name\": \"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"family_index\": 23,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"repetition_index\": 0,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"iterations\": 5,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"family_index\": 23,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"repetition_index\": 1,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"iterations\": 5,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"family_index\": 23,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"repetition_index\": 2,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"iterations\": 5,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time_mean\",$"},
+     {"\"family_index\": 23,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"mean\",$", MR_Next},
+     {"\"aggregate_unit\": \"time\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time_median\",$"},
+     {"\"family_index\": 23,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"median\",$", MR_Next},
+     {"\"aggregate_unit\": \"time\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time_stddev\",$"},
+     {"\"family_index\": 23,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"stddev\",$", MR_Next},
+     {"\"aggregate_unit\": \"time\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": %float,$", MR_Next},
+     {"\"name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time_\",$"},
+     {"\"family_index\": 23,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"\",$", MR_Next},
+     {"\"aggregate_unit\": \"percentage\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": 1\\.(0)*e-(0)*2,$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+                       "manual_time\",%csv_report$"},
+                      {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+                       "manual_time\",%csv_report$"},
+                      {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+                       "manual_time\",%csv_report$"},
+                      {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+                       "manual_time_mean\",%csv_report$"},
+                      {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+                       "manual_time_median\",%csv_report$"},
+                      {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+                       "manual_time_stddev\",%csv_report$"},
+                      {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+                       "manual_time_\",%csv_report$"}});
+
 // ========================================================================= //
 // ------------------------- Testing StrEscape JSON ------------------------ //
 // ========================================================================= //
diff --git a/third-party/benchmark/test/skip_with_error_test.cc b/third-party/benchmark/test/skip_with_error_test.cc
--- a/third-party/benchmark/test/skip_with_error_test.cc
+++ b/third-party/benchmark/test/skip_with_error_test.cc
@@ -33,14 +33,14 @@
   typedef benchmark::BenchmarkReporter::Run Run;
 
   void CheckRun(Run const& run) const {
-    CHECK(name == run.benchmark_name())
+    BM_CHECK(name == run.benchmark_name())
         << "expected " << name << " got " << run.benchmark_name();
-    CHECK(error_occurred == run.error_occurred);
-    CHECK(error_message == run.error_message);
+    BM_CHECK(error_occurred == run.error_occurred);
+    BM_CHECK(error_message == run.error_message);
     if (error_occurred) {
-      // CHECK(run.iterations == 0);
+      // BM_CHECK(run.iterations == 0);
     } else {
-      CHECK(run.iterations != 0);
+      BM_CHECK(run.iterations != 0);
     }
   }
 };
@@ -97,7 +97,7 @@
 void BM_error_during_running(benchmark::State& state) {
   int first_iter = true;
   while (state.KeepRunning()) {
-    if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) {
+    if (state.range(0) == 1 && state.thread_index() <= (state.threads() / 2)) {
       assert(first_iter);
       first_iter = false;
       state.SkipWithError("error message");
@@ -119,12 +119,13 @@
 
 void BM_error_during_running_ranged_for(benchmark::State& state) {
   assert(state.max_iterations > 3 && "test requires at least a few iterations");
-  int first_iter = true;
+  bool first_iter = true;
   // NOTE: Users should not write the for loop explicitly.
   for (auto It = state.begin(), End = state.end(); It != End; ++It) {
     if (state.range(0) == 1) {
       assert(first_iter);
       first_iter = false;
+      (void)first_iter;
       state.SkipWithError("error message");
       // Test the unfortunate but documented behavior that the ranged-for loop
       // doesn't automatically terminate when SkipWithError is set.
@@ -142,7 +143,7 @@
   for (auto _ : state) {
     benchmark::DoNotOptimize(state.iterations());
   }
-  if (state.thread_index <= (state.threads / 2))
+  if (state.thread_index() <= (state.threads() / 2))
     state.SkipWithError("error message");
 }
 BENCHMARK(BM_error_after_running)->ThreadRange(1, 8);
@@ -154,7 +155,7 @@
 void BM_error_while_paused(benchmark::State& state) {
   bool first_iter = true;
   while (state.KeepRunning()) {
-    if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) {
+    if (state.range(0) == 1 && state.thread_index() <= (state.threads() / 2)) {
       assert(first_iter);
       first_iter = false;
       state.PauseTiming();
diff --git a/third-party/benchmark/test/spec_arg_test.cc b/third-party/benchmark/test/spec_arg_test.cc
new file mode 100644
--- /dev/null
+++ b/third-party/benchmark/test/spec_arg_test.cc
@@ -0,0 +1,95 @@
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+// Tests that we can override benchmark-spec value from FLAGS_benchmark_filter
+// with argument to RunSpecifiedBenchmarks(...).
+
+namespace {
+
+class TestReporter : public benchmark::ConsoleReporter {
+ public:
+  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
+    return ConsoleReporter::ReportContext(context);
+  };
+
+  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
+    assert(report.size() == 1);
+    matched_functions.push_back(report[0].run_name.function_name);
+    ConsoleReporter::ReportRuns(report);
+  };
+
+  TestReporter() {}
+
+  virtual ~TestReporter() {}
+
+  const std::vector<std::string>& GetMatchedFunctions() const {
+    return matched_functions;
+  }
+
+ private:
+  std::vector<std::string> matched_functions;
+};
+
+}  // end namespace
+
+static void BM_NotChosen(benchmark::State& state) {
+  assert(false && "SHOULD NOT BE CALLED");
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_NotChosen);
+
+static void BM_Chosen(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_Chosen);
+
+int main(int argc, char** argv) {
+  const std::string flag = "BM_NotChosen";
+
+  // Verify that argv specifies --benchmark_filter=BM_NotChosen.
+  bool found = false;
+  for (int i = 0; i < argc; ++i) {
+    if (strcmp("--benchmark_filter=BM_NotChosen", argv[i]) == 0) {
+      found = true;
+      break;
+    }
+  }
+  assert(found);
+
+  benchmark::Initialize(&argc, argv);
+
+  // Check that the current flag value is reported accurately via the
+  // GetBenchmarkFilter() function.
+  if (flag != benchmark::GetBenchmarkFilter()) {
+    std::cerr
+        << "Seeing different value for flags. GetBenchmarkFilter() returns ["
+        << benchmark::GetBenchmarkFilter() << "] expected flag=[" << flag
+        << "]\n";
+    return 1;
+  }
+  TestReporter test_reporter;
+  const char* const spec = "BM_Chosen";
+  const size_t returned_count =
+      benchmark::RunSpecifiedBenchmarks(&test_reporter, spec);
+  assert(returned_count == 1);
+  const std::vector<std::string> matched_functions =
+      test_reporter.GetMatchedFunctions();
+  assert(matched_functions.size() == 1);
+  if (strcmp(spec, matched_functions.front().c_str()) != 0) {
+    std::cerr << "Expected benchmark [" << spec << "] to run, but got ["
+              << matched_functions.front() << "]\n";
+    return 2;
+  }
+  return 0;
+}
diff --git a/third-party/benchmark/test/statistics_gtest.cc b/third-party/benchmark/test/statistics_gtest.cc
--- a/third-party/benchmark/test/statistics_gtest.cc
+++ b/third-party/benchmark/test/statistics_gtest.cc
@@ -25,4 +25,11 @@
                    1.151086443322134);
 }
 
+TEST(StatisticsTest, CV) {
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsCV({101, 101, 101, 101}), 0.0);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsCV({1, 2, 3}), 1. / 2.);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsCV({2.5, 2.4, 3.3, 4.2, 5.1}),
+                   0.32888184094918121);
+}
+
 }  // end namespace
diff --git a/third-party/benchmark/test/string_util_gtest.cc b/third-party/benchmark/test/string_util_gtest.cc
--- a/third-party/benchmark/test/string_util_gtest.cc
+++ b/third-party/benchmark/test/string_util_gtest.cc
@@ -2,8 +2,8 @@
 // statistics_test - Unit tests for src/statistics.cc
 //===---------------------------------------------------------------------===//
 
-#include "../src/string_util.h"
 #include "../src/internal_macros.h"
+#include "../src/string_util.h"
 #include "gtest/gtest.h"
 
 namespace {
@@ -32,7 +32,8 @@
 #elif ULONG_MAX == 0xFFFFFFFFFFFFFFFFul
   {
     size_t pos = 0;
-    EXPECT_EQ(0xFFFFFFFFFFFFFFFFul, benchmark::stoul("18446744073709551615", &pos));
+    EXPECT_EQ(0xFFFFFFFFFFFFFFFFul,
+              benchmark::stoul("18446744073709551615", &pos));
     EXPECT_EQ(20ul, pos);
   }
 #endif
@@ -62,91 +63,81 @@
     EXPECT_EQ(4ul, pos);
   }
 #ifndef BENCHMARK_HAS_NO_EXCEPTIONS
-  {
-    ASSERT_THROW(benchmark::stoul("this is a test"), std::invalid_argument);
-  }
+  { ASSERT_THROW(benchmark::stoul("this is a test"), std::invalid_argument); }
 #endif
 }
 
-TEST(StringUtilTest, stoi) {
-  {
-    size_t pos = 0;
-    EXPECT_EQ(0, benchmark::stoi("0", &pos));
-    EXPECT_EQ(1ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(-17, benchmark::stoi("-17", &pos));
-    EXPECT_EQ(3ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(1357, benchmark::stoi("1357", &pos));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(10, benchmark::stoi("1010", &pos, 2));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(520, benchmark::stoi("1010", &pos, 8));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(1010, benchmark::stoi("1010", &pos, 10));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(4112, benchmark::stoi("1010", &pos, 16));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(0xBEEF, benchmark::stoi("BEEF", &pos, 16));
-    EXPECT_EQ(4ul, pos);
-  }
+TEST(StringUtilTest, stoi){{size_t pos = 0;
+EXPECT_EQ(0, benchmark::stoi("0", &pos));
+EXPECT_EQ(1ul, pos);
+}
+{
+  size_t pos = 0;
+  EXPECT_EQ(-17, benchmark::stoi("-17", &pos));
+  EXPECT_EQ(3ul, pos);
+}
+{
+  size_t pos = 0;
+  EXPECT_EQ(1357, benchmark::stoi("1357", &pos));
+  EXPECT_EQ(4ul, pos);
+}
+{
+  size_t pos = 0;
+  EXPECT_EQ(10, benchmark::stoi("1010", &pos, 2));
+  EXPECT_EQ(4ul, pos);
+}
+{
+  size_t pos = 0;
+  EXPECT_EQ(520, benchmark::stoi("1010", &pos, 8));
+  EXPECT_EQ(4ul, pos);
+}
+{
+  size_t pos = 0;
+  EXPECT_EQ(1010, benchmark::stoi("1010", &pos, 10));
+  EXPECT_EQ(4ul, pos);
+}
+{
+  size_t pos = 0;
+  EXPECT_EQ(4112, benchmark::stoi("1010", &pos, 16));
+  EXPECT_EQ(4ul, pos);
+}
+{
+  size_t pos = 0;
+  EXPECT_EQ(0xBEEF, benchmark::stoi("BEEF", &pos, 16));
+  EXPECT_EQ(4ul, pos);
+}
 #ifndef BENCHMARK_HAS_NO_EXCEPTIONS
-  {
-    ASSERT_THROW(benchmark::stoi("this is a test"), std::invalid_argument);
-  }
+{ ASSERT_THROW(benchmark::stoi("this is a test"), std::invalid_argument); }
 #endif
 }
 
-TEST(StringUtilTest, stod) {
-  {
-    size_t pos = 0;
-    EXPECT_EQ(0.0, benchmark::stod("0", &pos));
-    EXPECT_EQ(1ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(-84.0, benchmark::stod("-84", &pos));
-    EXPECT_EQ(3ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(1234.0, benchmark::stod("1234", &pos));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(1.5, benchmark::stod("1.5", &pos));
-    EXPECT_EQ(3ul, pos);
-  }
-  {
-    size_t pos = 0;
-    /* Note: exactly representable as double */
-    EXPECT_EQ(-1.25e+9, benchmark::stod("-1.25e+9", &pos));
-    EXPECT_EQ(8ul, pos);
-  }
+TEST(StringUtilTest, stod){{size_t pos = 0;
+EXPECT_EQ(0.0, benchmark::stod("0", &pos));
+EXPECT_EQ(1ul, pos);
+}
+{
+  size_t pos = 0;
+  EXPECT_EQ(-84.0, benchmark::stod("-84", &pos));
+  EXPECT_EQ(3ul, pos);
+}
+{
+  size_t pos = 0;
+  EXPECT_EQ(1234.0, benchmark::stod("1234", &pos));
+  EXPECT_EQ(4ul, pos);
+}
+{
+  size_t pos = 0;
+  EXPECT_EQ(1.5, benchmark::stod("1.5", &pos));
+  EXPECT_EQ(3ul, pos);
+}
+{
+  size_t pos = 0;
+  /* Note: exactly representable as double */
+  EXPECT_EQ(-1.25e+9, benchmark::stod("-1.25e+9", &pos));
+  EXPECT_EQ(8ul, pos);
+}
 #ifndef BENCHMARK_HAS_NO_EXCEPTIONS
-  {
-    ASSERT_THROW(benchmark::stod("this is a test"), std::invalid_argument);
-  }
+{ ASSERT_THROW(benchmark::stod("this is a test"), std::invalid_argument); }
 #endif
 }
 
diff --git a/third-party/benchmark/test/templated_fixture_test.cc b/third-party/benchmark/test/templated_fixture_test.cc
--- a/third-party/benchmark/test/templated_fixture_test.cc
+++ b/third-party/benchmark/test/templated_fixture_test.cc
@@ -1,9 +1,9 @@
 
-#include "benchmark/benchmark.h"
-
 #include <cassert>
 #include <memory>
 
+#include "benchmark/benchmark.h"
+
 template <typename T>
 class MyFixture : public ::benchmark::Fixture {
  public:
diff --git a/third-party/benchmark/test/user_counters_tabular_test.cc b/third-party/benchmark/test/user_counters_tabular_test.cc
--- a/third-party/benchmark/test/user_counters_tabular_test.cc
+++ b/third-party/benchmark/test/user_counters_tabular_test.cc
@@ -18,12 +18,14 @@
       {"^BM_Counters_Tabular/repeats:2/threads:1 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
       {"^BM_Counters_Tabular/repeats:2/threads:1_mean %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
       {"^BM_Counters_Tabular/repeats:2/threads:1_median %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-      {"^BM_Counters_Tabular/repeats:2/threads:1_stddev %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+            {"^BM_Counters_Tabular/repeats:2/threads:1_stddev %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+            {"^BM_Counters_Tabular/repeats:2/threads:1_cv %console_percentage_report [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*%$", MR_Next},
       {"^BM_Counters_Tabular/repeats:2/threads:2 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
       {"^BM_Counters_Tabular/repeats:2/threads:2 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
       {"^BM_Counters_Tabular/repeats:2/threads:2_mean %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
       {"^BM_Counters_Tabular/repeats:2/threads:2_median %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-      {"^BM_Counters_Tabular/repeats:2/threads:2_stddev %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+            {"^BM_Counters_Tabular/repeats:2/threads:2_stddev %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+            {"^BM_Counters_Tabular/repeats:2/threads:2_cv %console_percentage_report [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*%$", MR_Next},
     {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
     {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
     {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
@@ -125,6 +127,7 @@
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -146,6 +149,7 @@
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -167,6 +171,29 @@
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1_cv\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"cv\",$", MR_Next},
+           {"\"aggregate_unit\": \"percentage\",$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -231,6 +258,7 @@
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 2,$", MR_Next},
            {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -252,6 +280,29 @@
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 2,$", MR_Next},
            {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2_cv\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 1,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 2,$", MR_Next},
+           {"\"aggregate_name\": \"cv\",$", MR_Next},
+           {"\"aggregate_unit\": \"percentage\",$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -278,6 +329,9 @@
 ADD_CASES(TC_CSVOut,
           {{"^\"BM_Counters_Tabular/repeats:2/threads:1_stddev\",%csv_report,"
             "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1_cv\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
 ADD_CASES(TC_CSVOut,
           {{"^\"BM_Counters_Tabular/repeats:2/threads:2\",%csv_report,"
             "%float,%float,%float,%float,%float,%float$"}});
@@ -293,6 +347,9 @@
 ADD_CASES(TC_CSVOut,
           {{"^\"BM_Counters_Tabular/repeats:2/threads:2_stddev\",%csv_report,"
             "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2_cv\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckTabular(Results const& e) {
diff --git a/third-party/benchmark/test/user_counters_test.cc b/third-party/benchmark/test/user_counters_test.cc
--- a/third-party/benchmark/test/user_counters_test.cc
+++ b/third-party/benchmark/test/user_counters_test.cc
@@ -26,7 +26,7 @@
   for (auto _ : state) {
   }
   state.counters["foo"] = 1;
-  state.counters["bar"] = 2 * (double)state.iterations();
+  state.counters["bar"] = 2 * static_cast<double>(state.iterations());
 }
 BENCHMARK(BM_Counters_Simple);
 ADD_CASES(TC_ConsoleOut,
diff --git a/third-party/benchmark/test/user_counters_thousands_test.cc b/third-party/benchmark/test/user_counters_thousands_test.cc
--- a/third-party/benchmark/test/user_counters_thousands_test.cc
+++ b/third-party/benchmark/test/user_counters_thousands_test.cc
@@ -96,6 +96,7 @@
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -115,6 +116,7 @@
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -134,6 +136,7 @@
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
diff --git a/third-party/benchmark/tools/gbench/Inputs/test4_run0.json b/third-party/benchmark/tools/gbench/Inputs/test4_run0.json
new file mode 100644
--- /dev/null
+++ b/third-party/benchmark/tools/gbench/Inputs/test4_run0.json
@@ -0,0 +1,21 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "whocares",
+      "run_type": "aggregate",
+      "aggregate_name": "zz",
+      "aggregate_unit": "percentage",
+      "iterations": 1000,
+      "real_time": 0.01,
+      "cpu_time": 0.10,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/third-party/benchmark/tools/gbench/Inputs/test4_run1.json b/third-party/benchmark/tools/gbench/Inputs/test4_run1.json
new file mode 100644
--- /dev/null
+++ b/third-party/benchmark/tools/gbench/Inputs/test4_run1.json
@@ -0,0 +1,21 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "whocares",
+      "run_type": "aggregate",
+      "aggregate_name": "zz",
+      "aggregate_unit": "percentage",
+      "iterations": 1000,
+      "real_time": 0.005,
+      "cpu_time": 0.15,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/third-party/benchmark/tools/gbench/report.py b/third-party/benchmark/tools/gbench/report.py
--- a/third-party/benchmark/tools/gbench/report.py
+++ b/third-party/benchmark/tools/gbench/report.py
@@ -7,7 +7,9 @@
 import copy
 import random
 
-from scipy.stats import mannwhitneyu
+from scipy.stats import mannwhitneyu, gmean
+from numpy import array
+from pandas import Timedelta
 
 
 class BenchmarkColor(object):
@@ -150,6 +152,30 @@
     return partitions
 
 
+def get_timedelta_field_as_seconds(benchmark, field_name):
+    """
+    Return benchmark[field_name], a duration expressed in the benchmark's
+    'time_unit' (seconds when that key is absent), converted to seconds.
+    """
+    unit = benchmark.get('time_unit', 's')
+    one_second = Timedelta(1, 's')
+    return Timedelta(benchmark[field_name], unit) / one_second
+
+
+def calculate_geomean(json):
+    """
+    Compute the geomean of the real and cpu times, converted to seconds,
+    over every benchmark whose run_type is not 'aggregate'.
+    """
+    # One [real_time, cpu_time] row per benchmark; aggregates are skipped.
+    rows = [[get_timedelta_field_as_seconds(entry, field)
+             for field in ('real_time', 'cpu_time')]
+            for entry in json['benchmarks']
+            if entry.get('run_type') != 'aggregate']
+    # With no rows there is nothing to average: hand back an empty array.
+    return gmean(rows) if rows else array([])
+
+
 def extract_field(partition, field_name):
     # The count of elements may be different. We want *all* of them.
     lhs = [x[field_name] for x in partition[0]]
@@ -174,6 +200,7 @@
 
     return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue
 
+
 def print_utest(bc_name, utest, utest_alpha, first_col_width, use_color=True):
     def get_utest_color(pval):
         return BC_FAIL if pval >= utest_alpha else BC_OKGREEN
@@ -242,7 +269,8 @@
         if utest:
             timings_cpu = extract_field(partition, 'cpu_time')
             timings_time = extract_field(partition, 'real_time')
-            have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(timings_cpu, timings_time)
+            have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(
+                timings_cpu, timings_time)
             if cpu_pvalue and time_pvalue:
                 utest_results = {
                     'have_optimal_repetitions': have_optimal_repetitions,
@@ -268,6 +296,25 @@
                 'utest': utest_results
             })
 
+    lhs_gmean = calculate_geomean(json1)
+    rhs_gmean = calculate_geomean(json2)
+    if lhs_gmean.any() and rhs_gmean.any():
+        diff_report.append({
+            'name': 'OVERALL_GEOMEAN',
+            'measurements': [{
+                'real_time': lhs_gmean[0],
+                'cpu_time': lhs_gmean[1],
+                'real_time_other': rhs_gmean[0],
+                'cpu_time_other': rhs_gmean[1],
+                'time': calculate_change(lhs_gmean[0], rhs_gmean[0]),
+                'cpu': calculate_change(lhs_gmean[1], rhs_gmean[1])
+            }],
+            'time_unit': 's',
+            'run_type': 'aggregate',
+            'aggregate_name': 'geomean',
+            'utest': {}
+        })
+
     return diff_report
 
 
@@ -307,19 +354,19 @@
         if not include_aggregates_only or not 'run_type' in benchmark or benchmark['run_type'] == 'aggregate':
             for measurement in benchmark['measurements']:
                 output_strs += [color_format(use_color,
-                                            fmt_str,
-                                            BC_HEADER,
-                                            benchmark['name'],
-                                            first_col_width,
-                                            get_color(measurement['time']),
-                                            measurement['time'],
-                                            get_color(measurement['cpu']),
-                                            measurement['cpu'],
-                                            measurement['real_time'],
-                                            measurement['real_time_other'],
-                                            measurement['cpu_time'],
-                                            measurement['cpu_time_other'],
-                                            endc=BC_ENDC)]
+                                             fmt_str,
+                                             BC_HEADER,
+                                             benchmark['name'],
+                                             first_col_width,
+                                             get_color(measurement['time']),
+                                             measurement['time'],
+                                             get_color(measurement['cpu']),
+                                             measurement['cpu'],
+                                             measurement['real_time'],
+                                             measurement['real_time_other'],
+                                             measurement['cpu_time'],
+                                             measurement['cpu_time_other'],
+                                             endc=BC_ENDC)]
 
         # After processing the measurements, if requested and
         # if applicable (e.g. u-test exists for given benchmark),
@@ -403,6 +450,7 @@
                 '-0.1000', '100', '110', '100', '90'],
             ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
             ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
+            ['OVERALL_GEOMEAN', '-0.8344', '-0.8026', '0', '0', '0', '0']
         ]
         output_lines_with_header = print_difference_report(
             self.json_diff_report, use_color=False)
@@ -489,6 +537,15 @@
                 'time_unit': 's',
                 'utest': {}
             },
+            {
+                'name': 'OVERALL_GEOMEAN',
+                'measurements': [{'real_time': 1.193776641714438e-06, 'cpu_time': 1.2144445585302297e-06,
+                                  'real_time_other': 1.9768988699420897e-07, 'cpu_time_other': 2.397447755209533e-07,
+                                  'time': -0.834399601997324, 'cpu': -0.8025889499549471}],
+                'time_unit': 's',
+                'run_type': 'aggregate',
+                'aggregate_name': 'geomean', 'utest': {}
+            },
         ]
         self.assertEqual(len(self.json_diff_report), len(expected_output))
         for out, expected in zip(
@@ -524,6 +581,7 @@
             ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
             ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
             ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
+            ['OVERALL_GEOMEAN', '-0.5000', '-0.5000', '0', '0', '0', '0']
         ]
         output_lines_with_header = print_difference_report(
             self.json_diff_report, use_color=False)
@@ -561,6 +619,16 @@
                 'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 30, 'real_time_other': 15, 'cpu_time': 30, 'cpu_time_other': 15}],
                 'time_unit': 'ns',
                 'utest': {}
+            },
+            {
+                'name': 'OVERALL_GEOMEAN',
+                'measurements': [{'real_time': 2.213363839400641e-08, 'cpu_time': 2.213363839400641e-08,
+                                  'real_time_other': 1.1066819197003185e-08, 'cpu_time_other': 1.1066819197003185e-08,
+                                  'time': -0.5000000000000009, 'cpu': -0.5000000000000009}],
+                'time_unit': 's',
+                'run_type': 'aggregate',
+                'aggregate_name': 'geomean',
+                'utest': {}
             }
         ]
         self.assertEqual(len(self.json_diff_report), len(expected_output))
@@ -599,8 +667,8 @@
             ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
             ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
             ['BM_Two_pvalue',
-             '0.6985',
-             '0.6985',
+             '1.0000',
+             '0.6667',
              'U',
              'Test,',
              'Repetitions:',
@@ -617,7 +685,7 @@
             ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
             ['short_pvalue',
              '0.7671',
-             '0.1489',
+             '0.2000',
              'U',
              'Test,',
              'Repetitions:',
@@ -631,6 +699,7 @@
              'repetitions',
              'recommended.'],
             ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
+            ['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
         ]
         output_lines_with_header = print_difference_report(
             self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False)
@@ -646,8 +715,8 @@
         expect_lines = [
             ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
             ['BM_Two_pvalue',
-             '0.6985',
-             '0.6985',
+             '1.0000',
+             '0.6667',
              'U',
              'Test,',
              'Repetitions:',
@@ -664,7 +733,7 @@
             ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
             ['short_pvalue',
              '0.7671',
-             '0.1489',
+             '0.2000',
              'U',
              'Test,',
              'Repetitions:',
@@ -677,6 +746,7 @@
              '9+',
              'repetitions',
              'recommended.'],
+            ['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
         ]
         output_lines_with_header = print_difference_report(
             self.json_diff_report, include_aggregates_only=True, utest=True, utest_alpha=0.05, use_color=False)
@@ -717,7 +787,7 @@
                 ],
                 'time_unit': 'ns',
                 'utest': {
-                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6985353583033387, 'time_pvalue': 0.6985353583033387
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6666666666666666, 'time_pvalue': 1.0
                 }
             },
             {
@@ -738,7 +808,7 @@
                 ],
                 'time_unit': 'ns',
                 'utest': {
-                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.14891467317876572, 'time_pvalue': 0.7670968684102772
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.2, 'time_pvalue': 0.7670968684102772
                 }
             },
             {
@@ -753,6 +823,16 @@
                 ],
                 'time_unit': 'ns',
                 'utest': {}
+            },
+            {
+                'name': 'OVERALL_GEOMEAN',
+                'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08,
+                                  'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08,
+                                  'time': 1.6404861082353634, 'cpu': -0.6984640740519662}],
+                'time_unit': 's',
+                'run_type': 'aggregate',
+                'aggregate_name': 'geomean',
+                'utest': {}
             }
         ]
         self.assertEqual(len(self.json_diff_report), len(expected_output))
@@ -792,8 +872,8 @@
             ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
             ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
             ['BM_Two_pvalue',
-             '0.6985',
-             '0.6985',
+             '1.0000',
+             '0.6667',
              'U',
              'Test,',
              'Repetitions:',
@@ -810,7 +890,7 @@
             ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
             ['short_pvalue',
              '0.7671',
-             '0.1489',
+             '0.2000',
              'U',
              'Test,',
              'Repetitions:',
@@ -823,7 +903,8 @@
              '9+',
              'repetitions',
              'recommended.'],
-             ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53']
+            ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
+            ['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
         ]
         output_lines_with_header = print_difference_report(
             self.json_diff_report,
@@ -865,7 +946,7 @@
                 ],
                 'time_unit': 'ns',
                 'utest': {
-                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6985353583033387, 'time_pvalue': 0.6985353583033387
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6666666666666666, 'time_pvalue': 1.0
                 }
             },
             {
@@ -886,7 +967,7 @@
                 ],
                 'time_unit': 'ns',
                 'utest': {
-                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.14891467317876572, 'time_pvalue': 0.7670968684102772
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.2, 'time_pvalue': 0.7670968684102772
                 }
             },
             {
@@ -898,11 +979,83 @@
                      'real_time': 8,
                      'cpu_time_other': 53,
                      'cpu': -0.3375
-                    }
+                     }
                 ],
                 'utest': {},
                 'time_unit': u'ns',
                 'aggregate_name': ''
+            },
+            {
+                'name': 'OVERALL_GEOMEAN',
+                'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08,
+                                  'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08,
+                                  'time': 1.6404861082353634, 'cpu': -0.6984640740519662}],
+                'time_unit': 's',
+                'run_type': 'aggregate',
+                'aggregate_name': 'geomean',
+                'utest': {}
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
+
+class TestReportDifferenceForPercentageAggregates(
+        unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput1 = os.path.join(testInputs, 'test4_run0.json')
+            testOutput2 = os.path.join(testInputs, 'test4_run1.json')
+            with open(testOutput1, 'r') as f:
+                json1 = json.load(f)
+            with open(testOutput2, 'r') as f:
+                json2 = json.load(f)
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(
+            json1, json2, utest=True)
+
+    def test_json_diff_report_pretty_printing(self):
+        expect_lines = [
+            ['whocares', '-0.5000', '+0.5000', '0', '0', '0', '0']
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report,
+            utest=True, utest_alpha=0.05, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                'name': u'whocares',
+                'measurements': [
+                    {'time': -0.5,
+                     'cpu': 0.5,
+                     'real_time': 0.01,
+                     'real_time_other': 0.005,
+                     'cpu_time': 0.10,
+                     'cpu_time_other': 0.15}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
             }
         ]
         self.assertEqual(len(self.json_diff_report), len(expected_output))
diff --git a/third-party/update_benchmark.sh b/third-party/update_benchmark.sh
new file mode 100755
--- /dev/null
+++ b/third-party/update_benchmark.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Refreshes the vendored copy of Google Benchmark: deletes ./benchmark,
+# clones https://github.com/google/benchmark.git in its place, and removes
+# the clone's .git* files and directories. All paths are relative to the
+# current working directory.
+#
+# NOTE(review): the banner below mentions dependencies (googletest), but
+# this script only clones benchmark itself.
+
+# The backticks are escaped so they are printed literally; unescaped, the
+# shell would try to run `benchmark` as a command substitution.
+echo "This script deletes \`benchmark\`, clones it from github, together"
+echo "with its dependencies. It then removes .git* files and dirs."
+echo "NOTE!!!"
+echo "Please double-check the benchmark github wiki for any changes"
+echo "to dependencies. Currently, these are limited to googletest."
+echo
+read -p "Press a key to continue, or Ctrl+C to cancel"
+
+rm -rf benchmark
+git clone https://github.com/google/benchmark.git
+rm -rf benchmark/.git*
+