Index: CMakeLists.txt
===================================================================
--- CMakeLists.txt
+++ CMakeLists.txt
@@ -56,6 +56,8 @@
 option(LIBCXX_ENABLE_FILESYSTEM
         "Build filesystem as part of libc++experimental.a" ${LIBCXX_ENABLE_EXPERIMENTAL_LIBRARY})
 option(LIBCXX_INCLUDE_TESTS "Build the libc++ tests." ${LLVM_INCLUDE_TESTS})
+option(LIBCXX_INCLUDE_BENCHMARKS "Build the libc++ benchmarks and their dependancies" OFF)
+option(LIBCXX_BUILD_BENCHMARK_NATIVE_STDLIB "Build the benchmarks against the native STL" OFF)
 option(LIBCXX_INCLUDE_DOCS "Build the libc++ documentation." ${LLVM_INCLUDE_DOCS})
 set(LIBCXX_LIBDIR_SUFFIX "${LLVM_LIBDIR_SUFFIX}" CACHE STRING
     "Define suffix of library directory name (32/64)")
@@ -426,6 +428,9 @@
 add_subdirectory(include)
 add_subdirectory(lib)
 
+if (LIBCXX_INCLUDE_BENCHMARKS)
+  add_subdirectory(benchmarks)
+endif()
 if (LIBCXX_INCLUDE_TESTS)
   add_subdirectory(test)
 endif()
Index: benchmarks/CMakeLists.txt
===================================================================
--- /dev/null
+++ benchmarks/CMakeLists.txt
@@ -0,0 +1,119 @@
+include(ExternalProject)
+include(CheckCXXCompilerFlag)
+
+#==============================================================================
+# Build Google Benchmark for libc++
+#==============================================================================
+check_cxx_compiler_flag(-stdlib=libc++ LIBCXX_HAS_NO_STDLIB_LIBCXX_FLAG)
+if (NOT LIBCXX_HAS_NO_STDLIB_LIBCXX_FLAG)
+  message(FATAL "Benchmark requires support for the -stdlib=libc++ flag")
+endif()
+
+set(BENCHMARK_LIBCXX_COMPILE_FLAGS
+    -Wno-unused-command-line-argument
+    -nostdinc++
+    -cxx-isystem ${LIBCXX_SOURCE_DIR}/include
+    -stdlib=libc++)
+set(BENCHMARK_LIBCXX_LINK_FLAGS
+    -L${LIBCXX_LIBRARY_DIR}
+    -Wl,-rpath,${LIBCXX_LIBRARY_DIR})
+split_list(BENCHMARK_LIBCXX_COMPILE_FLAGS)
+split_list(BENCHMARK_LIBCXX_LINK_FLAGS)
+
+ExternalProject_Add(google-benchmark-libcxx
+        EXCLUDE_FROM_ALL ON
+        DEPENDS cxx
+        PREFIX benchmark-libcxx
+        SOURCE_DIR ${LIBCXX_SOURCE_DIR}/utils/google-benchmark
+        INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/benchmark-libcxx
+        CMAKE_CACHE_DEFAULT_ARGS
+          -DCMAKE_BUILD_TYPE:STRING=RELEASE
+          -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
+          -DCMAKE_CXX_FLAGS:STRING=${BENCHMARK_LIBCXX_COMPILE_FLAGS}
+          -DCMAKE_SHARED_LINK_FLAGS:STRING=${BENCHMARK_LIBCXX_LINK_FLAGS}
+          -DCMAKE_EXE_LINK_FLAGS:STRING=${BENCHMARK_LIBCXX_LINK_FLAGS}
+          -DBENCHMARK_ENABLE_TESTING:BOOL=OFF)
+
+#==============================================================================
+# Build Google Benchmark for the native stdlib
+#==============================================================================
+if (LIBCXX_BUILD_BENCHMARK_NATIVE_STDLIB)
+  ExternalProject_Add(google-benchmark-native
+        EXCLUDE_FROM_ALL ON
+        PREFIX benchmark-native
+        SOURCE_DIR ${LIBCXX_SOURCE_DIR}/utils/google-benchmark
+        INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/benchmark-native
+        CMAKE_CACHE_ARGS
+          -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
+        CMAKE_CACHE_DEFAULT_ARGS
+          -DCMAKE_BUILD_TYPE:STRING=RELEASE
+          -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>)
+endif()
+
+#==============================================================================
+# Benchmark tests configuration
+#==============================================================================
+add_custom_target(libcxx-benchmarks)
+
+set(BENCHMARK_LIBCXX_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/benchmark-libcxx)
+set(BENCHMARK_NATIVE_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/benchmark-native)
+set(BENCHMARK_TEST_COMPILE_FLAGS
+    -std=c++14 -O2
+    -I${BENCHMARK_LIBCXX_INSTALL}/include
+)
+set(BENCHMARK_TEST_LIBCXX_COMPILE_FLAGS
+    -nostdinc++
+    ${BENCHMARK_TEST_COMPILE_FLAGS}
+    -Wno-user-defined-literals
+)
+set(BENCHMARK_TEST_LIBCXX_LINK_FLAGS
+    -nodefaultlibs
+    -L${BENCHMARK_LIBCXX_INSTALL}/lib/
+)
+set(BENCHMARK_TEST_NATIVE_LINK_FLAGS
+    -L${BENCHMARK_NATIVE_INSTALL}/lib/
+)
+split_list(BENCHMARK_TEST_COMPILE_FLAGS)
+split_list(BENCHMARK_TEST_LIBCXX_COMPILE_FLAGS)
+split_list(BENCHMARK_TEST_LIBCXX_LINK_FLAGS)
+split_list(BENCHMARK_TEST_NATIVE_LINK_FLAGS)
+macro(add_benchmark_test name source_file)
+  set(libcxx_target ${name}_libcxx)
+  add_executable(${libcxx_target} EXCLUDE_FROM_ALL ${source_file})
+  add_dependencies(${libcxx_target} cxx google-benchmark-libcxx)
+  add_dependencies(libcxx-benchmarks ${libcxx_target})
+  target_link_libraries(${libcxx_target} cxx -lbenchmark)
+  set_target_properties(${libcxx_target}
+    PROPERTIES
+          OUTPUT_NAME "${name}.libcxx.out"
+          COMPILE_FLAGS "${BENCHMARK_TEST_LIBCXX_COMPILE_FLAGS}"
+          LINK_FLAGS "${BENCHMARK_TEST_LIBCXX_LINK_FLAGS}")
+  if (LIBCXX_BUILD_BENCHMARK_NATIVE_STDLIB)
+    set(native_target ${name}_native)
+    add_executable(${native_target} EXCLUDE_FROM_ALL ${source_file})
+    add_dependencies(${native_target} google-benchmark-native)
+    target_link_libraries(${native_target} -lbenchmark)
+    if (LIBCXX_HAS_PTHREAD_LIB)
+      target_link_libraries(${native_target} -pthread)
+    endif()
+    add_dependencies(libcxx-benchmarks ${native_target})
+    set_target_properties(${native_target}
+      PROPERTIES
+          OUTPUT_NAME "${name}.native.out"
+          INCLUDE_DIRECTORIES ""
+          COMPILE_FLAGS "${BENCHMARK_TEST_COMPILE_FLAGS}"
+          LINK_FLAGS "${BENCHMARK_TEST_NATIVE_LINK_FLAGS}")
+  endif()
+endmacro()
+
+
+#==============================================================================
+# Register Benchmark tests
+#==============================================================================
+file(GLOB BENCHMARK_TESTS "*.bench.cpp")
+foreach(test_path ${BENCHMARK_TESTS})
+  get_filename_component(test_file "${test_path}" NAME)
+  message(STATUS "TEST: ${test_file}")
+  string(REPLACE ".bench.cpp" "" test_name "${test_file}")
+  add_benchmark_test(${test_name} ${test_file})
+endforeach()
Index: benchmarks/ContainerBenchmarks.hpp
===================================================================
--- /dev/null
+++ benchmarks/ContainerBenchmarks.hpp
@@ -0,0 +1,69 @@
+#ifndef BENCHMARK_CONTAINER_BENCHMARKS_HPP
+#define BENCHMARK_CONTAINER_BENCHMARKS_HPP
+
+#include <cassert>
+
+#include "benchmark/benchmark_api.h"
+
+namespace ContainerBenchmarks {
+
+template <class Container, class GenInputs>
+void BM_InsertValue(benchmark::State& st, Container c, GenInputs gen) {
+    auto in = gen(st.range_x());
+    const auto end = in.end();
+    while (st.KeepRunning()) {
+        c.clear();
+        for (auto it = in.begin(); it != end; ++it) {
+            benchmark::DoNotOptimize(&(*c.insert(*it).first));
+        }
+        benchmark::ClobberMemory();
+    }
+}
+
+template <class Container, class GenInputs>
+void BM_InsertValueRehash(benchmark::State& st, Container c, GenInputs gen) {
+    auto in = gen(st.range_x());
+    const auto end = in.end();
+    while (st.KeepRunning()) {
+        c.clear();
+        c.rehash(16);
+        for (auto it = in.begin(); it != end; ++it) {
+            benchmark::DoNotOptimize(&(*c.insert(*it).first));
+        }
+        benchmark::ClobberMemory();
+    }
+}
+
+template <class Container, class GenInputs>
+static void BM_Find(benchmark::State& st, Container c, GenInputs gen) {
+    auto in = gen(st.range_x());
+    c.insert(in.begin(), in.end());
+    benchmark::DoNotOptimize(&(*c.begin()));
+    const auto end = in.data() + in.size();
+    while (st.KeepRunning()) {
+        for (auto it = in.data(); it != end; ++it) {
+            benchmark::DoNotOptimize(&(*c.find(*it)));
+        }
+        benchmark::ClobberMemory();
+    }
+}
+
+template <class Container, class GenInputs>
+static void BM_FindRehash(benchmark::State& st, Container c, GenInputs gen) {
+    c.rehash(8);
+    auto in = gen(st.range_x());
+    c.insert(in.begin(), in.end());
+    benchmark::DoNotOptimize(&(*c.begin()));
+    const auto end = in.data() + in.size();
+    while (st.KeepRunning()) {
+        for (auto it = in.data(); it != end; ++it) {
+            benchmark::DoNotOptimize(&(*c.find(*it)));
+        }
+        benchmark::ClobberMemory();
+    }
+
+}
+
+} // end namespace ContainerBenchmarks
+
+#endif // BENCHMARK_CONTAINER_BENCHMARKS_HPP
Index: benchmarks/GenerateInput.hpp
===================================================================
--- /dev/null
+++ benchmarks/GenerateInput.hpp
@@ -0,0 +1,133 @@
+#ifndef BENCHMARK_GENERATE_INPUT_HPP
+#define BENCHMARK_GENERATE_INPUT_HPP
+
+#include <algorithm>
+#include <random>
+#include <vector>
+#include <string>
+#include <climits>
+#include <cstddef>
+
+static const char Letters[] = {
+    '0','1','2','3','4',
+    '5','6','7','8','9',
+    'A','B','C','D','E','F',
+    'G','H','I','J','K',
+    'L','M','N','O','P',
+    'Q','R','S','T','U',
+    'V','W','X','Y','Z',
+    'a','b','c','d','e','f',
+    'g','h','i','j','k',
+    'l','m','n','o','p',
+    'q','r','s','t','u',
+    'v','w','x','y','z'
+};
+static const std::size_t LettersSize = sizeof(Letters);
+
+inline std::default_random_engine& getRandomEngine() {
+    static std::default_random_engine RandEngine(std::random_device{}());
+    return RandEngine;
+}
+
+inline char getRandomChar() {
+    std::uniform_int_distribution<> LettersDist(0, LettersSize-1);
+    return Letters[LettersDist(getRandomEngine())];
+}
+
+template <class IntT>
+inline IntT getRandomInteger() {
+    std::uniform_int_distribution<IntT> dist;
+    return dist(getRandomEngine());
+}
+
+inline std::string getRandomString(std::size_t Len) {
+    std::string str(Len, 0);
+    std::generate_n(str.begin(), Len, &getRandomChar);
+    return str;
+}
+
+template <class IntT>
+inline std::vector<IntT> getDuplicateIntegerInputs(size_t N) {
+    std::vector<IntT> inputs(N, static_cast<IntT>(-1));
+    return inputs;
+}
+
+template <class IntT>
+inline std::vector<IntT> getSortedIntegerInputs(size_t N) {
+    std::vector<IntT> inputs;
+    for (size_t i=0; i < N; i += 1)
+        inputs.push_back(i);
+    return inputs;
+}
+
+template <class IntT>
+std::vector<IntT> getSortedLargeIntegerInputs(size_t N) {
+    std::vector<IntT> inputs;
+    for (size_t i=0; i < N; ++i) {
+        inputs.push_back(i + N);
+    }
+    return inputs;
+}
+
+template <class IntT>
+std::vector<IntT> getSortedTopBitsIntegerInputs(size_t N) {
+    std::vector<IntT> inputs = getSortedIntegerInputs<IntT>(N);
+    for (auto& E : inputs) E <<= ((sizeof(IntT) / 2) * CHAR_BIT);
+    return inputs;
+}
+
+template <class IntT>
+inline std::vector<IntT> getReverseSortedIntegerInputs(size_t N) {
+    std::vector<IntT> inputs;
+    std::size_t i = N;
+    while (i > 0) {
+        --i;
+        inputs.push_back(i);
+    }
+    return inputs;
+}
+
+template <class IntT>
+std::vector<IntT> getPipeOrganIntegerInputs(size_t N) {
+    std::vector<IntT> v; v.reserve(N);
+    for (size_t i = 0; i < N/2; ++i) v.push_back(i);
+    for (size_t i = N/2; i < N; ++i) v.push_back(N - i);
+    return v;
+}
+
+
+template <class IntT>
+std::vector<IntT> getRandomIntegerInputs(size_t N) {
+    std::vector<IntT> inputs;
+    for (size_t i=0; i < N; ++i) {
+        inputs.push_back(getRandomInteger<IntT>());
+    }
+    return inputs;
+}
+
+inline std::vector<std::string> getDuplicateStringInputs(size_t N) {
+    std::vector<std::string> inputs(N, getRandomString(1024));
+    return inputs;
+}
+
+inline std::vector<std::string> getRandomStringInputs(size_t N) {
+    std::vector<std::string> inputs;
+    for (int i=0; i < N; ++i) {
+        inputs.push_back(getRandomString(1024));
+    }
+    return inputs;
+}
+
+inline std::vector<std::string> getSortedStringInputs(size_t N) {
+    std::vector<std::string> inputs = getRandomStringInputs(N);
+    std::sort(inputs.begin(), inputs.end());
+    return inputs;
+}
+
+inline std::vector<std::string> getReverseSortedStringInputs(size_t N) {
+    std::vector<std::string> inputs = getSortedStringInputs(N);
+    std::reverse(inputs.begin(), inputs.end());
+    return inputs;
+}
+
+#endif // BENCHMARK_GENERATE_INPUT_HPP
Index: benchmarks/algorithms.bench.cpp
===================================================================
--- /dev/null
+++ benchmarks/algorithms.bench.cpp
@@ -0,0 +1,62 @@
+#include <unordered_set>
+#include <vector>
+#include <cstdint>
+
+#include "benchmark/benchmark_api.h"
+#include "GenerateInput.hpp"
+
+constexpr std::size_t TestNumInputs = 1024;
+
+template <class GenInputs>
+void BM_Sort(benchmark::State& st, GenInputs gen) {
+    using ValueType = typename decltype(gen(0))::value_type;
+    const auto in = gen(st.range_x());
+    std::vector<ValueType> inputs[5];
+    auto reset_inputs = [&]() {
+        for (auto& C : inputs) {
+            C = in;
+            benchmark::DoNotOptimize(C.data());
+        }
+    };
+    reset_inputs();
+    while (st.KeepRunning()) {
+        for (auto& I : inputs) {
+            std::sort(I.data(), I.data() + I.size());
+            benchmark::DoNotOptimize(I.data());
+        }
+        st.PauseTiming();
+        reset_inputs();
+        benchmark::ClobberMemory();
+        st.ResumeTiming();
+    }
+}
+
+BENCHMARK_CAPTURE(BM_Sort, random_uint32,
+    getRandomIntegerInputs<uint32_t>)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_Sort, sorted_ascending_uint32,
+    getSortedIntegerInputs<uint32_t>)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_Sort, sorted_descending_uint32,
+    getReverseSortedIntegerInputs<uint32_t>)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_Sort, single_element_uint32,
+    getDuplicateIntegerInputs<uint32_t>)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_Sort, pipe_organ_uint32,
+    getPipeOrganIntegerInputs<uint32_t>)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_Sort, random_strings,
+    getRandomStringInputs)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_Sort, sorted_ascending_strings,
+    getSortedStringInputs)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_Sort, sorted_descending_strings,
+    getReverseSortedStringInputs)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_Sort, single_element_strings,
+    getDuplicateStringInputs)->Arg(TestNumInputs);
+
+
+BENCHMARK_MAIN()
Index: benchmarks/unordered_set_operations.bench.cpp
===================================================================
--- benchmarks/unordered_set_operations.bench.cpp
+++ benchmarks/unordered_set_operations.bench.cpp
@@ -1,44 +1,268 @@
 #include <unordered_set>
 #include <vector>
+#include <functional>
 #include <cstdint>
+#include <cstdlib>
+#include <cstring>
 
 #include "benchmark/benchmark_api.h"
 
-template <class IntT>
-std::vector<IntT> getInputs(size_t N) {
-    std::vector<IntT> inputs;
-    for (size_t i=0; i < N; ++i) {
-        inputs.push_back(i);
-    }
-    return inputs;
+#include "ContainerBenchmarks.hpp"
+#include "GenerateInput.hpp"
+
+using namespace ContainerBenchmarks;
+
+constexpr std::size_t TestNumInputs = 1024;
+
+template <class _Size>
+inline __attribute__((__always_inline__))
+_Size loadword(const void* __p) {
+    _Size __r;
+    std::memcpy(&__r, __p, sizeof(__r));
+    return __r;
 }
 
-template <class Container, class Inputs>
-void BM_SetInsert(benchmark::State& st, Container c, Inputs const& in) {
-    const auto end = in.end();
-    while (st.KeepRunning()) {
-        c.clear();
-        for (auto it = in.begin(); it != end; ++it) {
-            benchmark::DoNotOptimize(c.insert(*it));
-        }
-        benchmark::DoNotOptimize(c);
-    }
+inline __attribute__((__always_inline__))
+std::size_t rotate_by_at_least_1(std::size_t __val, int __shift) {
+    return (__val >> __shift) | (__val << (64 - __shift));
+}
+
+inline __attribute__((__always_inline__))
+std::size_t hash_len_16(std::size_t __u, std::size_t __v) {
+    const  std::size_t __mul = 0x9ddfea08eb382d69ULL;
+    std::size_t __a = (__u ^ __v) * __mul;
+    __a ^= (__a >> 47);
+    std::size_t __b = (__v ^ __a) * __mul;
+    __b ^= (__b >> 47);
+    __b *= __mul;
+    return __b;
+}
+
+
+template <std::size_t _Len>
+inline __attribute__((__always_inline__))
+std::size_t hash_len_0_to_8(const char* __s) {
+    static_assert(_Len == 4 || _Len == 8, "");
+    const uint64_t __a = loadword<uint32_t>(__s);
+    const uint64_t __b = loadword<uint32_t>(__s + _Len - 4);
+    return hash_len_16(_Len + (__a << 3), __b);
 }
-BENCHMARK_CAPTURE(BM_SetInsert, uint32_insert,
-    std::unordered_set<uint32_t>{}, getInputs<uint32_t>(1024));
 
-template <class Container, class Inputs>
-void BM_SetFind(benchmark::State& st, Container c, Inputs const& in) {
-    c.insert(in.begin(), in.end());
-    const auto end = in.end();
+struct UInt32Hash {
+  UInt32Hash() = default;
+  inline __attribute__((__always_inline__))
+  std::size_t operator()(uint32_t data) const {
+      return hash_len_0_to_8<4>(reinterpret_cast<const char*>(&data));
+  }
+};
+
+struct UInt64Hash {
+  UInt64Hash() = default;
+  inline __attribute__((__always_inline__))
+  std::size_t operator()(uint64_t data) const {
+      return hash_len_0_to_8<8>(reinterpret_cast<const char*>(&data));
+  }
+};
+
+struct UInt128Hash {
+  UInt128Hash() = default;
+  inline __attribute__((__always_inline__))
+  std::size_t operator()(__uint128_t data) const {
+      const __uint128_t __mask = static_cast<std::size_t>(-1);
+      const std::size_t __a = (std::size_t)(data & __mask);
+      const std::size_t __b = (std::size_t)((data & (__mask << 64)) >> 64);
+      return hash_len_16(__a, rotate_by_at_least_1(__b + 16, 16)) ^ __b;
+  }
+};
+
+struct UInt32Hash2 {
+  UInt32Hash2() = default;
+  inline __attribute__((__always_inline__))
+  std::size_t operator()(uint32_t data) const {
+      const uint32_t __m = 0x5bd1e995;
+      const uint32_t __r = 24;
+      uint32_t __h = 4;
+      uint32_t __k = data;
+        __k *= __m;
+        __k ^= __k >> __r;
+        __k *= __m;
+        __h *= __m;
+        __h ^= __k;
+        __h ^= __h >> 13;
+        __h *= __m;
+        __h ^= __h >> 15;
+    return __h;
+  }
+};
+
+struct UInt64Hash2 {
+  UInt64Hash2() = default;
+  inline __attribute__((__always_inline__))
+  std::size_t operator()(uint64_t data) const {
+      return hash_len_0_to_8<8>(reinterpret_cast<const char*>(&data));
+  }
+};
+
+//----------------------------------------------------------------------------//
+//                               BM_Hash
+// ---------------------------------------------------------------------------//
+
+template <class HashFn, class GenInputs>
+void BM_Hash(benchmark::State& st, HashFn fn, GenInputs gen) {
+    auto in = gen(st.range_x());
+    const auto end = in.data() + in.size();
+    std::size_t last_hash = 0;
+    benchmark::DoNotOptimize(&last_hash);
     while (st.KeepRunning()) {
-        for (auto it = in.begin(); it != end; ++it) {
-            benchmark::DoNotOptimize(c.find(*it));
+        for (auto it = in.data(); it != end; ++it) {
+            benchmark::DoNotOptimize(last_hash += fn(*it));
         }
+        benchmark::ClobberMemory();
     }
 }
-BENCHMARK_CAPTURE(BM_SetFind, uint32_lookup,
-    std::unordered_set<uint32_t>{}, getInputs<uint32_t>(1024));
 
+BENCHMARK_CAPTURE(BM_Hash,
+    uint32_random_std_hash,
+    std::hash<uint32_t>{},
+    getRandomIntegerInputs<uint32_t>) -> Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_Hash,
+    uint32_random_custom_hash,
+    UInt32Hash{},
+    getRandomIntegerInputs<uint32_t>) -> Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_Hash,
+    uint32_top_std_hash,
+    std::hash<uint32_t>{},
+    getSortedTopBitsIntegerInputs<uint32_t>) -> Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_Hash,
+    uint32_top_custom_hash,
+    UInt32Hash{},
+    getSortedTopBitsIntegerInputs<uint32_t>) -> Arg(TestNumInputs);
+
+
+//----------------------------------------------------------------------------//
+//                       BM_InsertValue
+// ---------------------------------------------------------------------------//
+
+
+// Sorted Assending // 
+BENCHMARK_CAPTURE(BM_InsertValue,
+    unordered_set_uint32,
+    std::unordered_set<uint32_t>{},
+    getRandomIntegerInputs<uint32_t>)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_InsertValue,
+    unordered_set_uint32_sorted,
+    std::unordered_set<uint32_t>{},
+    getSortedIntegerInputs<uint32_t>)->Arg(TestNumInputs);
+
+// Top Bytes // 
+BENCHMARK_CAPTURE(BM_InsertValue,
+    unordered_set_top_bits_uint32,
+    std::unordered_set<uint32_t>{},
+    getSortedTopBitsIntegerInputs<uint32_t>)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_InsertValueRehash,
+    unordered_set_top_bits_uint32,
+    std::unordered_set<uint32_t, UInt32Hash>{},
+    getSortedTopBitsIntegerInputs<uint32_t>)->Arg(TestNumInputs);
+
+// String //
+BENCHMARK_CAPTURE(BM_InsertValue,
+    unordered_set_string,
+    std::unordered_set<std::string>{},
+    getRandomStringInputs)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_InsertValueRehash,
+    unordered_set_string,
+    std::unordered_set<std::string>{},
+    getRandomStringInputs)->Arg(TestNumInputs);
+
+//----------------------------------------------------------------------------//
+//                         BM_Find
+// ---------------------------------------------------------------------------//
+
+// Random //
+BENCHMARK_CAPTURE(BM_Find,
+    unordered_set_random_uint64,
+    std::unordered_set<uint64_t>{},
+    getRandomIntegerInputs<uint64_t>)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_FindRehash,
+    unordered_set_random_uint64,
+    std::unordered_set<uint64_t, UInt64Hash>{},
+    getRandomIntegerInputs<uint64_t>)->Arg(TestNumInputs);
+
+// Sorted //
+BENCHMARK_CAPTURE(BM_Find,
+    unordered_set_sorted_uint64,
+    std::unordered_set<uint64_t>{},
+    getSortedIntegerInputs<uint64_t>)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_FindRehash,
+    unordered_set_sorted_uint64,
+    std::unordered_set<uint64_t, UInt64Hash>{},
+    getSortedIntegerInputs<uint64_t>)->Arg(TestNumInputs);
+
+
+// Sorted //
+#if 1
+BENCHMARK_CAPTURE(BM_Find,
+    unordered_set_sorted_uint128,
+    std::unordered_set<__uint128_t, UInt128Hash>{},
+    getSortedTopBitsIntegerInputs<__uint128_t>)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_FindRehash,
+    unordered_set_sorted_uint128,
+    std::unordered_set<__uint128_t, UInt128Hash>{},
+    getSortedTopBitsIntegerInputs<__uint128_t>)->Arg(TestNumInputs);
+#endif
+
+// Sorted //
+BENCHMARK_CAPTURE(BM_Find,
+    unordered_set_sorted_uint32,
+    std::unordered_set<uint32_t>{},
+    getSortedIntegerInputs<uint32_t>)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_FindRehash,
+    unordered_set_sorted_uint32,
+    std::unordered_set<uint32_t, UInt32Hash2>{},
+    getSortedIntegerInputs<uint32_t>)->Arg(TestNumInputs);
+
+// Sorted Ascending //
+BENCHMARK_CAPTURE(BM_Find,
+    unordered_set_sorted_large_uint64,
+    std::unordered_set<uint64_t>{},
+    getSortedLargeIntegerInputs<uint64_t>)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_FindRehash,
+    unordered_set_sorted_large_uint64,
+    std::unordered_set<uint64_t, UInt64Hash>{},
+    getSortedLargeIntegerInputs<uint64_t>)->Arg(TestNumInputs);
+
+
+// Top Bits //
+BENCHMARK_CAPTURE(BM_Find,
+    unordered_set_top_bits_uint64,
+    std::unordered_set<uint64_t>{},
+    getSortedTopBitsIntegerInputs<uint64_t>)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_FindRehash,
+    unordered_set_top_bits_uint64,
+    std::unordered_set<uint64_t, UInt64Hash>{},
+    getSortedTopBitsIntegerInputs<uint64_t>)->Arg(TestNumInputs);
+
+// String //
+BENCHMARK_CAPTURE(BM_Find,
+    unordered_set_string,
+    std::unordered_set<std::string>{},
+    getRandomStringInputs)->Arg(TestNumInputs);
+
+BENCHMARK_CAPTURE(BM_FindRehash,
+    unordered_set_string,
+    std::unordered_set<std::string>{},
+    getRandomStringInputs)->Arg(TestNumInputs);
 
 BENCHMARK_MAIN()
Index: docs/BuildingLibcxx.rst
===================================================================
--- docs/BuildingLibcxx.rst
+++ docs/BuildingLibcxx.rst
@@ -227,7 +227,7 @@
   libc++abi is the C++ ABI library used.
 
 
-libc++ Feature options
+libc++ Feature Options
 ----------------------
 
 .. option:: LIBCXX_ENABLE_EXCEPTIONS:BOOL
@@ -242,9 +242,25 @@
 
   Build libc++ with run time type information.
 
+.. option:: LIBCXX_INCLUDE_BENCHMARKS:BOOL
 
-libc++ Feature options
-----------------------
+  **Default**: ``OFF``
+
+  Build the libc++ benchmark tests and the Google Benchmark library needed
+  to support them.
+
+.. option:: LIBCXX_BUILD_BENCHMARK_NATIVE_STDLIB:BOOL
+
+  **Default**:: ``OFF``
+
+  Build the libc++ benchmark tests and Google Benchmark library against the
+  native standard library on the platform. On linux this can be used to compare
+  libc++ to libstdc++ by building the benchmark tests against both standard
+  libraries.
+
+
+libc++ ABI Feature Options
+--------------------------
 
 The following options allow building libc++ for a different ABI version.
 
Index: docs/TestingLibcxx.rst
===================================================================
--- docs/TestingLibcxx.rst
+++ docs/TestingLibcxx.rst
@@ -198,3 +198,62 @@
   If ``LIBCXX_COLOR_DIAGNOSTICS`` is defined then the test suite will attempt
   to use color diagnostic outputs from the compiler.
   Also see :option:`color_diagnostics`.
+
+Benchmarks
+==========
+
+Libc++ contains benchmark tests separately from the test of the test suite.
+The benchmarks are written using the `Google Benchmark`_ library, a copy of which
+is stored in the libc++ repository.
+
+For more information about using the Google Benchmark library see the
+`official documentation <https://github.com/google/benchmark>`_.
+
+.. _`Google Benchmark`: https://github.com/google/benchmark
+
+Building Benchmarks
+-------------------
+
+The benchmark tests are not enabled by default. To build the benchmarks
+libc++ must be configured using the CMake option ``-DLIBCXX_INCLUDE_BENCHMARKS=ON``.
+Then the benchmarks can be built using the ``libcxx-benchmarks`` target.
+
+An example build would look like:
+
+.. code-block:: bash
+
+  $ cd build
+  $ cmake [options] -DLIBCXX_INCLUDE_BENCHMARKS=ON <path to libcxx sources>
+  $ make libcxx-benchmarks
+
+This will build all of the benchmarks under ``<libcxx-src>/benchmarks`` to be
+built against the just-built libc++. The compiled tests are output into
+``build/benchmarks``.
+
+The benchmarks can also be built against the platforms native standard library
+using the ``-DLIBCXX_BUILD_BENCHMARKS_NATIVE_STDLIB=ON`` CMake option. This
+is useful for comparing the performance of libc++ to other standard libraries.
+The compiled benchmarks are named ``<test>.libcxx.out`` if they test libc++ and
+``<test>.native.out`` otherwise.
+
+Also See:
+
+  * :ref:`Building Libc++ <build instructions>`
+  * :ref:`CMake Options`
+
+Running Benchmarks
+------------------
+
+The benchmarks must be run manually by the user. Currently there is no way
+to run them as part of the build.
+
+For example:
+
+.. code-block:: bash
+
+  $ cd build/benchmarks
+  $ make libcxx-benchmarks
+  $ ./algorithms.libcxx.out # Runs all the benchmarks
+  $ ./algorithms.libcxx.out --benchmark_filter=BM_Sort.* # Only runs the sort benchmarks
+
+For more information about running benchmarks see `Google Benchmark`_.
Index: utils/google-benchmark/.gitignore
===================================================================
--- /dev/null
+++ utils/google-benchmark/.gitignore
@@ -0,0 +1,46 @@
+*.a
+*.so
+*.so.?*
+*.dll
+*.exe
+*.dylib
+*.cmake
+!/cmake/*.cmake
+*~
+*.pyc
+__pycache__
+
+# lcov
+*.lcov
+/lcov
+
+# cmake files.
+/Testing
+CMakeCache.txt
+CMakeFiles/
+cmake_install.cmake
+
+# makefiles.
+Makefile
+
+# in-source build.
+bin/
+lib/
+/test/*_test
+
+# exuberant ctags.
+tags
+
+# YouCompleteMe configuration.
+.ycm_extra_conf.pyc
+
+# ninja generated files.
+.ninja_deps
+.ninja_log
+build.ninja
+install_manifest.txt
+rules.ninja
+
+# out-of-source build top-level folders.
+build/
+_build/
Index: utils/google-benchmark/AUTHORS
===================================================================
--- /dev/null
+++ utils/google-benchmark/AUTHORS
@@ -0,0 +1,33 @@
+# This is the official list of benchmark authors for copyright purposes.
+# This file is distinct from the CONTRIBUTORS files.
+# See the latter for an explanation.
+#
+# Names should be added to this file as:
+#	Name or Organization <email address>
+# The email address is not required for organizations.
+#
+# Please keep the list sorted.
+
+Albert Pretorius <pretoalb@gmail.com>
+Arne Beer <arne@twobeer.de>
+Christopher Seymour <chris.j.seymour@hotmail.com>
+David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
+Dominic Hamon <dma@stripysock.com>
+Eugene Zhuk <eugene.zhuk@gmail.com>
+Evgeny Safronov <division494@gmail.com>
+Felix Homann <linuxaudio@showlabor.de>
+Google Inc.
+Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
+JianXiong Zhou <zhoujianxiong2@gmail.com>
+Jussi Knuuttila <jussi.knuuttila@gmail.com>
+Kaito Udagawa <umireon@gmail.com>
+Lei Xu <eddyxu@gmail.com>
+Matt Clarkson <mattyclarkson@gmail.com>
+Oleksandr Sochka <sasha.sochka@gmail.com>
+Paul Redmond <paul.redmond@gmail.com>
+Radoslav Yovchev <radoslav.tm@gmail.com>
+Shuo Chen <chenshuo@chenshuo.com>
+Yusuke Suzuki <utatane.tea@gmail.com>
+Dirac Research 
+Zbigniew Skowron <zbychs@gmail.com>
+Dominik Czarnota <dominik.b.czarnota@gmail.com>
Index: utils/google-benchmark/CMakeLists.txt
===================================================================
--- /dev/null
+++ utils/google-benchmark/CMakeLists.txt
@@ -0,0 +1,147 @@
+cmake_minimum_required (VERSION 2.8.11)
+project (benchmark)
+
+foreach(p
+    CMP0054 # CMake 3.1
+    CMP0056 # export EXE_LINKER_FLAGS to try_run
+    )
+  if(POLICY ${p})
+    cmake_policy(SET ${p} NEW)
+  endif()
+endforeach()
+
+option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
+option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
+# Make sure we can import out CMake functions
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+
+# Read the git tags to determine the project version
+include(GetGitVersion)
+get_git_version(GIT_VERSION)
+
+# Tell the user what versions we are using
+string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION})
+message("-- Version: ${VERSION}")
+
+# The version of the libraries
+set(GENERIC_LIB_VERSION ${VERSION})
+string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION)
+
+# Import our CMake modules
+include(CheckCXXCompilerFlag)
+include(AddCXXCompilerFlag)
+include(CXXFeatureCheck)
+
+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+  # Turn compiler warnings up to 11
+  string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
+  add_definitions(-D_CRT_SECURE_NO_WARNINGS)
+
+  # Link time optimisation
+  if (BENCHMARK_ENABLE_LTO)
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GL")
+    set(CMAKE_STATIC_LINKER_FLAGS_RELEASE "${CMAKE_STATIC_LINKER_FLAGS_RELEASE} /LTCG")
+    set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /LTCG")
+    set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG")
+
+    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /GL")
+    string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO}")
+    set(CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO} /LTCG")
+    string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO}")
+    set(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO} /LTCG")
+    string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO}")
+    set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} /LTCG")
+
+    set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /GL")
+    set(CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL "${CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL} /LTCG")
+    set(CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL "${CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL} /LTCG")
+    set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG")
+  endif()
+else()
+  # Try and enable C++11. Don't use C++14 because it doesn't work in some
+  # configurations.
+  add_cxx_compiler_flag(-std=c++11)
+  if (NOT HAVE_CXX_FLAG_STD_CXX11)
+    add_cxx_compiler_flag(-std=c++0x)
+  endif()
+
+  # Turn compiler warnings up to 11
+  add_cxx_compiler_flag(-Wall)
+
+  add_cxx_compiler_flag(-Wextra)
+  add_cxx_compiler_flag(-Wshadow)
+  add_cxx_compiler_flag(-Werror RELEASE)
+  add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
+  add_cxx_compiler_flag(-Werror MINSIZEREL)
+  add_cxx_compiler_flag(-pedantic)
+  add_cxx_compiler_flag(-pedantic-errors)
+  add_cxx_compiler_flag(-Wshorten-64-to-32)
+  add_cxx_compiler_flag(-Wfloat-equal)
+  add_cxx_compiler_flag(-Wzero-as-null-pointer-constant)
+  add_cxx_compiler_flag(-fstrict-aliasing)
+  if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
+    add_cxx_compiler_flag(-Wstrict-aliasing)
+  endif()
+  add_cxx_compiler_flag(-Wthread-safety)
+  if (HAVE_WTHREAD_SAFETY)
+    add_definitions(-DHAVE_WTHREAD_SAFETY)
+    cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
+  endif()
+
+  # Link time optimisation
+  if (BENCHMARK_ENABLE_LTO)
+    add_cxx_compiler_flag(-flto)
+    if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
+      find_program(GCC_AR gcc-ar)
+      if (GCC_AR)
+        set(CMAKE_AR ${GCC_AR})
+      endif()
+      find_program(GCC_RANLIB gcc-ranlib)
+      if (GCC_RANLIB)
+        set(CMAKE_RANLIB ${GCC_RANLIB})
+      endif()
+    endif()
+  endif()
+
+  # Coverage build type
+  set(CMAKE_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG}" CACHE STRING
+    "Flags used by the C++ compiler during coverage builds."
+    FORCE)
+  set(CMAKE_EXE_LINKER_FLAGS_COVERAGE
+    "${CMAKE_EXE_LINKER_FLAGS_DEBUG}" CACHE STRING
+    "Flags used for linking binaries during coverage builds."
+    FORCE)
+  set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE
+    "${CMAKE_SHARED_LINKER_FLAGS_DEBUG}" CACHE STRING
+    "Flags used by the shared libraries linker during coverage builds."
+    FORCE)
+  mark_as_advanced(
+    CMAKE_CXX_FLAGS_COVERAGE
+    CMAKE_EXE_LINKER_FLAGS_COVERAGE
+    CMAKE_SHARED_LINKER_FLAGS_COVERAGE)
+  set(CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING
+    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel Coverage."
+    FORCE)
+  add_cxx_compiler_flag(--coverage COVERAGE)
+endif()
+
+# C++ feature checks
+cxx_feature_check(STD_REGEX)
+cxx_feature_check(GNU_POSIX_REGEX)
+cxx_feature_check(POSIX_REGEX)
+cxx_feature_check(STEADY_CLOCK)
+
+# Ensure we have pthreads
+find_package(Threads REQUIRED)
+
+# Set up directories
+include_directories(${PROJECT_SOURCE_DIR}/include)
+
+# Build the targets
+add_subdirectory(src)
+
+if (BENCHMARK_ENABLE_TESTING)
+  enable_testing()
+  add_subdirectory(test)
+endif()
Index: utils/google-benchmark/CONTRIBUTING.md
===================================================================
--- /dev/null
+++ utils/google-benchmark/CONTRIBUTING.md
@@ -0,0 +1,58 @@
+# How to contribute #
+
+We'd love to accept your patches and contributions to this project.  There are
+a just a few small guidelines you need to follow.
+
+
+## Contributor License Agreement ##
+
+Contributions to any Google project must be accompanied by a Contributor
+License Agreement.  This is not a copyright **assignment**, it simply gives
+Google permission to use and redistribute your contributions as part of the
+project.
+
+  * If you are an individual writing original source code and you're sure you
+    own the intellectual property, then you'll need to sign an [individual
+    CLA][].
+
+  * If you work for a company that wants to allow you to contribute your work,
+    then you'll need to sign a [corporate CLA][].
+
+You generally only need to submit a CLA once, so if you've already submitted
+one (even if it was for a different project), you probably don't need to do it
+again.
+
+[individual CLA]: https://developers.google.com/open-source/cla/individual
+[corporate CLA]: https://developers.google.com/open-source/cla/corporate
+
+Once your CLA is submitted (or if you already submitted one for
+another Google project), make a commit adding yourself to the
+[AUTHORS][] and [CONTRIBUTORS][] files. This commit can be part
+of your first [pull request][].
+
+[AUTHORS]: AUTHORS
+[CONTRIBUTORS]: CONTRIBUTORS
+
+
+## Submitting a patch ##
+
+  1. It's generally best to start by opening a new issue describing the bug or
+     feature you're intending to fix.  Even if you think it's relatively minor,
+     it's helpful to know what people are working on.  Mention in the initial
+     issue that you are planning to work on that bug or feature so that it can
+     be assigned to you.
+
+  1. Follow the normal process of [forking][] the project, and setup a new
+     branch to work in.  It's important that each group of changes be done in
+     separate branches in order to ensure that a pull request only includes the
+     commits related to that bug or feature.
+
+  1. Do your best to have [well-formed commit messages][] for each change.
+     This provides consistency throughout the project, and ensures that commit
+     messages are able to be formatted properly by various git tools.
+
+  1. Finally, push the commits to your fork and submit a [pull request][].
+
+[forking]: https://help.github.com/articles/fork-a-repo
+[well-formed commit messages]: http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html
+[pull request]: https://help.github.com/articles/creating-a-pull-request
Index: utils/google-benchmark/CONTRIBUTORS
===================================================================
--- /dev/null
+++ utils/google-benchmark/CONTRIBUTORS
@@ -0,0 +1,52 @@
+# People who have agreed to one of the CLAs and can contribute patches.
+# The AUTHORS file lists the copyright holders; this file
+# lists people.  For example, Google employees are listed here
+# but not in AUTHORS, because Google holds the copyright.
+#
+# Names should be added to this file only after verifying that
+# the individual or the individual's organization has agreed to
+# the appropriate Contributor License Agreement, found here:
+#
+# https://developers.google.com/open-source/cla/individual
+# https://developers.google.com/open-source/cla/corporate
+#
+# The agreement for individuals can be filled out on the web.
+#
+# When adding J Random Contributor's name to this file,
+# either J's name or J's organization's name should be
+# added to the AUTHORS file, depending on whether the
+# individual or corporate CLA was used.
+#
+# Names should be added to this file as:
+#     Name <email address>
+#
+# Please keep the list sorted.
+
+Albert Pretorius <pretoalb@gmail.com>
+Arne Beer <arne@twobeer.de>
+Billy Robert O'Neal III <billy.oneal@gmail.com> <bion@microsoft.com>
+Chris Kennelly <ckennelly@google.com> <ckennelly@ckennelly.com>
+Christopher Seymour <chris.j.seymour@hotmail.com>
+David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
+Dominic Hamon <dma@stripysock.com>
+Eric Fiselier <eric@efcs.ca>
+Eugene Zhuk <eugene.zhuk@gmail.com>
+Evgeny Safronov <division494@gmail.com>
+Felix Homann <linuxaudio@showlabor.de>
+Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
+JianXiong Zhou <zhoujianxiong2@gmail.com>
+Jussi Knuuttila <jussi.knuuttila@gmail.com>
+Kaito Udagawa <umireon@gmail.com>
+Kai Wolf <kai.wolf@gmail.com>
+Lei Xu <eddyxu@gmail.com>
+Matt Clarkson <mattyclarkson@gmail.com>
+Oleksandr Sochka <sasha.sochka@gmail.com>
+Pascal Leroy <phl@google.com>
+Paul Redmond <paul.redmond@gmail.com>
+Pierre Phaneuf <pphaneuf@google.com>
+Radoslav Yovchev <radoslav.tm@gmail.com>
+Shuo Chen <chenshuo@chenshuo.com>
+Yusuke Suzuki <utatane.tea@gmail.com>
+Tobias Ulvgård <tobias.ulvgard@dirac.se>
+Zbigniew Skowron <zbychs@gmail.com>
+Dominik Czarnota <dominik.b.czarnota@gmail.com>
Index: utils/google-benchmark/LICENSE
===================================================================
--- /dev/null
+++ utils/google-benchmark/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
Index: utils/google-benchmark/README.LLVM
===================================================================
--- /dev/null
+++ utils/google-benchmark/README.LLVM
@@ -0,0 +1,6 @@
+LLVM notes
+----------
+
+This directory contains the Google Benchmark source code with some unnecessary
+files removed. Note that this directory is under a different license than
+libc++.
Index: utils/google-benchmark/README.md
===================================================================
--- /dev/null
+++ utils/google-benchmark/README.md
@@ -0,0 +1,510 @@
+# benchmark
+[![Build Status](https://travis-ci.org/google/benchmark.svg?branch=master)](https://travis-ci.org/google/benchmark)
+[![Build status](https://ci.appveyor.com/api/projects/status/u0qsyp7t1tk7cpxs/branch/master?svg=true)](https://ci.appveyor.com/project/google/benchmark/branch/master)
+[![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark)
+
+A library to support the benchmarking of functions, similar to unit-tests.
+
+Discussion group: https://groups.google.com/d/forum/benchmark-discuss
+
+IRC channel: https://freenode.net #googlebenchmark
+
+## Example usage
+### Basic usage
+Define a function that executes the code to be measured.
+
+```c++
+static void BM_StringCreation(benchmark::State& state) {
+  while (state.KeepRunning())
+    std::string empty_string;
+}
+// Register the function as a benchmark
+BENCHMARK(BM_StringCreation);
+
+// Define another benchmark
+static void BM_StringCopy(benchmark::State& state) {
+  std::string x = "hello";
+  while (state.KeepRunning())
+    std::string copy(x);
+}
+BENCHMARK(BM_StringCopy);
+
+BENCHMARK_MAIN();
+```
+
+### Passing arguments
+Sometimes a family of benchmarks can be implemented with just one routine that
+takes an extra argument to specify which one of the family of benchmarks to
+run. For example, the following code defines a family of benchmarks for
+measuring the speed of `memcpy()` calls of different lengths:
+
+```c++
+static void BM_memcpy(benchmark::State& state) {
+  char* src = new char[state.range_x()];
+  char* dst = new char[state.range_x()];
+  memset(src, 'x', state.range_x());
+  while (state.KeepRunning())
+    memcpy(dst, src, state.range_x());
+  state.SetBytesProcessed(int64_t(state.iterations()) *
+                          int64_t(state.range_x()));
+  delete[] src;
+  delete[] dst;
+}
+BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
+```
+
+The preceding code is quite repetitive, and can be replaced with the following
+short-hand. The following invocation will pick a few appropriate arguments in
+the specified range and will generate a benchmark for each such argument.
+
+```c++
+BENCHMARK(BM_memcpy)->Range(8, 8<<10);
+```
+
+By default the arguments in the range are generated in multiples of eight and
+the command above selects [ 8, 64, 512, 4k, 8k ]. In the following code the
+range multiplier is changed to multiples of two.
+
+```c++
+BENCHMARK(BM_memcpy)->RangeMultiplier(2)->Range(8, 8<<10);
+```
+Now arguments generated are [ 8, 16, 32, 64, 128, 256, 512, 1024, 2k, 4k, 8k ].
+
+You might have a benchmark that depends on two inputs. For example, the
+following code defines a family of benchmarks for measuring the speed of set
+insertion.
+
+```c++
+static void BM_SetInsert(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    state.PauseTiming();
+    std::set<int> data = ConstructRandomSet(state.range_x());
+    state.ResumeTiming();
+    for (int j = 0; j < state.range_y(); ++j)
+      data.insert(RandomNumber());
+  }
+}
+BENCHMARK(BM_SetInsert)
+    ->ArgPair(1<<10, 1)
+    ->ArgPair(1<<10, 8)
+    ->ArgPair(1<<10, 64)
+    ->ArgPair(1<<10, 512)
+    ->ArgPair(8<<10, 1)
+    ->ArgPair(8<<10, 8)
+    ->ArgPair(8<<10, 64)
+    ->ArgPair(8<<10, 512);
+```
+
+The preceding code is quite repetitive, and can be replaced with the following
+short-hand. The following macro will pick a few appropriate arguments in the
+product of the two specified ranges and will generate a benchmark for each such
+pair.
+
+```c++
+BENCHMARK(BM_SetInsert)->RangePair(1<<10, 8<<10, 1, 512);
+```
+
+For more complex patterns of inputs, passing a custom function to `Apply` allows
+programmatic specification of an arbitrary set of arguments on which to run the
+benchmark. The following example enumerates a dense range on one parameter,
+and a sparse range on the second.
+
+```c++
+static void CustomArguments(benchmark::internal::Benchmark* b) {
+  for (int i = 0; i <= 10; ++i)
+    for (int j = 32; j <= 1024*1024; j *= 8)
+      b->ArgPair(i, j);
+}
+BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
+```
+
+### Calculate asymptotic complexity (Big O)
+Asymptotic complexity might be calculated for a family of benchmarks. The
+following code will calculate the coefficient for the high-order term in the
+running time and the normalized root-mean square error of string comparison.
+
+```c++
+static void BM_StringCompare(benchmark::State& state) {
+  std::string s1(state.range_x(), '-');
+  std::string s2(state.range_x(), '-');
+  while (state.KeepRunning()) {
+    benchmark::DoNotOptimize(s1.compare(s2));
+  }
+  state.SetComplexityN(state.range_x());
+}
+BENCHMARK(BM_StringCompare)
+    ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(benchmark::oN);
+```
+
+As shown in the following invocation, asymptotic complexity might also be
+calculated automatically.
+
+```c++
+BENCHMARK(BM_StringCompare)
+    ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity();
+```
+
+The following code will specify asymptotic complexity with a lambda function,
+that might be used to customize high-order term calculation.
+
+```c++
+BENCHMARK(BM_StringCompare)->RangeMultiplier(2)
+    ->Range(1<<10, 1<<18)->Complexity([](int n)->double{return n; });
+```
+
+### Templated benchmarks
+Templated benchmarks work the same way: This example produces and consumes
+messages of size `sizeof(v)` `range_x` times. It also outputs throughput in the
+absence of multiprogramming.
+
+```c++
+template <class Q> int BM_Sequential(benchmark::State& state) {
+  Q q;
+  typename Q::value_type v;
+  while (state.KeepRunning()) {
+    for (int i = state.range_x(); i--; )
+      q.push(v);
+    for (int e = state.range_x(); e--; )
+      q.Wait(&v);
+  }
+  // actually messages, not bytes:
+  state.SetBytesProcessed(
+      static_cast<int64_t>(state.iterations())*state.range_x());
+}
+BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
+```
+
+Three macros are provided for adding benchmark templates.
+
+```c++
+#if __cplusplus >= 201103L // C++11 and greater.
+#define BENCHMARK_TEMPLATE(func, ...) // Takes any number of parameters.
+#else // C++ < C++11
+#define BENCHMARK_TEMPLATE(func, arg1)
+#endif
+#define BENCHMARK_TEMPLATE1(func, arg1)
+#define BENCHMARK_TEMPLATE2(func, arg1, arg2)
+```
+
+## Passing arbitrary arguments to a benchmark
+In C++11 it is possible to define a benchmark that takes an arbitrary number
+of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)`
+macro creates a benchmark that invokes `func`  with the `benchmark::State` as
+the first argument followed by the specified `args...`.
+The `test_case_name` is appended to the name of the benchmark and
+should describe the values passed.
+
+```c++
+template <class ...ExtraArgs>`
+void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
+  [...]
+}
+// Registers a benchmark named "BM_takes_args/int_string_test` that passes
+// the specified values to `extra_args`.
+BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
+```
+Note that elements of `...args` may refer to global variables. Users should
+avoid modifying global state inside of a benchmark.
+
+### Multithreaded benchmarks
+In a multithreaded test (benchmark invoked by multiple threads simultaneously),
+it is guaranteed that none of the threads will start until all have called
+`KeepRunning`, and all will have finished before KeepRunning returns false. As
+such, any global setup or teardown can be wrapped in a check against the thread
+index:
+
+```c++
+static void BM_MultiThreaded(benchmark::State& state) {
+  if (state.thread_index == 0) {
+    // Setup code here.
+  }
+  while (state.KeepRunning()) {
+    // Run the test as normal.
+  }
+  if (state.thread_index == 0) {
+    // Teardown code here.
+  }
+}
+BENCHMARK(BM_MultiThreaded)->Threads(2);
+```
+
+If the benchmarked code itself uses threads and you want to compare it to
+single-threaded code, you may want to use real-time ("wallclock") measurements
+for latency comparisons:
+
+```c++
+BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();
+```
+
+Without `UseRealTime`, CPU time is used by default.
+
+
+## Manual timing
+For benchmarking something for which neither CPU time nor real-time are
+correct or accurate enough, completely manual timing is supported using
+the `UseManualTime` function. 
+
+When `UseManualTime` is used, the benchmarked code must call
+`SetIterationTime` once per iteration of the `KeepRunning` loop to
+report the manually measured time.
+
+An example use case for this is benchmarking GPU execution (e.g. OpenCL
+or CUDA kernels, OpenGL or Vulkan or Direct3D draw calls), which cannot
+be accurately measured using CPU time or real-time. Instead, they can be
+measured accurately using a dedicated API, and these measurement results
+can be reported back with `SetIterationTime`.
+
+```c++
+static void BM_ManualTiming(benchmark::State& state) {
+  int microseconds = state.range_x();
+  std::chrono::duration<double, std::micro> sleep_duration {
+    static_cast<double>(microseconds)
+  };
+
+  while (state.KeepRunning()) {
+    auto start = std::chrono::high_resolution_clock::now();
+    // Simulate some useful workload with a sleep
+    std::this_thread::sleep_for(sleep_duration);
+    auto end   = std::chrono::high_resolution_clock::now();
+
+    auto elapsed_seconds =
+      std::chrono::duration_cast<std::chrono::duration<double>>(
+        end - start);
+
+    state.SetIterationTime(elapsed_seconds.count());
+  }
+}
+BENCHMARK(BM_ManualTiming)->Range(1, 1<<17)->UseManualTime();
+```
+
+### Preventing optimisation
+To prevent a value or expression from being optimized away by the compiler
+the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()`
+functions can be used.
+
+```c++
+static void BM_test(benchmark::State& state) {
+  while (state.KeepRunning()) {
+      int x = 0;
+      for (int i=0; i < 64; ++i) {
+        benchmark::DoNotOptimize(x += i);
+      }
+  }
+}
+```
+
+`DoNotOptimize(<expr>)` forces the  *result* of `<expr>` to be stored in either
+memory or a register. For GNU based compilers it acts as read/write barrier
+for global memory. More specifically it forces the compiler to flush pending
+writes to memory and reload any other values as necessary.
+
+Note that `DoNotOptimize(<expr>)` does not prevent optimizations on `<expr>`
+in any way. `<expr>` may even be removed entirely when the result is already
+known. For example:
+
+```c++
+  /* Example 1: `<expr>` is removed entirely. */
+  int foo(int x) { return x + 42; }
+  while (...) DoNotOptimize(foo(0)); // Optimized to DoNotOptimize(42);
+
+  /*  Example 2: Result of '<expr>' is only reused */
+  int bar(int) __attribute__((const));
+  while (...) DoNotOptimize(bar(0)); // Optimized to:
+  // int __result__ = bar(0);
+  // while (...) DoNotOptimize(__result__);
+```
+
+The second tool for preventing optimizations is `ClobberMemory()`. In essence
+`ClobberMemory()` forces the compiler to perform all pending writes to global
+memory. Memory managed by block scope objects must be "escaped" using
+`DoNotOptimize(...)` before it can be clobbered. In the below example
+`ClobberMemory()` prevents the call to `v.push_back(42)` from being optimized
+away.
+
+```c++
+static void BM_vector_push_back(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    std::vector<int> v;
+    v.reserve(1);
+    benchmark::DoNotOptimize(v.data()); // Allow v.data() to be clobbered.
+    v.push_back(42);
+    benchmark::ClobberMemory(); // Force 42 to be written to memory.
+  }
+}
+```
+
+Note that `ClobberMemory()` is only available for GNU based compilers.
+
+### Set time unit manually
+If a benchmark runs a few milliseconds it may be hard to visually compare the
+measured times, since the output data is given in nanoseconds per default. In
+order to manually set the time unit, you can specify it manually:
+
+```c++
+BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
+```
+
+## Controlling number of iterations
+In all cases, the number of iterations for which the benchmark is run is
+governed by the amount of time the benchmark takes. Concretely, the number of
+iterations is at least one, not more than 1e9, until CPU time is greater than
+the minimum time, or the wallclock time is 5x minimum time. The minimum time is
+set as a flag `--benchmark_min_time` or per benchmark by calling `MinTime` on
+the registered benchmark object.
+
+## Reporting the mean and standard devation by repeated benchmarks
+By default each benchmark is run once and that single result is reported.
+However benchmarks are often noisy and a single result may not be representative
+of the overall behavior. For this reason it's possible to repeatedly rerun the
+benchmark.
+
+The number of runs of each benchmark is specified globally by the
+`--benchmark_repetitions` flag or on a per benchmark basis by calling
+`Repetitions` on the registered benchmark object. When a benchmark is run
+more than once the mean and standard deviation of the runs will be reported.
+
+## Fixtures
+Fixture tests are created by
+first defining a type that derives from ::benchmark::Fixture and then
+creating/registering the tests using the following macros:
+
+* `BENCHMARK_F(ClassName, Method)`
+* `BENCHMARK_DEFINE_F(ClassName, Method)`
+* `BENCHMARK_REGISTER_F(ClassName, Method)`
+
+For Example:
+
+```c++
+class MyFixture : public benchmark::Fixture {};
+
+BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
+   while (st.KeepRunning()) {
+     ...
+  }
+}
+
+BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) {
+   while (st.KeepRunning()) {
+     ...
+  }
+}
+/* BarTest is NOT registered */
+BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2);
+/* BarTest is now registered */
+```
+
+## Exiting Benchmarks in Error
+
+When errors caused by external influences, such as file I/O and network
+communication, occur within a benchmark the
+`State::SkipWithError(const char* msg)` function can be used to skip that run
+of benchmark and report the error. Note that only future iterations of the
+`KeepRunning()` are skipped. Users may explicitly return to exit the
+benchmark immediately.
+
+The `SkipWithError(...)` function may be used at any point within the benchmark,
+including before and after the `KeepRunning()` loop.
+
+For example:
+
+```c++
+static void BM_test(benchmark::State& state) {
+  auto resource = GetResource();
+  if (!resource.good()) {
+      state.SkipWithError("Resource is not good!");
+      // KeepRunning() loop will not be entered.
+  }
+  while (state.KeepRunning()) {
+      auto data = resource.read_data();
+      if (!resource.good()) {
+        state.SkipWithError("Failed to read data!");
+        break; // Needed to skip the rest of the iteration.
+     }
+     do_stuff(data);
+  }
+}
+```
+
+## Output Formats
+The library supports multiple output formats. Use the
+`--benchmark_format=<tabular|json|csv>` flag to set the format type. `tabular` is
+the default format.
+
+The Tabular format is intended to be a human readable format. By default
+the format generates color output. Context is output on stderr and the 
+tabular data on stdout. Example tabular output looks like:
+```
+Benchmark                               Time(ns)    CPU(ns) Iterations
+----------------------------------------------------------------------
+BM_SetInsert/1024/1                        28928      29349      23853  133.097kB/s   33.2742k items/s
+BM_SetInsert/1024/8                        32065      32913      21375  949.487kB/s   237.372k items/s
+BM_SetInsert/1024/10                       33157      33648      21431  1.13369MB/s   290.225k items/s
+```
+
+The JSON format outputs human readable json split into two top level attributes.
+The `context` attribute contains information about the run in general, including
+information about the CPU and the date.
+The `benchmarks` attribute contains a list of ever benchmark run. Example json
+output looks like:
+``` json
+{
+  "context": {
+    "date": "2015/03/17-18:40:25",
+    "num_cpus": 40,
+    "mhz_per_cpu": 2801,
+    "cpu_scaling_enabled": false,
+    "build_type": "debug"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_SetInsert/1024/1",
+      "iterations": 94877,
+      "real_time": 29275,
+      "cpu_time": 29836,
+      "bytes_per_second": 134066,
+      "items_per_second": 33516
+    },
+    {
+      "name": "BM_SetInsert/1024/8",
+      "iterations": 21609,
+      "real_time": 32317,
+      "cpu_time": 32429,
+      "bytes_per_second": 986770,
+      "items_per_second": 246693
+    },
+    {
+      "name": "BM_SetInsert/1024/10",
+      "iterations": 21393,
+      "real_time": 32724,
+      "cpu_time": 33355,
+      "bytes_per_second": 1199226,
+      "items_per_second": 299807
+    }
+  ]
+}
+```
+
+The CSV format outputs comma-separated values. The `context` is output on stderr
+and the CSV itself on stdout. Example CSV output looks like:
+```
+name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label
+"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942,
+"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115,
+"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06,
+```
+
+## Debug vs Release
+By default, benchmark builds as a debug library. You will see a warning in the output when this is the case. To build it as a release library instead, use:
+
+```
+cmake -DCMAKE_BUILD_TYPE=Release
+```
+
+To enable link-time optimisation, use
+
+```
+cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_LTO=true
+```
+
+## Linking against the library
+When using gcc, it is necessary to link against pthread to avoid runtime exceptions. This is due to how gcc implements std::thread. See [issue #67](https://github.com/google/benchmark/issues/67) for more details.
Index: utils/google-benchmark/cmake/AddCXXCompilerFlag.cmake
===================================================================
--- /dev/null
+++ utils/google-benchmark/cmake/AddCXXCompilerFlag.cmake
@@ -0,0 +1,37 @@
+# - Adds a compiler flag if it is supported by the compiler
+#
+# This function checks that the supplied compiler flag is supported and then
+# adds it to the corresponding compiler flags
+#
+#  add_cxx_compiler_flag(<FLAG> [<VARIANT>])
+#
+# - Example
+#
+# include(AddCXXCompilerFlag)
+# add_cxx_compiler_flag(-Wall)
+# add_cxx_compiler_flag(-no-strict-aliasing RELEASE)
+# Requires CMake 2.6+
+
+if(__add_cxx_compiler_flag)
+  return()
+endif()
+set(__add_cxx_compiler_flag INCLUDED)
+
+include(CheckCXXCompilerFlag)
+
+function(add_cxx_compiler_flag FLAG)
+  string(TOUPPER "HAVE_CXX_FLAG_${FLAG}" SANITIZED_FLAG)
+  string(REPLACE "+" "X" SANITIZED_FLAG ${SANITIZED_FLAG})
+  string(REGEX REPLACE "[^A-Za-z_0-9]" "_" SANITIZED_FLAG ${SANITIZED_FLAG})
+  string(REGEX REPLACE "_+" "_" SANITIZED_FLAG ${SANITIZED_FLAG})
+  set(CMAKE_REQUIRED_FLAGS "${FLAG}")
+  check_cxx_compiler_flag("" ${SANITIZED_FLAG})
+  if(${SANITIZED_FLAG})
+    set(VARIANT ${ARGV1})
+    if(ARGV1)
+      string(TOUPPER "_${VARIANT}" VARIANT)
+    endif()
+    set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
+  endif()
+endfunction()
+
Index: utils/google-benchmark/cmake/CXXFeatureCheck.cmake
===================================================================
--- /dev/null
+++ utils/google-benchmark/cmake/CXXFeatureCheck.cmake
@@ -0,0 +1,42 @@
+# - Compile and run code to check for C++ features
+#
+# This functions compiles a source file under the `cmake` folder
+# and adds the corresponding `HAVE_[FILENAME]` flag to the CMake
+# environment
+#
+#  cxx_feature_check(<FLAG> [<VARIANT>])
+#
+# - Example
+#
+# include(CXXFeatureCheck)
+# cxx_feature_check(STD_REGEX)
+# Requires CMake 2.6+
+
+if(__cxx_feature_check)
+  return()
+endif()
+set(__cxx_feature_check INCLUDED)
+
+function(cxx_feature_check FILE)
+  string(TOLOWER ${FILE} FILE)
+  string(TOUPPER ${FILE} VAR)
+  string(TOUPPER "HAVE_${VAR}" FEATURE)
+  if (DEFINED HAVE_${VAR})
+    return()
+  endif()
+  message("-- Performing Test ${FEATURE}")
+  try_run(RUN_${FEATURE} COMPILE_${FEATURE}
+          ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp)
+  if(RUN_${FEATURE} EQUAL 0)
+    message("-- Performing Test ${FEATURE} -- success")
+    set(HAVE_${VAR} 1 CACHE INTERNAL "Feature test for ${FILE}" PARENT_SCOPE)
+    add_definitions(-DHAVE_${VAR})
+  else()
+    if(NOT COMPILE_${FEATURE})
+      message("-- Performing Test ${FEATURE} -- failed to compile")
+    else()
+      message("-- Performing Test ${FEATURE} -- compiled but failed to run")
+    endif()
+  endif()
+endfunction()
+
Index: utils/google-benchmark/cmake/GetGitVersion.cmake
===================================================================
--- /dev/null
+++ utils/google-benchmark/cmake/GetGitVersion.cmake
@@ -0,0 +1,51 @@
+# - Returns a version string from Git tags
+#
+# This function inspects the annotated git tags for the project and returns a string
+# into a CMake variable
+#
+#  get_git_version(<var>)
+#
+# - Example
+#
+# include(GetGitVersion)
+# get_git_version(GIT_VERSION)
+#
+# Requires CMake 2.8.11+
+find_package(Git)
+
+if(__get_git_version)
+  return()
+endif()
+set(__get_git_version INCLUDED)
+
+function(get_git_version var)
+  if(GIT_EXECUTABLE)
+      execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
+          RESULT_VARIABLE status
+          OUTPUT_VARIABLE GIT_VERSION
+          ERROR_QUIET)
+      if(${status})
+          set(GIT_VERSION "v0.0.0")
+      else()
+          string(STRIP ${GIT_VERSION} GIT_VERSION)
+          string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION ${GIT_VERSION})
+      endif()
+
+      # Work out if the repository is dirty
+      execute_process(COMMAND ${GIT_EXECUTABLE} update-index -q --refresh
+          OUTPUT_QUIET
+          ERROR_QUIET)
+      execute_process(COMMAND ${GIT_EXECUTABLE} diff-index --name-only HEAD --
+          OUTPUT_VARIABLE GIT_DIFF_INDEX
+          ERROR_QUIET)
+      string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
+      if (${GIT_DIRTY})
+          set(GIT_VERSION "${GIT_VERSION}-dirty")
+      endif()
+  else()
+      set(GIT_VERSION "v0.0.0")
+  endif()
+
+  message("-- git Version: ${GIT_VERSION}")
+  set(${var} ${GIT_VERSION} PARENT_SCOPE)
+endfunction()
Index: utils/google-benchmark/cmake/gnu_posix_regex.cpp
===================================================================
--- /dev/null
+++ utils/google-benchmark/cmake/gnu_posix_regex.cpp
@@ -0,0 +1,12 @@
+#include <gnuregex.h>
+#include <string>
+int main() {
+  std::string str = "test0159";
+  regex_t re;
+  int ec = regcomp(&re, "^[a-z]+[0-9]+$", REG_EXTENDED | REG_NOSUB);
+  if (ec != 0) {
+    return ec;
+  }
+  return regexec(&re, str.c_str(), 0, nullptr, 0) ? -1 : 0;
+}
+
Index: utils/google-benchmark/cmake/posix_regex.cpp
===================================================================
--- /dev/null
+++ utils/google-benchmark/cmake/posix_regex.cpp
@@ -0,0 +1,14 @@
+#include <regex.h>
+#include <string>
+int main() {
+  std::string str = "test0159";
+  regex_t re;
+  int ec = regcomp(&re, "^[a-z]+[0-9]+$", REG_EXTENDED | REG_NOSUB);
+  if (ec != 0) {
+    return ec;
+  }
+  int ret = regexec(&re, str.c_str(), 0, nullptr, 0) ? -1 : 0;
+  regfree(&re);
+  return ret;
+}
+
Index: utils/google-benchmark/cmake/std_regex.cpp
===================================================================
--- /dev/null
+++ utils/google-benchmark/cmake/std_regex.cpp
@@ -0,0 +1,10 @@
+#include <regex>
+#include <string>
+int main() {
+  const std::string str = "test0159";
+  std::regex re;
+  re = std::regex("^[a-z]+[0-9]+$",
+       std::regex_constants::extended | std::regex_constants::nosubs);
+  return std::regex_search(str, re) ? 0 : -1;
+}
+
Index: utils/google-benchmark/cmake/steady_clock.cpp
===================================================================
--- /dev/null
+++ utils/google-benchmark/cmake/steady_clock.cpp
@@ -0,0 +1,7 @@
+#include <chrono>
+
+int main() {
+    typedef std::chrono::steady_clock Clock;
+    Clock::time_point tp = Clock::now();
+    ((void)tp);
+}
Index: utils/google-benchmark/cmake/thread_safety_attributes.cpp
===================================================================
--- /dev/null
+++ utils/google-benchmark/cmake/thread_safety_attributes.cpp
@@ -0,0 +1,4 @@
+#define HAVE_THREAD_SAFETY_ATTRIBUTES
+#include "../src/mutex.h"
+
+int main() {}
Index: utils/google-benchmark/include/benchmark/benchmark.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/include/benchmark/benchmark.h
@@ -0,0 +1,21 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef BENCHMARK_BENCHMARK_H_
+#define BENCHMARK_BENCHMARK_H_
+
+#include "macros.h"
+#include "benchmark_api.h"
+#include "reporter.h"
+
+#endif // BENCHMARK_BENCHMARK_H_
Index: utils/google-benchmark/include/benchmark/benchmark_api.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/include/benchmark/benchmark_api.h
@@ -0,0 +1,747 @@
+// Support for registering benchmarks for functions.
+
+/* Example usage:
+// Define a function that executes the code to be measured a
+// specified number of times:
+static void BM_StringCreation(benchmark::State& state) {
+  while (state.KeepRunning())
+    std::string empty_string;
+}
+
+// Register the function as a benchmark
+BENCHMARK(BM_StringCreation);
+
+// Define another benchmark
+static void BM_StringCopy(benchmark::State& state) {
+  std::string x = "hello";
+  while (state.KeepRunning())
+    std::string copy(x);
+}
+BENCHMARK(BM_StringCopy);
+
+// Augment the main() program to invoke benchmarks if specified
+// via the --benchmarks command line flag.  E.g.,
+//       my_unittest --benchmark_filter=all
+//       my_unittest --benchmark_filter=BM_StringCreation
+//       my_unittest --benchmark_filter=String
+//       my_unittest --benchmark_filter='Copy|Creation'
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  return 0;
+}
+
+// Sometimes a family of microbenchmarks can be implemented with
+// just one routine that takes an extra argument to specify which
+// one of the family of benchmarks to run.  For example, the following
+// code defines a family of microbenchmarks for measuring the speed
+// of memcpy() calls of different lengths:
+
+static void BM_memcpy(benchmark::State& state) {
+  char* src = new char[state.range_x()]; char* dst = new char[state.range_x()];
+  memset(src, 'x', state.range_x());
+  while (state.KeepRunning())
+    memcpy(dst, src, state.range_x());
+  state.SetBytesProcessed(int64_t(state.iterations()) *
+                          int64_t(state.range_x()));
+  delete[] src; delete[] dst;
+}
+BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
+
+// The preceding code is quite repetitive, and can be replaced with the
+// following short-hand.  The following invocation will pick a few
+// appropriate arguments in the specified range and will generate a
+// microbenchmark for each such argument.
+BENCHMARK(BM_memcpy)->Range(8, 8<<10);
+
+// You might have a microbenchmark that depends on two inputs.  For
+// example, the following code defines a family of microbenchmarks for
+// measuring the speed of set insertion.
+static void BM_SetInsert(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    state.PauseTiming();
+    set<int> data = ConstructRandomSet(state.range_x());
+    state.ResumeTiming();
+    for (int j = 0; j < state.range_y(); ++j)
+      data.insert(RandomNumber());
+  }
+}
+BENCHMARK(BM_SetInsert)
+   ->ArgPair(1<<10, 1)
+   ->ArgPair(1<<10, 8)
+   ->ArgPair(1<<10, 64)
+   ->ArgPair(1<<10, 512)
+   ->ArgPair(8<<10, 1)
+   ->ArgPair(8<<10, 8)
+   ->ArgPair(8<<10, 64)
+   ->ArgPair(8<<10, 512);
+
+// The preceding code is quite repetitive, and can be replaced with
+// the following short-hand.  The following macro will pick a few
+// appropriate arguments in the product of the two specified ranges
+// and will generate a microbenchmark for each such pair.
+BENCHMARK(BM_SetInsert)->RangePair(1<<10, 8<<10, 1, 512);
+
+// For more complex patterns of inputs, passing a custom function
+// to Apply allows programmatic specification of an
+// arbitrary set of arguments to run the microbenchmark on.
+// The following example enumerates a dense range on
+// one parameter, and a sparse range on the second.
+static void CustomArguments(benchmark::internal::Benchmark* b) {
+  for (int i = 0; i <= 10; ++i)
+    for (int j = 32; j <= 1024*1024; j *= 8)
+      b->ArgPair(i, j);
+}
+BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
+
+// Templated microbenchmarks work the same way:
+// Produce then consume 'size' messages 'iters' times
+// Measures throughput in the absence of multiprogramming.
+template <class Q> int BM_Sequential(benchmark::State& state) {
+  Q q;
+  typename Q::value_type v;
+  while (state.KeepRunning()) {
+    for (int i = state.range_x(); i--; )
+      q.push(v);
+    for (int e = state.range_x(); e--; )
+      q.Wait(&v);
+  }
+  // actually messages, not bytes:
+  state.SetBytesProcessed(
+      static_cast<int64_t>(state.iterations())*state.range_x());
+}
+BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
+
+Use `Benchmark::MinTime(double t)` to set the minimum time used to run the
+benchmark. This option overrides the `benchmark_min_time` flag.
+
+void BM_test(benchmark::State& state) {
+ ... body ...
+}
+BENCHMARK(BM_test)->MinTime(2.0); // Run for at least 2 seconds.
+
+In a multithreaded test, it is guaranteed that none of the threads will start
+until all have called KeepRunning, and all will have finished before KeepRunning
+returns false. As such, any global setup or teardown you want to do can be
+wrapped in a check against the thread index:
+
+static void BM_MultiThreaded(benchmark::State& state) {
+  if (state.thread_index == 0) {
+    // Setup code here.
+  }
+  while (state.KeepRunning()) {
+    // Run the test as normal.
+  }
+  if (state.thread_index == 0) {
+    // Teardown code here.
+  }
+}
+BENCHMARK(BM_MultiThreaded)->Threads(4);
+
+
+If a benchmark runs a few milliseconds it may be hard to visually compare the
+measured times, since the output data is given in nanoseconds per default. In
+order to manually set the time unit, you can specify it manually:
+
+BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
+*/
+
+#ifndef BENCHMARK_BENCHMARK_API_H_
+#define BENCHMARK_BENCHMARK_API_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "macros.h"
+
+namespace benchmark {
+class BenchmarkReporter;
+
+void Initialize(int* argc, char** argv);
+
+// Generate a list of benchmarks matching the specified --benchmark_filter flag
+// and if --benchmark_list_tests is specified return after printing the name
+// of each matching benchmark. Otherwise run each matching benchmark and
+// report the results.
+//
+// The second overload reports the results using the specified 'reporter'.
+//
+// RETURNS: The number of matching benchmarks.
+size_t RunSpecifiedBenchmarks();
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* reporter);
+
+
+// If this routine is called, peak memory allocation past this point in the
+// benchmark is reported at the end of the benchmark report line. (It is
+// computed by running the benchmark once with a single iteration and a memory
+// tracer.)
+// TODO(dominic)
+// void MemoryUsage();
+
+namespace internal {
+class Benchmark;
+class BenchmarkImp;
+class BenchmarkFamilies;
+
+template <class T> struct Voider {
+    typedef void type;
+};
+
+template <class T, class = void>
+struct EnableIfString {};
+
+template <class T>
+struct EnableIfString<T, typename Voider<typename T::basic_string>::type> {
+    typedef int type;
+};
+
+void UseCharPointer(char const volatile*);
+
+// Take ownership of the pointer and register the benchmark. Return the
+// registered benchmark.
+Benchmark* RegisterBenchmarkInternal(Benchmark*);
+
+} // end namespace internal
+
+
+// The DoNotOptimize(...) function can be used to prevent a value or
+// expression from being optimized away by the compiler. This function is
+// intended to add little to no overhead.
+// See: https://youtu.be/nXaxk27zwlk?t=2441
+#if defined(__GNUC__)
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+    asm volatile("" : : "g"(value) : "memory");
+}
+// Force the compiler to flush pending writes to global memory. Acts as an
+// effective read/write barrier
+inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
+    asm volatile("" : : : "memory");
+}
+#else
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+    internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
+}
+// FIXME Add ClobberMemory() for non-gnu compilers
+#endif
+
+// TimeUnit is passed to a benchmark in order to specify the order of magnitude
+// for the measured time.
+enum TimeUnit {
+  kNanosecond,
+  kMicrosecond,
+  kMillisecond
+};
+
+// BigO is passed to a benchmark in order to specify the asymptotic computational 
+// complexity for the benchmark. In case oAuto is selected, complexity will be 
+// calculated automatically to the best fit.
+enum BigO {
+  oNone,
+  o1,
+  oN,
+  oNSquared,
+  oNCubed,
+  oLogN,
+  oNLogN,
+  oAuto,
+  oLambda
+};
+
+// BigOFunc is passed to a benchmark in order to specify the asymptotic 
+// computational complexity for the benchmark.
+typedef double(BigOFunc)(int);
+
+// State is passed to a running Benchmark and contains state for the
+// benchmark to use.
+class State {
+public:
+  State(size_t max_iters, bool has_x, int x, bool has_y, int y,
+        int thread_i, int n_threads);
+
+  // Returns true if the benchmark should continue through another iteration.
+  // NOTE: A benchmark may not return from the test until KeepRunning() has
+  // returned false.
+  bool KeepRunning() {
+    if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
+      assert(!finished_);
+      started_ = true;
+      ResumeTiming();
+    }
+    bool const res = total_iterations_++ < max_iterations;
+    if (BENCHMARK_BUILTIN_EXPECT(!res, false)) {
+      assert(started_ && (!finished_ || error_occurred_));
+      if (!error_occurred_) {
+        PauseTiming();
+      }
+      // Total iterations now is one greater than max iterations. Fix this.
+      total_iterations_ = max_iterations;
+      finished_ = true;
+    }
+    return res;
+  }
+
+  // REQUIRES: timer is running and 'SkipWithError(...)' has not been called
+  //           by the current thread.
+  // Stop the benchmark timer.  If not called, the timer will be
+  // automatically stopped after KeepRunning() returns false for the first time.
+  //
+  // For threaded benchmarks the PauseTiming() function acts
+  // like a barrier.  I.e., the ith call by a particular thread to this
+  // function will block until all active threads have made their ith call.
+  // The timer will stop when the last thread has called this function.
+  //
+  // NOTE: PauseTiming()/ResumeTiming() are relatively
+  // heavyweight, and so their use should generally be avoided
+  // within each benchmark iteration, if possible.
+  void PauseTiming();
+
+  // REQUIRES: timer is not running and 'SkipWithError(...)' has not been called
+  //           by the current thread.
+  // Start the benchmark timer.  The timer is NOT running on entrance to the
+  // benchmark function. It begins running after the first call to KeepRunning()
+  //
+  // For threaded benchmarks the ResumeTiming() function acts
+  // like a barrier.  I.e., the ith call by a particular thread to this
+  // function will block until all active threads have made their ith call.
+  // The timer will start when the last thread has called this function.
+  //
+  // NOTE: PauseTiming()/ResumeTiming() are relatively
+  // heavyweight, and so their use should generally be avoided
+  // within each benchmark iteration, if possible.
+  void ResumeTiming();
+
+  // REQUIRES: 'SkipWithError(...)' has not been called previously by the
+  //            current thread.
+  // Skip any future iterations of the 'KeepRunning()' loop in the current
+  // thread and report an error with the specified 'msg'. After this call
+  // the user may explicitly 'return' from the benchmark.
+  //
+  // For threaded benchmarks only the current thread stops executing. If
+  // multiple threads report an error only the first error message is used.
+  // The current thread is no longer considered 'active' by
+  // 'PauseTiming()' and 'ResumingTiming()'.
+  //
+  // NOTE: Calling 'SkipWithError(...)' does not cause the benchmark to exit
+  // the current scope immediately. If the function is called from within
+  // the 'KeepRunning()' loop the current iteration will finish. It is the users
+  // responsibility to exit the scope as needed.
+  void SkipWithError(const char* msg);
+
+  // REQUIRES: called exactly once per iteration of the KeepRunning loop.
+  // Set the manually measured time for this benchmark iteration, which
+  // is used instead of automatically measured time if UseManualTime() was
+  // specified.
+  //
+  // For threaded benchmarks the SetIterationTime() function acts
+  // like a barrier.  I.e., the ith call by a particular thread to this
+  // function will block until all threads have made their ith call.
+  // The time will be set by the last thread to call this function.
+  void SetIterationTime(double seconds);
+
+  // Set the number of bytes processed by the current benchmark
+  // execution.  This routine is typically called once at the end of a
+  // throughput oriented benchmark.  If this routine is called with a
+  // value > 0, the report is printed in MB/sec instead of nanoseconds
+  // per iteration.
+  //
+  // REQUIRES: a benchmark has exited its KeepRunning loop.
+  BENCHMARK_ALWAYS_INLINE
+  void SetBytesProcessed(size_t bytes) {
+    bytes_processed_ = bytes;
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  size_t bytes_processed() const {
+    return bytes_processed_;
+  }
+
+  // If this routine is called with complexity_n > 0 and complexity report is requested for the 
+  // family benchmark, then current benchmark will be part of the computation and complexity_n will
+  // represent the length of N.
+  BENCHMARK_ALWAYS_INLINE
+  void SetComplexityN(int complexity_n) {
+    complexity_n_ = complexity_n;
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  size_t complexity_length_n() {
+    return complexity_n_;
+  }
+
+  // If this routine is called with items > 0, then an items/s
+  // label is printed on the benchmark report line for the currently
+  // executing benchmark. It is typically called at the end of a processing
+  // benchmark where a processing items/second output is desired.
+  //
+  // REQUIRES: a benchmark has exited its KeepRunning loop.
+  BENCHMARK_ALWAYS_INLINE
+  void SetItemsProcessed(size_t items) {
+    items_processed_ = items;
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  size_t items_processed() const {
+    return items_processed_;
+  }
+
+  // If this routine is called, the specified label is printed at the
+  // end of the benchmark report line for the currently executing
+  // benchmark.  Example:
+  //  static void BM_Compress(benchmark::State& state) {
+  //    ...
+  //    double compress = input_size / output_size;
+  //    state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression));
+  //  }
+  // Produces output that looks like:
+  //  BM_Compress   50         50   14115038  compress:27.3%
+  //
+  // REQUIRES: a benchmark has exited its KeepRunning loop.
+  void SetLabel(const char* label);
+
+  // Allow the use of std::string without actually including <string>.
+  // This function does not participate in overload resolution unless StringType
+  // has the nested typename `basic_string`. This typename should be provided
+  // as an injected class name in the case of std::string.
+  template <class StringType>
+  void SetLabel(StringType const & str,
+                typename internal::EnableIfString<StringType>::type = 1) {
+    this->SetLabel(str.c_str());
+  }
+
+  // Range arguments for this run. CHECKs if the argument has been set.
+  BENCHMARK_ALWAYS_INLINE
+  int range_x() const {
+    assert(has_range_x_);
+    ((void)has_range_x_); // Prevent unused warning.
+    return range_x_;
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  int range_y() const {
+    assert(has_range_y_);
+    ((void)has_range_y_); // Prevent unused warning.
+    return range_y_;
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  size_t iterations() const { return total_iterations_; }
+
+private:
+  bool started_;
+  bool finished_;
+  size_t total_iterations_;
+
+  bool has_range_x_;
+  int range_x_;
+
+  bool has_range_y_;
+  int range_y_;
+
+  size_t bytes_processed_;
+  size_t items_processed_;
+
+  int complexity_n_;
+
+public:
+  // FIXME: Make this private somehow.
+  bool error_occurred_;
+public:
+  // Index of the executing thread. Values from [0, threads).
+  const int thread_index;
+  // Number of threads concurrently executing the benchmark.
+  const int threads;
+  const size_t max_iterations;
+
+private:
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State);
+};
+
+namespace internal {
+
+typedef void(Function)(State&);
+
+// ------------------------------------------------------
+// Benchmark registration object.  The BENCHMARK() macro expands
+// into an internal::Benchmark* object.  Various methods can
+// be called on this object to change the properties of the benchmark.
+// Each method returns "this" so that multiple method calls can
+// chained into one expression.
+class Benchmark {
+public:
+  virtual ~Benchmark();
+
+  // Note: the following methods all return "this" so that multiple
+  // method calls can be chained together in one expression.
+
+  // Run this benchmark once with "x" as the extra argument passed
+  // to the function.
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* Arg(int x);
+
+  // Run this benchmark with the given time unit for the generated output report
+  Benchmark* Unit(TimeUnit unit);
+
+  // Run this benchmark once for a number of values picked from the
+  // range [start..limit].  (start and limit are always picked.)
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* Range(int start, int limit);
+
+  // Run this benchmark once for every value in the range [start..limit]
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* DenseRange(int start, int limit);
+
+  // Run this benchmark once with "x,y" as the extra arguments passed
+  // to the function.
+  // REQUIRES: The function passed to the constructor must accept arg1,arg2.
+  Benchmark* ArgPair(int x, int y);
+
+  // Pick a set of values A from the range [lo1..hi1] and a set
+  // of values B from the range [lo2..hi2].  Run the benchmark for
+  // every pair of values in the cartesian product of A and B
+  // (i.e., for all combinations of the values in A and B).
+  // REQUIRES: The function passed to the constructor must accept arg1,arg2.
+  Benchmark* RangePair(int lo1, int hi1, int lo2, int hi2);
+
+  // Pass this benchmark object to *func, which can customize
+  // the benchmark by calling various methods like Arg, ArgPair,
+  // Threads, etc.
+  Benchmark* Apply(void (*func)(Benchmark* benchmark));
+
+  // Set the range multiplier for non-dense range. If not called, the range multiplier 
+  // kRangeMultiplier will be used.
+  Benchmark* RangeMultiplier(int multiplier);
+
+  // Set the minimum amount of time to use when running this benchmark. This
+  // option overrides the `benchmark_min_time` flag.
+  // REQUIRES: `t > 0`
+  Benchmark* MinTime(double t);
+
+  // Specify the amount of times to repeat this benchmark. This option overrides
+  // the `benchmark_repetitions` flag.
+  // REQUIRES: `n > 0`
+  Benchmark* Repetitions(int n);
+
+  // If a particular benchmark is I/O bound, runs multiple threads internally or
+  // if for some reason CPU timings are not representative, call this method. If
+  // called, the elapsed time will be used to control how many iterations are
+  // run, and in the printing of items/second or MB/seconds values.  If not
+  // called, the cpu time used by the benchmark will be used.
+  Benchmark* UseRealTime();
+
+  // If a benchmark must measure time manually (e.g. if GPU execution time is being
+  // measured), call this method. If called, each benchmark iteration should call
+  // SetIterationTime(seconds) to report the measured time, which will be used
+  // to control how many iterations are run, and in the printing of items/second
+  // or MB/second values.
+  Benchmark* UseManualTime();
+
+  // Set the asymptotic computational complexity for the benchmark. If called
+  // the asymptotic computational complexity will be shown on the output. 
+  Benchmark* Complexity(BigO complexity = benchmark::oAuto);
+
+  // Set the asymptotic computational complexity for the benchmark. If called
+  // the asymptotic computational complexity will be shown on the output.
+  Benchmark* Complexity(BigOFunc* complexity);
+
+  // Support for running multiple copies of the same benchmark concurrently
+  // in multiple threads.  This may be useful when measuring the scaling
+  // of some piece of code.
+
+  // Run one instance of this benchmark concurrently in t threads.
+  Benchmark* Threads(int t);
+
+  // Pick a set of values T from [min_threads,max_threads].
+  // min_threads and max_threads are always included in T.  Run this
+  // benchmark once for each value in T.  The benchmark run for a
+  // particular value t consists of t threads running the benchmark
+  // function concurrently.  For example, consider:
+  //    BENCHMARK(Foo)->ThreadRange(1,16);
+  // This will run the following benchmarks:
+  //    Foo in 1 thread
+  //    Foo in 2 threads
+  //    Foo in 4 threads
+  //    Foo in 8 threads
+  //    Foo in 16 threads
+  Benchmark* ThreadRange(int min_threads, int max_threads);
+
+  // Equivalent to ThreadRange(NumCPUs(), NumCPUs())
+  Benchmark* ThreadPerCpu();
+
+  virtual void Run(State& state) = 0;
+
+  // Used inside the benchmark implementation
+  struct Instance;
+
+protected:
+  explicit Benchmark(const char* name);
+  Benchmark(Benchmark const&);
+  void SetName(const char* name);
+
+private:
+  friend class BenchmarkFamilies;
+  BenchmarkImp* imp_;
+
+  Benchmark& operator=(Benchmark const&);
+};
+
+// The class used to hold all Benchmarks created from static function.
+// (ie those created using the BENCHMARK(...) macros.
+class FunctionBenchmark : public Benchmark {
+public:
+    FunctionBenchmark(const char* name, Function* func)
+        : Benchmark(name), func_(func)
+    {}
+
+    virtual void Run(State& st);
+private:
+    Function* func_;
+};
+
+}  // end namespace internal
+
+// The base class for all fixture tests.
+class Fixture: public internal::Benchmark {
+public:
+    Fixture() : internal::Benchmark("") {}
+
+    virtual void Run(State& st) {
+      this->SetUp(st);
+      this->BenchmarkCase(st);
+      this->TearDown(st);
+    }
+
+    virtual void SetUp(const State&) {}
+    virtual void TearDown(const State&) {}
+
+protected:
+    virtual void BenchmarkCase(State&) = 0;
+};
+
+}  // end namespace benchmark
+
+
+// ------------------------------------------------------
+// Macro to register benchmarks
+
+// Check that __COUNTER__ is defined and that __COUNTER__ increases by 1
+// every time it is expanded. X + 1 == X + 0 is used in case X is defined to be
+// empty. If X is empty the expression becomes (+1 == +0).
+#if defined(__COUNTER__) && (__COUNTER__ + 1 == __COUNTER__ + 0)
+#define BENCHMARK_PRIVATE_UNIQUE_ID __COUNTER__
+#else
+#define BENCHMARK_PRIVATE_UNIQUE_ID __LINE__
+#endif
+
+// Helpers for generating unique variable names
+#define BENCHMARK_PRIVATE_NAME(n) \
+    BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
+#define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
+#define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
+
+#define BENCHMARK_PRIVATE_DECLARE(n)       \
+  static ::benchmark::internal::Benchmark* \
+  BENCHMARK_PRIVATE_NAME(n) BENCHMARK_UNUSED
+
+#define BENCHMARK(n) \
+    BENCHMARK_PRIVATE_DECLARE(n) =                               \
+        (::benchmark::internal::RegisterBenchmarkInternal(       \
+            new ::benchmark::internal::FunctionBenchmark(#n, n)))
+
+// Old-style macros
+#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
+#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->ArgPair((a1), (a2))
+#define BENCHMARK_WITH_UNIT(n, t) BENCHMARK(n)->Unit((t))
+#define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi))
+#define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
+  BENCHMARK(n)->RangePair((l1), (h1), (l2), (h2))
+
+#if __cplusplus >= 201103L
+
+// Register a benchmark which invokes the function specified by `func`
+// with the additional arguments specified by `...`.
+//
+// For example:
+//
+// template <class ...ExtraArgs>`
+// void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
+//  [...]
+//}
+// /* Registers a benchmark named "BM_takes_args/int_string_test` */
+// BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
+#define BENCHMARK_CAPTURE(func, test_case_name, ...)                       \
+    BENCHMARK_PRIVATE_DECLARE(func) =                                      \
+        (::benchmark::internal::RegisterBenchmarkInternal(                 \
+            new ::benchmark::internal::FunctionBenchmark(                  \
+                    #func "/" #test_case_name,                             \
+                    [](::benchmark::State& st) { func(st, __VA_ARGS__); })))
+
+#endif // __cplusplus >= 11
+
+// This will register a benchmark for a templatized function.  For example:
+//
+// template<int arg>
+// void BM_Foo(int iters);
+//
+// BENCHMARK_TEMPLATE(BM_Foo, 1);
+//
+// will register BM_Foo<1> as a benchmark.
+#define BENCHMARK_TEMPLATE1(n, a) \
+  BENCHMARK_PRIVATE_DECLARE(n) =  \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+        new ::benchmark::internal::FunctionBenchmark(#n "<" #a ">", n<a>)))
+
+#define BENCHMARK_TEMPLATE2(n, a, b)                     \
+  BENCHMARK_PRIVATE_DECLARE(n) =                         \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+        new ::benchmark::internal::FunctionBenchmark(    \
+            #n "<" #a "," #b ">", n<a, b>)))
+
+#if __cplusplus >= 201103L
+#define BENCHMARK_TEMPLATE(n, ...)           \
+  BENCHMARK_PRIVATE_DECLARE(n) =             \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+        new ::benchmark::internal::FunctionBenchmark( \
+        #n "<" #__VA_ARGS__ ">", n<__VA_ARGS__>)))
+#else
+#define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
+#endif
+
+
+#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)      \
+class BaseClass##_##Method##_Benchmark : public BaseClass { \
+public:                                                     \
+    BaseClass##_##Method##_Benchmark() : BaseClass() {      \
+        this->SetName(#BaseClass "/" #Method);}             \
+protected:                                                  \
+    virtual void BenchmarkCase(::benchmark::State&);        \
+};
+
+#define BENCHMARK_DEFINE_F(BaseClass, Method) \
+    BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
+    void BaseClass##_##Method##_Benchmark::BenchmarkCase
+
+#define BENCHMARK_REGISTER_F(BaseClass, Method) \
+    BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark)
+
+#define BENCHMARK_PRIVATE_REGISTER_F(TestName) \
+    BENCHMARK_PRIVATE_DECLARE(TestName) = \
+        (::benchmark::internal::RegisterBenchmarkInternal(new TestName()))
+
+// This macro will define and register a benchmark within a fixture class.
+#define BENCHMARK_F(BaseClass, Method) \
+    BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
+    BENCHMARK_REGISTER_F(BaseClass, Method); \
+    void BaseClass##_##Method##_Benchmark::BenchmarkCase
+
+
+// Helper macro to create a main routine in a test that runs the benchmarks
+#define BENCHMARK_MAIN()                   \
+  int main(int argc, char** argv) {        \
+    ::benchmark::Initialize(&argc, argv);  \
+    ::benchmark::RunSpecifiedBenchmarks(); \
+  }
+
+#endif  // BENCHMARK_BENCHMARK_API_H_
Index: utils/google-benchmark/include/benchmark/macros.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/include/benchmark/macros.h
@@ -0,0 +1,56 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef BENCHMARK_MACROS_H_
+#define BENCHMARK_MACROS_H_
+
+#if __cplusplus < 201103L
+# define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName)  \
+    TypeName(const TypeName&);                         \
+    TypeName& operator=(const TypeName&)
+#else
+# define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName)  \
+    TypeName(const TypeName&) = delete;                \
+    TypeName& operator=(const TypeName&) = delete
+#endif
+
+#if defined(__GNUC__)
+# define BENCHMARK_UNUSED __attribute__((unused))
+# define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
+# define BENCHMARK_NOEXCEPT noexcept
+# define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#elif defined(_MSC_VER) && !defined(__clang__)
+# define BENCHMARK_UNUSED
+# define BENCHMARK_ALWAYS_INLINE __forceinline
+# if _MSC_VER >= 1900
+#  define BENCHMARK_NOEXCEPT noexcept
+#  define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+# else
+#  define BENCHMARK_NOEXCEPT
+#  define BENCHMARK_NOEXCEPT_OP(x)
+# endif
+# define __func__ __FUNCTION__
+#else
+# define BENCHMARK_UNUSED
+# define BENCHMARK_ALWAYS_INLINE
+# define BENCHMARK_NOEXCEPT
+# define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+
+#if defined(__GNUC__)
+# define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
+#else
+# define BENCHMARK_BUILTIN_EXPECT(x, y) x
+#endif
+
+#endif  // BENCHMARK_MACROS_H_
Index: utils/google-benchmark/include/benchmark/reporter.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/include/benchmark/reporter.h
@@ -0,0 +1,216 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef BENCHMARK_REPORTER_H_
+#define BENCHMARK_REPORTER_H_
+
+#include <cassert>
+#include <iosfwd>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "benchmark_api.h"  // For forward declaration of BenchmarkReporter
+
+namespace benchmark {
+
+// Interface for custom benchmark result printers.
+// By default, benchmark reports are printed to stdout. However an application
+// can control the destination of the reports by calling
+// RunSpecifiedBenchmarks and passing it a custom reporter object.
+// The reporter object must implement the following interface.
+class BenchmarkReporter {
+ public:
+  struct Context {
+    int num_cpus;
+    double mhz_per_cpu;
+    bool cpu_scaling_enabled;
+
+    // The number of chars in the longest benchmark name.
+    size_t name_field_width;
+  };
+
+  struct Run {
+    Run() :
+      error_occurred(false),
+      iterations(1),
+      time_unit(kNanosecond),
+      real_accumulated_time(0),
+      cpu_accumulated_time(0),
+      bytes_per_second(0),
+      items_per_second(0),
+      max_heapbytes_used(0),
+      complexity(oNone),
+      complexity_n(0),
+      report_big_o(false),
+      report_rms(false) {}
+
+    std::string benchmark_name;
+    std::string report_label;  // Empty if not set by benchmark.
+    bool error_occurred;
+    std::string error_message;
+
+    int64_t iterations;
+    TimeUnit time_unit;
+    double real_accumulated_time;
+    double cpu_accumulated_time;
+
+    // Return a value representing the real time per iteration in the unit
+    // specified by 'time_unit'.
+    // NOTE: If 'iterations' is zero the returned value represents the
+    // accumulated time.
+    double GetAdjustedRealTime() const;
+
+    // Return a value representing the cpu time per iteration in the unit
+    // specified by 'time_unit'.
+    // NOTE: If 'iterations' is zero the returned value represents the
+    // accumulated time.
+    double GetAdjustedCPUTime() const;
+
+    // Zero if not set by benchmark.
+    double bytes_per_second;
+    double items_per_second;
+
+    // This is set to 0.0 if memory tracing is not enabled.
+    double max_heapbytes_used;
+
+    // Keep track of arguments to compute asymptotic complexity
+    BigO complexity;
+    BigOFunc* complexity_lambda;
+    int complexity_n;
+
+    // Inform print function whether the current run is a complexity report
+    bool report_big_o;
+    bool report_rms;
+  };
+
+  // Construct a BenchmarkReporter with the output stream set to 'std::cout'
+  // and the error stream set to 'std::cerr'
+  BenchmarkReporter();
+
+  // Called once for every suite of benchmarks run.
+  // The parameter "context" contains information that the
+  // reporter may wish to use when generating its report, for example the
+  // platform under which the benchmarks are running. The benchmark run is
+  // never started if this function returns false, allowing the reporter
+  // to skip runs based on the context information.
+  virtual bool ReportContext(const Context& context) = 0;
+
+  // Called once for each group of benchmark runs, gives information about
+  // cpu-time and heap memory usage during the benchmark run. If the group
+  // of runs contained more than two entries then 'report' contains additional
+  // elements representing the mean and standard deviation of those runs.
+  // Additionally if this group of runs was the last in a family of benchmarks
+  // 'reports' contains additional entries representing the asymptotic
+  // complexity and RMS of that benchmark family.
+  virtual void ReportRuns(const std::vector<Run>& report) = 0;
+
+  // Called once and only once after ever group of benchmarks is run and
+  // reported.
+  virtual void Finalize() {}
+
+  // REQUIRES: The object referenced by 'out' is valid for the lifetime
+  // of the reporter.
+  void SetOutputStream(std::ostream* out) {
+    assert(out);
+    output_stream_ = out;
+  }
+
+  // REQUIRES: The object referenced by 'err' is valid for the lifetime
+  // of the reporter.
+  void SetErrorStream(std::ostream* err) {
+    assert(err);
+    error_stream_ = err;
+  }
+
+  std::ostream& GetOutputStream() const {
+    return *output_stream_;
+  }
+
+  std::ostream& GetErrorStream() const {
+    return *error_stream_;
+  }
+
+  virtual ~BenchmarkReporter();
+
+  // Write a human readable string to 'out' representing the specified
+  // 'context'.
+  // REQUIRES: 'out' is non-null.
+  static void PrintBasicContext(std::ostream* out, Context const& context);
+
+ private:
+  std::ostream* output_stream_;
+  std::ostream* error_stream_;
+};
+
+// Simple reporter that outputs benchmark data to the console. This is the
+// default reporter used by RunSpecifiedBenchmarks().
+class ConsoleReporter : public BenchmarkReporter {
+ public:
+  virtual bool ReportContext(const Context& context);
+  virtual void ReportRuns(const std::vector<Run>& reports);
+
+ protected:
+  virtual void PrintRunData(const Run& report);
+
+  size_t name_field_width_;
+};
+
+class JSONReporter : public BenchmarkReporter {
+ public:
+  JSONReporter() : first_report_(true) {}
+  virtual bool ReportContext(const Context& context);
+  virtual void ReportRuns(const std::vector<Run>& reports);
+  virtual void Finalize();
+
+ private:
+  void PrintRunData(const Run& report);
+
+  bool first_report_;
+};
+
+class CSVReporter : public BenchmarkReporter {
+ public:
+  virtual bool ReportContext(const Context& context);
+  virtual void ReportRuns(const std::vector<Run>& reports);
+
+ private:
+  void PrintRunData(const Run& report);
+};
+
+inline const char* GetTimeUnitString(TimeUnit unit) {
+  switch (unit) {
+    case kMillisecond:
+      return "ms";
+    case kMicrosecond:
+      return "us";
+    case kNanosecond:
+    default:
+      return "ns";
+  }
+}
+
+inline double GetTimeUnitMultiplier(TimeUnit unit) {
+  switch (unit) {
+    case kMillisecond:
+      return 1e3;
+    case kMicrosecond:
+      return 1e6;
+    case kNanosecond:
+    default:
+      return 1e9;
+  }
+}
+
+}  // end namespace benchmark
+#endif  // BENCHMARK_REPORTER_H_
Index: utils/google-benchmark/mingw.py
===================================================================
--- /dev/null
+++ utils/google-benchmark/mingw.py
@@ -0,0 +1,320 @@
+#! /usr/bin/env python
+# encoding: utf-8
+
+import argparse
+import errno
+import logging
+import os
+import platform
+import re
+import sys
+import subprocess
+import tempfile
+
+try:
+    import winreg
+except ImportError:
+    import _winreg as winreg
+try:
+    import urllib.request as request
+except ImportError:
+    import urllib as request
+try:
+    import urllib.parse as parse
+except ImportError:
+    import urlparse as parse
+
+class EmptyLogger(object):
+    '''
+    Provides an implementation that performs no logging
+    '''
+    def debug(self, *k, **kw):
+        pass
+    def info(self, *k, **kw):
+        pass
+    def warn(self, *k, **kw):
+        pass
+    def error(self, *k, **kw):
+        pass
+    def critical(self, *k, **kw):
+        pass
+    def setLevel(self, *k, **kw):
+        pass
+
+urls = (
+    'http://downloads.sourceforge.net/project/mingw-w64/Toolchains%20'
+        'targetting%20Win32/Personal%20Builds/mingw-builds/installer/'
+        'repository.txt',
+    'http://downloads.sourceforge.net/project/mingwbuilds/host-windows/'
+        'repository.txt'
+)
+'''
+A list of mingw-build repositories
+'''
+
+def repository(urls = urls, log = EmptyLogger()):
+    '''
+    Downloads and parse mingw-build repository files and parses them
+    '''
+    log.info('getting mingw-builds repository')
+    versions = {}
+    re_sourceforge = re.compile(r'http://sourceforge.net/projects/([^/]+)/files')
+    re_sub = r'http://downloads.sourceforge.net/project/\1'
+    for url in urls:
+        log.debug(' - requesting: %s', url)
+        socket = request.urlopen(url)
+        repo = socket.read()
+        if not isinstance(repo, str):
+            repo = repo.decode();
+        socket.close()
+        for entry in repo.split('\n')[:-1]:
+            value = entry.split('|')
+            version = tuple([int(n) for n in value[0].strip().split('.')])
+            version = versions.setdefault(version, {})
+            arch = value[1].strip()
+            if arch == 'x32':
+                arch = 'i686'
+            elif arch == 'x64':
+                arch = 'x86_64'
+            arch = version.setdefault(arch, {})
+            threading = arch.setdefault(value[2].strip(), {})
+            exceptions = threading.setdefault(value[3].strip(), {})
+            revision = exceptions.setdefault(int(value[4].strip()[3:]),
+                re_sourceforge.sub(re_sub, value[5].strip()))
+    return versions
+
+def find_in_path(file, path=None):
+    '''
+    Attempts to find an executable in the path
+    '''
+    if platform.system() == 'Windows':
+        file += '.exe'
+    if path is None:
+        path = os.environ.get('PATH', '')
+    if type(path) is type(''):
+        path = path.split(os.pathsep)
+    return list(filter(os.path.exists,
+        map(lambda dir, file=file: os.path.join(dir, file), path)))
+
+def find_7zip(log = EmptyLogger()):
+    '''
+    Attempts to find 7zip for unpacking the mingw-build archives
+    '''
+    log.info('finding 7zip')
+    path = find_in_path('7z')
+    if not path:
+        key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\7-Zip')
+        path, _ = winreg.QueryValueEx(key, 'Path')
+        path = [os.path.join(path, '7z.exe')]
+    log.debug('found \'%s\'', path[0])
+    return path[0]
+
+find_7zip()
+
+def unpack(archive, location, log = EmptyLogger()):
+    '''
+    Unpacks a mingw-builds archive
+    '''
+    sevenzip = find_7zip(log)
+    log.info('unpacking %s', os.path.basename(archive))
+    cmd = [sevenzip, 'x', archive, '-o' + location, '-y']
+    log.debug(' - %r', cmd)
+    with open(os.devnull, 'w') as devnull:
+        subprocess.check_call(cmd, stdout = devnull)
+
+def download(url, location, log = EmptyLogger()):
+    '''
+    Downloads and unpacks a mingw-builds archive
+    '''
+    log.info('downloading MinGW')
+    log.debug(' - url: %s', url)
+    log.debug(' - location: %s', location)
+
+    re_content = re.compile(r'attachment;[ \t]*filename=(")?([^"]*)(")?[\r\n]*')
+
+    stream = request.urlopen(url)
+    try:
+        content = stream.getheader('Content-Disposition') or ''
+    except AttributeError:
+        content = stream.headers.getheader('Content-Disposition') or ''
+    matches = re_content.match(content)
+    if matches:
+        filename = matches.group(2)
+    else:
+        parsed = parse.urlparse(stream.geturl())
+        filename = os.path.basename(parsed.path)
+
+    try:
+        os.makedirs(location)
+    except OSError as e:
+        if e.errno == errno.EEXIST and os.path.isdir(location):
+            pass
+        else:
+            raise
+
+    archive = os.path.join(location, filename)
+    with open(archive, 'wb') as out:
+        while True:
+            buf = stream.read(1024)
+            if not buf:
+                break
+            out.write(buf)
+    unpack(archive, location, log = log)
+    os.remove(archive)
+
+    possible = os.path.join(location, 'mingw64')
+    if not os.path.exists(possible):
+        possible = os.path.join(location, 'mingw32')
+        if not os.path.exists(possible):
+            raise ValueError('Failed to find unpacked MinGW: ' + possible)
+    return possible
+
+def root(location = None, arch = None, version = None, threading = None,
+        exceptions = None, revision = None, log = EmptyLogger()):
+    '''
+    Returns the root folder of a specific version of the mingw-builds variant
+    of gcc. Will download the compiler if needed
+    '''
+
+    # Get the repository if we don't have all the information
+    if not (arch and version and threading and exceptions and revision):
+        versions = repository(log = log)
+
+    # Determine some defaults
+    version = version or max(versions.keys())
+    if not arch:
+        arch = platform.machine().lower()
+        if arch == 'x86':
+            arch = 'i686'
+        elif arch == 'amd64':
+            arch = 'x86_64'
+    if not threading:
+        keys = versions[version][arch].keys()
+        if 'posix' in keys:
+            threading = 'posix'
+        elif 'win32' in keys:
+            threading = 'win32'
+        else:
+            threading = keys[0]
+    if not exceptions:
+        keys = versions[version][arch][threading].keys()
+        if 'seh' in keys:
+            exceptions = 'seh'
+        elif 'sjlj' in keys:
+            exceptions = 'sjlj'
+        else:
+            exceptions = keys[0]
+    if revision == None:
+        revision = max(versions[version][arch][threading][exceptions].keys())
+    if not location:
+        location = os.path.join(tempfile.gettempdir(), 'mingw-builds')
+
+    # Get the download url
+    url = versions[version][arch][threading][exceptions][revision]
+
+    # Tell the user whatzzup
+    log.info('finding MinGW %s', '.'.join(str(v) for v in version))
+    log.debug(' - arch: %s', arch)
+    log.debug(' - threading: %s', threading)
+    log.debug(' - exceptions: %s', exceptions)
+    log.debug(' - revision: %s', revision)
+    log.debug(' - url: %s', url)
+
+    # Store each specific revision differently
+    slug = '{version}-{arch}-{threading}-{exceptions}-rev{revision}'
+    slug = slug.format(
+        version = '.'.join(str(v) for v in version),
+        arch = arch,
+        threading = threading,
+        exceptions = exceptions,
+        revision = revision
+    )
+    if arch == 'x86_64':
+        root_dir = os.path.join(location, slug, 'mingw64')
+    elif arch == 'i686':
+        root_dir = os.path.join(location, slug, 'mingw32')
+    else:
+        raise ValueError('Unknown MinGW arch: ' + arch)
+
+    # Download if needed
+    if not os.path.exists(root_dir):
+        downloaded = download(url, os.path.join(location, slug), log = log)
+        if downloaded != root_dir:
+            raise ValueError('The location of mingw did not match\n%s\n%s'
+                % (downloaded, root_dir))
+
+    return root_dir
+
+def str2ver(string):
+    '''
+    Converts a version string into a tuple
+    '''
+    try:
+        version = tuple(int(v) for v in string.split('.'))
+        if len(version) is not 3:
+            raise ValueError()
+    except ValueError:
+        raise argparse.ArgumentTypeError(
+            'please provide a three digit version string')
+    return version
+
+def main():
+    '''
+    Invoked when the script is run directly by the python interpreter
+    '''
+    parser = argparse.ArgumentParser(
+        description = 'Downloads a specific version of MinGW',
+        formatter_class = argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument('--location',
+        help = 'the location to download the compiler to',
+        default = os.path.join(tempfile.gettempdir(), 'mingw-builds'))
+    parser.add_argument('--arch', required = True, choices = ['i686', 'x86_64'],
+        help = 'the target MinGW architecture string')
+    parser.add_argument('--version', type = str2ver,
+        help = 'the version of GCC to download')
+    parser.add_argument('--threading', choices = ['posix', 'win32'],
+        help = 'the threading type of the compiler')
+    parser.add_argument('--exceptions', choices = ['sjlj', 'seh', 'dwarf'],
+        help = 'the method to throw exceptions')
+    parser.add_argument('--revision', type=int,
+        help = 'the revision of the MinGW release')
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('-v', '--verbose', action='store_true',
+        help='increase the script output verbosity')
+    group.add_argument('-q', '--quiet', action='store_true',
+        help='only print errors and warning')
+    args = parser.parse_args()
+
+    # Create the logger
+    logger = logging.getLogger('mingw')
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter('%(message)s')
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    logger.setLevel(logging.INFO)
+    if args.quiet:
+        logger.setLevel(logging.WARN)
+    if args.verbose:
+        logger.setLevel(logging.DEBUG)
+
+    # Get MinGW
+    root_dir = root(location = args.location, arch = args.arch,
+        version = args.version, threading = args.threading,
+        exceptions = args.exceptions, revision = args.revision,
+        log = logger)
+
+    sys.stdout.write('%s\n' % os.path.join(root_dir, 'bin'))
+
+if __name__ == '__main__':
+    try:
+        main()
+    except IOError as e:
+        sys.stderr.write('IO error: %s\n' % e)
+        sys.exit(1)
+    except OSError as e:
+        sys.stderr.write('OS error: %s\n' % e)
+        sys.exit(1)
+    except KeyboardInterrupt as e:
+        sys.stderr.write('Killed\n')
+        sys.exit(1)
Index: utils/google-benchmark/src/CMakeLists.txt
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/CMakeLists.txt
@@ -0,0 +1,51 @@
+# Allow the source files to find headers in src/
+include_directories(${PROJECT_SOURCE_DIR}/src)
+
+# Define the source files
+set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc"
+                 "console_reporter.cc" "csv_reporter.cc" "json_reporter.cc"
+                 "log.cc" "reporter.cc" "sleep.cc" "string_util.cc"
+                 "sysinfo.cc" "walltime.cc" "complexity.cc")
+# Determine the correct regular expression engine to use
+if(HAVE_STD_REGEX)
+  set(RE_FILES "re_std.cc")
+elseif(HAVE_GNU_POSIX_REGEX)
+  set(RE_FILES "re_posix.cc")
+elseif(HAVE_POSIX_REGEX)
+  set(RE_FILES "re_posix.cc")
+else()
+  message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
+endif()
+
+add_library(benchmark ${SOURCE_FILES} ${RE_FILES})
+
+
+set_target_properties(benchmark PROPERTIES
+  OUTPUT_NAME "benchmark"
+  VERSION ${GENERIC_LIB_VERSION}
+  SOVERSION ${GENERIC_LIB_SOVERSION}
+)
+
+# Link threads.
+target_link_libraries(benchmark ${CMAKE_THREAD_LIBS_INIT})
+
+# We need extra libraries on Windows
+if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
+  target_link_libraries(benchmark Shlwapi)
+endif()
+
+# Expose public API
+target_include_directories(benchmark PUBLIC ${PROJECT_SOURCE_DIR}/include)
+
+# Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
+install(
+  TARGETS benchmark
+  ARCHIVE DESTINATION lib
+  LIBRARY DESTINATION lib
+  RUNTIME DESTINATION bin
+  COMPONENT library)
+
+install(
+  DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
+  DESTINATION include
+  FILES_MATCHING PATTERN "*.*h")
Index: utils/google-benchmark/src/arraysize.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/arraysize.h
@@ -0,0 +1,34 @@
+#ifndef BENCHMARK_ARRAYSIZE_H_
+#define BENCHMARK_ARRAYSIZE_H_
+
+#include "internal_macros.h"
+
+namespace benchmark {
+namespace internal {
+// The arraysize(arr) macro returns the # of elements in an array arr.
+// The expression is a compile-time constant, and therefore can be
+// used in defining new arrays, for example.  If you use arraysize on
+// a pointer by mistake, you will get a compile-time error.
+//
+
+
+// This template function declaration is used in defining arraysize.
+// Note that the function doesn't need an implementation, as we only
+// use its type.
+template <typename T, size_t N>
+char (&ArraySizeHelper(T (&array)[N]))[N];
+
+// That gcc wants both of these prototypes seems mysterious. VC, for
+// its part, can't decide which to use (another mystery). Matching of
+// template overloads: the final frontier.
+#ifndef COMPILER_MSVC
+template <typename T, size_t N>
+char (&ArraySizeHelper(const T (&array)[N]))[N];
+#endif
+
+#define arraysize(array) (sizeof(::benchmark::internal::ArraySizeHelper(array)))
+
+} // end namespace internal
+} // end namespace benchmark
+
+#endif // BENCHMARK_ARRAYSIZE_H_
Index: utils/google-benchmark/src/benchmark.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/benchmark.cc
@@ -0,0 +1,1123 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/benchmark.h"
+#include "internal_macros.h"
+
+#ifndef BENCHMARK_OS_WINDOWS
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#endif
+
+#include <cstdlib>
+#include <cstring>
+#include <cstdio>
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <iostream>
+#include <memory>
+#include <thread>
+
+#include "check.h"
+#include "commandlineflags.h"
+#include "complexity.h"
+#include "log.h"
+#include "mutex.h"
+#include "re.h"
+#include "stat.h"
+#include "string_util.h"
+#include "sysinfo.h"
+#include "walltime.h"
+
+DEFINE_bool(benchmark_list_tests, false,
+            "Print a list of benchmarks. This option overrides all other "
+            "options.");
+
+DEFINE_string(benchmark_filter, ".",
+              "A regular expression that specifies the set of benchmarks "
+              "to execute.  If this flag is empty, no benchmarks are run.  "
+              "If this flag is the string \"all\", all benchmarks linked "
+              "into the process are run.");
+
+DEFINE_double(benchmark_min_time, 0.5,
+              "Minimum number of seconds we should run benchmark before "
+              "results are considered significant.  For cpu-time based "
+              "tests, this is the lower bound on the total cpu time "
+              "used by all threads that make up the test.  For real-time "
+              "based tests, this is the lower bound on the elapsed time "
+              "of the benchmark execution, regardless of number of "
+              "threads.");
+
+DEFINE_int32(benchmark_repetitions, 1,
+             "The number of runs of each benchmark. If greater than 1, the "
+             "mean and standard deviation of the runs will be reported.");
+
+DEFINE_string(benchmark_format, "console",
+              "The format to use for console output. Valid values are "
+              "'console', 'json', or 'csv'.");
+
+DEFINE_bool(color_print, true, "Enables colorized logging.");
+
+DEFINE_int32(v, 0, "The level of verbose logging to output");
+
+
+namespace benchmark {
+
+namespace internal {
+
+void UseCharPointer(char const volatile*) {}
+
+// NOTE: This is a dummy "mutex" type used to denote the actual mutex
+// returned by GetBenchmarkLock(). This is only used to placate the thread
+// safety warnings by giving the return of GetBenchmarkLock() a name.
+struct CAPABILITY("mutex") BenchmarkLockType {};
+BenchmarkLockType BenchmarkLockVar;
+
+} // end namespace internal
+
+inline Mutex& RETURN_CAPABILITY(::benchmark::internal::BenchmarkLockVar)
+GetBenchmarkLock()
+{
+  static Mutex lock;
+  return lock;
+}
+
+namespace {
+
+bool IsZero(double n) {
+    return std::abs(n) < std::numeric_limits<double>::epsilon();
+}
+
+// For non-dense Range, intermediate values are powers of kRangeMultiplier.
+static const int kRangeMultiplier = 8;
+static const size_t kMaxIterations = 1000000000;
+
+bool running_benchmark = false;
+
+// Global variable so that a benchmark can cause a little extra printing
+std::string* GetReportLabel() {
+    static std::string label GUARDED_BY(GetBenchmarkLock());
+    return &label;
+}
+
+// Global variable so that a benchmark can report an error as a human readable
+// string. If error_message is null no error occurred.
+#if defined(_MSC_VER) && _MSC_VER <= 1800
+typedef char* error_message_type;
+#else
+typedef const char* error_message_type;
+#endif
+
+static std::atomic<error_message_type> error_message = ATOMIC_VAR_INIT(nullptr);
+
+// TODO(ericwf): support MallocCounter.
+//static benchmark::MallocCounter *benchmark_mc;
+
+struct ThreadStats {
+    ThreadStats() : bytes_processed(0), items_processed(0), complexity_n(0) {}
+    int64_t bytes_processed;
+    int64_t items_processed;
+    int  complexity_n;
+};
+
+// Timer management class
+class TimerManager {
+ public:
+  TimerManager(int num_threads, Notification* done)
+      : num_threads_(num_threads),
+        running_threads_(num_threads),
+        done_(done),
+        running_(false),
+        real_time_used_(0),
+        cpu_time_used_(0),
+        manual_time_used_(0),
+        num_finalized_(0),
+        phase_number_(0),
+        entered_(0)
+  {
+  }
+
+  // Called by each thread
+  void StartTimer() EXCLUDES(lock_) {
+    bool last_thread = false;
+    {
+      MutexLock ml(lock_);
+      last_thread = Barrier(ml);
+      if (last_thread) {
+        CHECK(!running_) << "Called StartTimer when timer is already running";
+        running_ = true;
+        start_real_time_ = walltime::Now();
+        start_cpu_time_ = MyCPUUsage() + ChildrenCPUUsage();
+       }
+     }
+     if (last_thread) {
+       phase_condition_.notify_all();
+     }
+  }
+
+  // Called by each thread
+  void StopTimer() EXCLUDES(lock_) {
+    bool last_thread = false;
+    {
+      MutexLock ml(lock_);
+      last_thread = Barrier(ml);
+      if (last_thread) {
+        CHECK(running_) << "Called StopTimer when timer is already stopped";
+        InternalStop();
+      }
+    }
+    if (last_thread) {
+      phase_condition_.notify_all();
+    }
+  }
+
+  // Called by each thread
+  void SetIterationTime(double seconds) EXCLUDES(lock_) {
+    bool last_thread = false;
+    {
+      MutexLock ml(lock_);
+      last_thread = Barrier(ml);
+      if (last_thread) {
+        manual_time_used_ += seconds;
+      }
+    }
+    if (last_thread) {
+      phase_condition_.notify_all();
+    }
+  }
+
+  // Called by each thread
+  void Finalize() EXCLUDES(lock_) {
+    MutexLock l(lock_);
+    num_finalized_++;
+    if (num_finalized_ == num_threads_) {
+      CHECK(!running_) <<
+        "The timer should be stopped before the timer is finalized";
+      done_->Notify();
+    }
+  }
+
+  void RemoveErroredThread() EXCLUDES(lock_) {
+    MutexLock ml(lock_);
+    int last_thread = --running_threads_ == 0;
+    if (last_thread && running_)
+      InternalStop();
+    else if (!last_thread)
+      phase_condition_.notify_all();
+  }
+
+  // REQUIRES: timer is not running
+  double real_time_used() EXCLUDES(lock_) {
+    MutexLock l(lock_);
+    CHECK(!running_);
+    return real_time_used_;
+  }
+
+  // REQUIRES: timer is not running
+  double cpu_time_used() EXCLUDES(lock_) {
+    MutexLock l(lock_);
+    CHECK(!running_);
+    return cpu_time_used_;
+  }
+
+  // REQUIRES: timer is not running
+  double manual_time_used() EXCLUDES(lock_) {
+    MutexLock l(lock_);
+    CHECK(!running_);
+    return manual_time_used_;
+  }
+
+ private:
+  Mutex lock_;
+  Condition phase_condition_;
+  int num_threads_;
+  int running_threads_;
+  Notification* done_;
+
+  bool running_;                // Is the timer running
+  double start_real_time_;      // If running_
+  double start_cpu_time_;       // If running_
+
+  // Accumulated time so far (does not contain current slice if running_)
+  double real_time_used_;
+  double cpu_time_used_;
+  // Manually set iteration time. User sets this with SetIterationTime(seconds).
+  double manual_time_used_;
+
+  // How many threads have called Finalize()
+  int num_finalized_;
+
+  // State for barrier management
+  int phase_number_;
+  int entered_;         // Number of threads that have entered this barrier
+
+  void InternalStop() REQUIRES(lock_) {
+    CHECK(running_);
+    running_ = false;
+    real_time_used_ += walltime::Now() - start_real_time_;
+    cpu_time_used_ += ((MyCPUUsage() + ChildrenCPUUsage())
+                       - start_cpu_time_);
+  }
+
+  // Enter the barrier and wait until all other threads have also
+  // entered the barrier.  Returns iff this is the last thread to
+  // enter the barrier.
+  bool Barrier(MutexLock& ml) REQUIRES(lock_) {
+    CHECK_LT(entered_, running_threads_);
+    entered_++;
+    if (entered_ < running_threads_) {
+      // Wait for all threads to enter
+      int phase_number_cp = phase_number_;
+      auto cb = [this, phase_number_cp]() {
+        return this->phase_number_ > phase_number_cp ||
+               entered_ == running_threads_; // A thread has aborted in error
+      };
+      phase_condition_.wait(ml.native_handle(), cb);
+      if (phase_number_ > phase_number_cp)
+        return false;
+      // else (running_threads_ == entered_) and we are the last thread.
+    }
+    // Last thread has reached the barrier
+    phase_number_++;
+    entered_ = 0;
+    return true;
+  }
+};
+
+// TimerManager for current run.
+static std::unique_ptr<TimerManager> timer_manager = nullptr;
+
+} // end namespace
+
+namespace internal {
+
+// Information kept per benchmark we may want to run
+struct Benchmark::Instance {
+  std::string    name;
+  Benchmark*     benchmark;
+  bool           has_arg1;
+  int            arg1;
+  bool           has_arg2;
+  int            arg2;
+  TimeUnit       time_unit;
+  int            range_multiplier;
+  bool           use_real_time;
+  bool           use_manual_time;
+  BigO           complexity;
+  BigOFunc*      complexity_lambda;
+  bool           last_benchmark_instance;
+  int            repetitions;
+  double         min_time;
+  int            threads;    // Number of concurrent threads to use
+  bool           multithreaded;  // Is benchmark multi-threaded?
+};
+
+// Class for managing registered benchmarks.  Note that each registered
+// benchmark identifies a family of related benchmarks to run.
+class BenchmarkFamilies {
+ public:
+  static BenchmarkFamilies* GetInstance();
+
+  // Registers a benchmark family and returns the index assigned to it.
+  size_t AddBenchmark(std::unique_ptr<Benchmark> family);
+
+  // Extract the list of benchmark instances that match the specified
+  // regular expression.
+  bool FindBenchmarks(const std::string& re,
+                      std::vector<Benchmark::Instance>* benchmarks);
+ private:
+  BenchmarkFamilies() {}
+
+  std::vector<std::unique_ptr<Benchmark>> families_;
+  Mutex mutex_;
+};
+
+
+class BenchmarkImp {
+public:
+  explicit BenchmarkImp(const char* name);
+  ~BenchmarkImp();
+
+  void Arg(int x);
+  void Unit(TimeUnit unit);
+  void Range(int start, int limit);
+  void DenseRange(int start, int limit);
+  void ArgPair(int start, int limit);
+  void RangePair(int lo1, int hi1, int lo2, int hi2);
+  void RangeMultiplier(int multiplier);
+  void MinTime(double n);
+  void Repetitions(int n);
+  void UseRealTime();
+  void UseManualTime();
+  void Complexity(BigO complexity);
+  void ComplexityLambda(BigOFunc* complexity);
+  void Threads(int t);
+  void ThreadRange(int min_threads, int max_threads);
+  void ThreadPerCpu();
+  void SetName(const char* name);
+
+  static void AddRange(std::vector<int>* dst, int lo, int hi, int mult);
+
+private:
+  friend class BenchmarkFamilies;
+
+  std::string name_;
+  int arg_count_;
+  std::vector< std::pair<int, int> > args_;  // Args for all benchmark runs
+  TimeUnit time_unit_;
+  int range_multiplier_;
+  double min_time_;
+  int repetitions_;
+  bool use_real_time_;
+  bool use_manual_time_;
+  BigO complexity_;
+  BigOFunc* complexity_lambda_;
+  std::vector<int> thread_counts_;
+
+  BenchmarkImp& operator=(BenchmarkImp const&);
+};
+
+BenchmarkFamilies* BenchmarkFamilies::GetInstance() {
+  static BenchmarkFamilies instance;
+  return &instance;
+}
+
+
+size_t BenchmarkFamilies::AddBenchmark(std::unique_ptr<Benchmark> family) {
+  MutexLock l(mutex_);
+  size_t index = families_.size();
+  families_.push_back(std::move(family));
+  return index;
+}
+
+bool BenchmarkFamilies::FindBenchmarks(
+    const std::string& spec,
+    std::vector<Benchmark::Instance>* benchmarks) {
+  // Make regular expression out of command-line flag
+  std::string error_msg;
+  Regex re;
+  if (!re.Init(spec, &error_msg)) {
+    std::cerr << "Could not compile benchmark re: " << error_msg << std::endl;
+    return false;
+  }
+
+  // Special list of thread counts to use when none are specified
+  std::vector<int> one_thread;
+  one_thread.push_back(1);
+
+  MutexLock l(mutex_);
+  for (std::unique_ptr<Benchmark>& bench_family : families_) {
+    // Family was deleted or benchmark doesn't match
+    if (!bench_family) continue;
+    BenchmarkImp* family = bench_family->imp_;
+
+    if (family->arg_count_ == -1) {
+      family->arg_count_ = 0;
+      family->args_.emplace_back(-1, -1);
+    }
+    for (auto const& args : family->args_) {
+      const std::vector<int>* thread_counts =
+        (family->thread_counts_.empty()
+         ? &one_thread
+         : &family->thread_counts_);
+      for (int num_threads : *thread_counts) {
+
+        Benchmark::Instance instance;
+        instance.name = family->name_;
+        instance.benchmark = bench_family.get();
+        instance.has_arg1 = family->arg_count_ >= 1;
+        instance.arg1 = args.first;
+        instance.has_arg2 = family->arg_count_ == 2;
+        instance.arg2 = args.second;
+        instance.time_unit = family->time_unit_;
+        instance.range_multiplier = family->range_multiplier_;
+        instance.min_time = family->min_time_;
+        instance.repetitions = family->repetitions_;
+        instance.use_real_time = family->use_real_time_;
+        instance.use_manual_time = family->use_manual_time_;
+        instance.complexity = family->complexity_;
+        instance.complexity_lambda = family->complexity_lambda_;
+        instance.threads = num_threads;
+        instance.multithreaded = !(family->thread_counts_.empty());
+
+        // Add arguments to instance name
+        if (family->arg_count_ >= 1) {
+          AppendHumanReadable(instance.arg1, &instance.name);
+        }
+        if (family->arg_count_ >= 2) {
+          AppendHumanReadable(instance.arg2, &instance.name);
+        }
+        if (!IsZero(family->min_time_)) {
+          instance.name +=  StringPrintF("/min_time:%0.3f",  family->min_time_);
+        }
+        if (family->repetitions_ != 0) {
+          instance.name +=  StringPrintF("/repeats:%d",  family->repetitions_);
+        }
+        if (family->use_manual_time_) {
+          instance.name +=  "/manual_time";
+        } else if (family->use_real_time_) {
+          instance.name +=  "/real_time";
+        }
+
+        // Add the number of threads used to the name
+        if (!family->thread_counts_.empty()) {
+          instance.name += StringPrintF("/threads:%d", instance.threads);
+        }
+
+        if (re.Match(instance.name)) {
+          instance.last_benchmark_instance = (args == family->args_.back());
+          benchmarks->push_back(instance);
+        }
+      }
+    }
+  }
+  return true;
+}
+
+BenchmarkImp::BenchmarkImp(const char* name)
+    : name_(name), arg_count_(-1), time_unit_(kNanosecond),
+      range_multiplier_(kRangeMultiplier), min_time_(0.0), repetitions_(0),
+      use_real_time_(false), use_manual_time_(false),
+      complexity_(oNone) {
+}
+
+BenchmarkImp::~BenchmarkImp() {
+}
+
+void BenchmarkImp::Arg(int x) {
+  CHECK(arg_count_ == -1 || arg_count_ == 1);
+  arg_count_ = 1;
+  args_.emplace_back(x, -1);
+}
+
+void BenchmarkImp::Unit(TimeUnit unit) {
+  time_unit_ = unit;
+}
+
+void BenchmarkImp::Range(int start, int limit) {
+  CHECK(arg_count_ == -1 || arg_count_ == 1);
+  arg_count_ = 1;
+  std::vector<int> arglist;
+  AddRange(&arglist, start, limit, range_multiplier_);
+
+  for (int i : arglist) {
+    args_.emplace_back(i, -1);
+  }
+}
+
+void BenchmarkImp::DenseRange(int start, int limit) {
+  CHECK(arg_count_ == -1 || arg_count_ == 1);
+  arg_count_ = 1;
+  CHECK_GE(start, 0);
+  CHECK_LE(start, limit);
+  for (int arg = start; arg <= limit; arg++) {
+    args_.emplace_back(arg, -1);
+  }
+}
+
+void BenchmarkImp::ArgPair(int x, int y) {
+  CHECK(arg_count_ == -1 || arg_count_ == 2);
+  arg_count_ = 2;
+  args_.emplace_back(x, y);
+}
+
+void BenchmarkImp::RangePair(int lo1, int hi1, int lo2, int hi2) {
+  CHECK(arg_count_ == -1 || arg_count_ == 2);
+  arg_count_ = 2;
+  std::vector<int> arglist1, arglist2;
+  AddRange(&arglist1, lo1, hi1, range_multiplier_);
+  AddRange(&arglist2, lo2, hi2, range_multiplier_);
+
+  for (int i : arglist1) {
+    for (int j : arglist2) {
+      args_.emplace_back(i, j);
+    }
+  }
+}
+
+void BenchmarkImp::RangeMultiplier(int multiplier) {
+  CHECK(multiplier > 1);
+  range_multiplier_ = multiplier;
+}
+
+void BenchmarkImp::MinTime(double t) {
+  CHECK(t > 0.0);
+  min_time_ = t;
+}
+
+
+void BenchmarkImp::Repetitions(int n) {
+  CHECK(n > 0);
+  repetitions_ = n;
+}
+
+void BenchmarkImp::UseRealTime() {
+  CHECK(!use_manual_time_) << "Cannot set UseRealTime and UseManualTime simultaneously.";
+  use_real_time_ = true;
+}
+
+void BenchmarkImp::UseManualTime() {
+  CHECK(!use_real_time_) << "Cannot set UseRealTime and UseManualTime simultaneously.";
+  use_manual_time_ = true;
+}
+
+void BenchmarkImp::Complexity(BigO complexity){
+  complexity_ = complexity;
+}
+
+void BenchmarkImp::ComplexityLambda(BigOFunc* complexity) {
+  complexity_lambda_ = complexity;
+}
+
+void BenchmarkImp::Threads(int t) {
+  CHECK_GT(t, 0);
+  thread_counts_.push_back(t);
+}
+
+void BenchmarkImp::ThreadRange(int min_threads, int max_threads) {
+  CHECK_GT(min_threads, 0);
+  CHECK_GE(max_threads, min_threads);
+
+  AddRange(&thread_counts_, min_threads, max_threads, 2);
+}
+
+void BenchmarkImp::ThreadPerCpu() {
+  static int num_cpus = NumCPUs();
+  thread_counts_.push_back(num_cpus);
+}
+
+void BenchmarkImp::SetName(const char* name) {
+  name_ = name;
+}
+
+void BenchmarkImp::AddRange(std::vector<int>* dst, int lo, int hi, int mult) {
+  CHECK_GE(lo, 0);
+  CHECK_GE(hi, lo);
+  CHECK_GE(mult, 2);
+
+  // Add "lo"
+  dst->push_back(lo);
+
+  static const int kint32max = std::numeric_limits<int32_t>::max();
+
+  // Now space out the benchmarks in multiples of "mult"
+  for (int32_t i = 1; i < kint32max/mult; i *= mult) {
+    if (i >= hi) break;
+    if (i > lo) {
+      dst->push_back(i);
+    }
+  }
+  // Add "hi" (if different from "lo")
+  if (hi != lo) {
+    dst->push_back(hi);
+  }
+}
+
+Benchmark::Benchmark(const char* name)
+    : imp_(new BenchmarkImp(name))
+{
+}
+
+Benchmark::~Benchmark()  {
+  delete imp_;
+}
+
+Benchmark::Benchmark(Benchmark const& other)
+  : imp_(new BenchmarkImp(*other.imp_))
+{
+}
+
+Benchmark* Benchmark::Arg(int x) {
+  imp_->Arg(x);
+  return this;
+}
+
+Benchmark* Benchmark::Unit(TimeUnit unit) {
+  imp_->Unit(unit);
+  return this;
+}
+
+Benchmark* Benchmark::Range(int start, int limit) {
+  imp_->Range(start, limit);
+  return this;
+}
+
+Benchmark* Benchmark::DenseRange(int start, int limit) {
+  imp_->DenseRange(start, limit);
+  return this;
+}
+
+Benchmark* Benchmark::ArgPair(int x, int y) {
+  imp_->ArgPair(x, y);
+  return this;
+}
+
+Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) {
+  imp_->RangePair(lo1, hi1, lo2, hi2);
+  return this;
+}
+
+Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
+  custom_arguments(this);
+  return this;
+}
+
+Benchmark* Benchmark::RangeMultiplier(int multiplier) {
+  imp_->RangeMultiplier(multiplier);
+  return this;
+}
+
+
+Benchmark* Benchmark::Repetitions(int t) {
+  imp_->Repetitions(t);
+  return this;
+}
+
+Benchmark* Benchmark::MinTime(double t) {
+  imp_->MinTime(t);
+  return this;
+}
+
+Benchmark* Benchmark::UseRealTime() {
+  imp_->UseRealTime();
+  return this;
+}
+
+Benchmark* Benchmark::UseManualTime() {
+  imp_->UseManualTime();
+  return this;
+}
+
+Benchmark* Benchmark::Complexity(BigO complexity) {
+  imp_->Complexity(complexity);
+  return this;
+}
+
+Benchmark* Benchmark::Complexity(BigOFunc* complexity) {
+  imp_->Complexity(oLambda);
+  imp_->ComplexityLambda(complexity);
+  return this;
+}
+
+Benchmark* Benchmark::Threads(int t) {
+  imp_->Threads(t);
+  return this;
+}
+
+Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
+  imp_->ThreadRange(min_threads, max_threads);
+  return this;
+}
+
+Benchmark* Benchmark::ThreadPerCpu() {
+  imp_->ThreadPerCpu();
+  return this;
+}
+
+void Benchmark::SetName(const char* name) {
+  imp_->SetName(name);
+}
+
+void FunctionBenchmark::Run(State& st) {
+  func_(st);
+}
+
+} // end namespace internal
+
+namespace {
+
+// Execute one thread of benchmark b for the specified number of iterations.
+// Adds the stats collected for the thread into *total.
+void RunInThread(const benchmark::internal::Benchmark::Instance* b,
+                 size_t iters, int thread_id,
+                 ThreadStats* total) EXCLUDES(GetBenchmarkLock()) {
+  State st(iters, b->has_arg1, b->arg1, b->has_arg2, b->arg2, thread_id, b->threads);
+  b->benchmark->Run(st);
+  CHECK(st.iterations() == st.max_iterations) <<
+    "Benchmark returned before State::KeepRunning() returned false!";
+  {
+    MutexLock l(GetBenchmarkLock());
+    total->bytes_processed += st.bytes_processed();
+    total->items_processed += st.items_processed();
+    total->complexity_n += st.complexity_length_n();
+  }
+
+  timer_manager->Finalize();
+}
+
+void RunBenchmark(const benchmark::internal::Benchmark::Instance& b,
+                  BenchmarkReporter* br,
+                  std::vector<BenchmarkReporter::Run>& complexity_reports)
+  EXCLUDES(GetBenchmarkLock()) {
+  size_t iters = 1;
+
+  std::vector<BenchmarkReporter::Run> reports;
+
+  std::vector<std::thread> pool;
+  if (b.multithreaded)
+    pool.resize(b.threads);
+
+  const int repeats = b.repetitions != 0 ? b.repetitions
+                                         : FLAGS_benchmark_repetitions;
+  for (int i = 0; i < repeats; i++) {
+    std::string mem;
+    for (;;) {
+      // Try benchmark
+      VLOG(2) << "Running " << b.name << " for " << iters << "\n";
+
+      {
+        MutexLock l(GetBenchmarkLock());
+        GetReportLabel()->clear();
+      }
+      error_message = nullptr;
+
+      Notification done;
+      timer_manager = std::unique_ptr<TimerManager>(new TimerManager(b.threads, &done));
+
+      ThreadStats total;
+      running_benchmark = true;
+      if (b.multithreaded) {
+        // If this is out first iteration of the while(true) loop then the
+        // threads haven't been started and can't be joined. Otherwise we need
+        // to join the thread before replacing them.
+        for (std::thread& thread : pool) {
+          if (thread.joinable())
+            thread.join();
+        }
+        for (std::size_t ti = 0; ti < pool.size(); ++ti) {
+            pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti), &total);
+        }
+      } else {
+        // Run directly in this thread
+        RunInThread(&b, iters, 0, &total);
+      }
+      done.WaitForNotification();
+      running_benchmark = false;
+
+      const double cpu_accumulated_time = timer_manager->cpu_time_used();
+      const double real_accumulated_time = timer_manager->real_time_used();
+      const double manual_accumulated_time = timer_manager->manual_time_used();
+      timer_manager.reset();
+
+      VLOG(2) << "Ran in " << cpu_accumulated_time << "/"
+              << real_accumulated_time << "\n";
+
+      // Base decisions off of real time if requested by this benchmark.
+      double seconds = cpu_accumulated_time;
+      if (b.use_manual_time) {
+          seconds = manual_accumulated_time;
+      } else if (b.use_real_time) {
+          seconds = real_accumulated_time;
+      }
+
+      std::string label;
+      {
+        MutexLock l(GetBenchmarkLock());
+        label = *GetReportLabel();
+      }
+      error_message_type error_msg = error_message;
+
+      const double min_time = !IsZero(b.min_time) ? b.min_time
+                                                  : FLAGS_benchmark_min_time;
+
+      // If this was the first run, was elapsed time or cpu time large enough?
+      // If this is not the first run, go with the current value of iter.
+      if ((i > 0) || (error_msg != nullptr) ||
+          (iters >= kMaxIterations) ||
+          (seconds >= min_time) ||
+          (real_accumulated_time >= 5*min_time)) {
+
+        // Create report about this benchmark run.
+        BenchmarkReporter::Run report;
+        report.benchmark_name = b.name;
+        report.error_occurred = error_msg != nullptr;
+        report.error_message = error_msg != nullptr ? error_msg : "";
+        report.report_label = label;
+        // Report the total iterations across all threads.
+        report.iterations = static_cast<int64_t>(iters) * b.threads;
+        report.time_unit = b.time_unit;
+
+        if (!report.error_occurred) {
+          double bytes_per_second = 0;
+          if (total.bytes_processed > 0 && seconds > 0.0) {
+            bytes_per_second = (total.bytes_processed / seconds);
+          }
+          double items_per_second = 0;
+          if (total.items_processed > 0 && seconds > 0.0) {
+            items_per_second = (total.items_processed / seconds);
+          }
+
+          if (b.use_manual_time) {
+            report.real_accumulated_time = manual_accumulated_time;
+          } else {
+            report.real_accumulated_time = real_accumulated_time;
+          }
+          report.cpu_accumulated_time = cpu_accumulated_time;
+          report.bytes_per_second = bytes_per_second;
+          report.items_per_second = items_per_second;
+          report.complexity_n = total.complexity_n;
+          report.complexity = b.complexity;
+          report.complexity_lambda = b.complexity_lambda;
+          if(report.complexity != oNone)
+            complexity_reports.push_back(report);
+        }
+
+        reports.push_back(report);
+        break;
+      }
+
+      // See how much iterations should be increased by
+      // Note: Avoid division by zero with max(seconds, 1ns).
+      double multiplier = min_time * 1.4 / std::max(seconds, 1e-9);
+      // If our last run was at least 10% of FLAGS_benchmark_min_time then we
+      // use the multiplier directly. Otherwise we use at most 10 times
+      // expansion.
+      // NOTE: When the last run was at least 10% of the min time the max
+      // expansion should be 14x.
+      bool is_significant = (seconds / min_time) > 0.1;
+      multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
+      if (multiplier <= 1.0) multiplier = 2.0;
+      double next_iters = std::max(multiplier * iters, iters + 1.0);
+      if (next_iters > kMaxIterations) {
+        next_iters = kMaxIterations;
+      }
+      VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
+      iters = static_cast<int>(next_iters + 0.5);
+    }
+  }
+  std::vector<BenchmarkReporter::Run> additional_run_stats = ComputeStats(reports);
+  reports.insert(reports.end(), additional_run_stats.begin(),
+                 additional_run_stats.end());
+
+  if((b.complexity != oNone) && b.last_benchmark_instance) {
+    additional_run_stats = ComputeBigO(complexity_reports);
+    reports.insert(reports.end(), additional_run_stats.begin(),
+                   additional_run_stats.end());
+    complexity_reports.clear();
+  }
+
+  br->ReportRuns(reports);
+
+  if (b.multithreaded) {
+    for (std::thread& thread : pool)
+      thread.join();
+  }
+}
+
+}  // namespace
+
+State::State(size_t max_iters, bool has_x, int x, bool has_y, int y,
+             int thread_i, int n_threads)
+    : started_(false), finished_(false), total_iterations_(0),
+      has_range_x_(has_x), range_x_(x),
+      has_range_y_(has_y), range_y_(y),
+      bytes_processed_(0), items_processed_(0),
+      complexity_n_(0),
+      error_occurred_(false),
+      thread_index(thread_i),
+      threads(n_threads),
+      max_iterations(max_iters)
+{
+    CHECK(max_iterations != 0) << "At least one iteration must be run";
+    CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
+}
+
+void State::PauseTiming() {
+  // Add in time accumulated so far
+  CHECK(running_benchmark);
+  CHECK(started_ && !finished_ && !error_occurred_);
+  timer_manager->StopTimer();
+}
+
+void State::ResumeTiming() {
+  CHECK(running_benchmark);
+  CHECK(started_ && !finished_ && !error_occurred_);
+  timer_manager->StartTimer();
+}
+
+void State::SkipWithError(const char* msg) {
+  CHECK(msg);
+  error_occurred_ = true;
+  error_message_type expected_no_error_msg = nullptr;
+  error_message.compare_exchange_weak(expected_no_error_msg,
+    const_cast<error_message_type>(msg));
+  started_ = finished_ = true;
+  total_iterations_ = max_iterations;
+  timer_manager->RemoveErroredThread();
+}
+
+void State::SetIterationTime(double seconds)
+{
+  CHECK(running_benchmark);
+  timer_manager->SetIterationTime(seconds);
+}
+
+void State::SetLabel(const char* label) {
+  CHECK(running_benchmark);
+  MutexLock l(GetBenchmarkLock());
+  *GetReportLabel() = label;
+}
+
+namespace internal {
+namespace {
+
+void RunMatchingBenchmarks(const std::vector<Benchmark::Instance>& benchmarks,
+                           BenchmarkReporter* reporter) {
+  CHECK(reporter != nullptr);
+
+  // Determine the width of the name field using a minimum width of 10.
+  bool has_repetitions = FLAGS_benchmark_repetitions > 1;
+  size_t name_field_width = 10;
+  for (const Benchmark::Instance& benchmark : benchmarks) {
+    name_field_width =
+        std::max<size_t>(name_field_width, benchmark.name.size());
+    has_repetitions |= benchmark.repetitions > 1;
+  }
+  if (has_repetitions)
+    name_field_width += std::strlen("_stddev");
+
+  // Print header here
+  BenchmarkReporter::Context context;
+  context.num_cpus = NumCPUs();
+  context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f;
+
+  context.cpu_scaling_enabled = CpuScalingEnabled();
+  context.name_field_width = name_field_width;
+
+  // Keep track of runing times of all instances of current benchmark
+  std::vector<BenchmarkReporter::Run> complexity_reports;
+
+  if (reporter->ReportContext(context)) {
+    for (const auto& benchmark : benchmarks) {
+      RunBenchmark(benchmark, reporter, complexity_reports);
+    }
+  }
+}
+
+std::unique_ptr<BenchmarkReporter> GetDefaultReporter() {
+  typedef std::unique_ptr<BenchmarkReporter> PtrType;
+  if (FLAGS_benchmark_format == "console") {
+    return PtrType(new ConsoleReporter);
+  } else if (FLAGS_benchmark_format == "json") {
+    return PtrType(new JSONReporter);
+  } else if (FLAGS_benchmark_format == "csv") {
+    return PtrType(new CSVReporter);
+  } else {
+    std::cerr << "Unexpected format: '" << FLAGS_benchmark_format << "'\n";
+    std::exit(1);
+  }
+}
+
+} // end namespace
+} // end namespace internal
+
+size_t RunSpecifiedBenchmarks() {
+  return RunSpecifiedBenchmarks(nullptr);
+}
+
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* reporter) {
+  std::string spec = FLAGS_benchmark_filter;
+  if (spec.empty() || spec == "all")
+    spec = ".";  // Regexp that matches all benchmarks
+
+  std::vector<internal::Benchmark::Instance> benchmarks;
+  auto families = internal::BenchmarkFamilies::GetInstance();
+  if (!families->FindBenchmarks(spec, &benchmarks)) return 0;
+
+  if (FLAGS_benchmark_list_tests) {
+    for (auto const& benchmark : benchmarks)
+      std::cout <<  benchmark.name << "\n";
+  } else {
+    std::unique_ptr<BenchmarkReporter> default_reporter;
+    if (!reporter) {
+      default_reporter = internal::GetDefaultReporter();
+      reporter = default_reporter.get();
+    }
+    internal::RunMatchingBenchmarks(benchmarks, reporter);
+    reporter->Finalize();
+  }
+  return benchmarks.size();
+}
+
+namespace internal {
+
+void PrintUsageAndExit() {
+  fprintf(stdout,
+          "benchmark"
+          " [--benchmark_list_tests={true|false}]\n"
+          "          [--benchmark_filter=<regex>]\n"
+          "          [--benchmark_min_time=<min_time>]\n"
+          "          [--benchmark_repetitions=<num_repetitions>]\n"
+          "          [--benchmark_format=<console|json|csv>]\n"
+          "          [--color_print={true|false}]\n"
+          "          [--v=<verbosity>]\n");
+  exit(0);
+}
+
+void ParseCommandLineFlags(int* argc, char** argv) {
+  using namespace benchmark;
+  for (int i = 1; i < *argc; ++i) {
+    if (
+        ParseBoolFlag(argv[i], "benchmark_list_tests",
+                      &FLAGS_benchmark_list_tests) ||
+        ParseStringFlag(argv[i], "benchmark_filter",
+                        &FLAGS_benchmark_filter) ||
+        ParseDoubleFlag(argv[i], "benchmark_min_time",
+                        &FLAGS_benchmark_min_time) ||
+        ParseInt32Flag(argv[i], "benchmark_repetitions",
+                       &FLAGS_benchmark_repetitions) ||
+        ParseStringFlag(argv[i], "benchmark_format",
+                        &FLAGS_benchmark_format) ||
+        ParseBoolFlag(argv[i], "color_print",
+                       &FLAGS_color_print) ||
+        ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
+      for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1];
+
+      --(*argc);
+      --i;
+    } else if (IsFlag(argv[i], "help")) {
+      PrintUsageAndExit();
+    }
+  }
+
+  if (FLAGS_benchmark_format != "console" &&
+      FLAGS_benchmark_format != "json" &&
+      FLAGS_benchmark_format != "csv") {
+    PrintUsageAndExit();
+  }
+}
+
+Benchmark* RegisterBenchmarkInternal(Benchmark* bench) {
+    std::unique_ptr<Benchmark> bench_ptr(bench);
+    BenchmarkFamilies* families = BenchmarkFamilies::GetInstance();
+    families->AddBenchmark(std::move(bench_ptr));
+    return bench;
+}
+
+} // end namespace internal
+
+void Initialize(int* argc, char** argv) {
+  internal::ParseCommandLineFlags(argc, argv);
+  internal::SetLogLevel(FLAGS_v);
+  // TODO remove this. It prints some output the first time it is called.
+  // We don't want to have this ouput printed during benchmarking.
+  MyCPUUsage();
+  // The first call to walltime::Now initialized it. Call it once to
+  // prevent the initialization from happening in a benchmark.
+  walltime::Now();
+}
+
+} // end namespace benchmark
Index: utils/google-benchmark/src/check.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/check.h
@@ -0,0 +1,72 @@
+#ifndef CHECK_H_
+#define CHECK_H_
+
+#include <cstdlib>
+#include <ostream>
+
+#include "internal_macros.h"
+#include "log.h"
+
+namespace benchmark {
+namespace internal {
+
+typedef void(AbortHandlerT)();
+
+inline AbortHandlerT*& GetAbortHandler() {
+    static AbortHandlerT* handler = &std::abort;
+    return handler;
+}
+
+BENCHMARK_NORETURN inline void CallAbortHandler() {
+    GetAbortHandler()();
+    std::abort(); // fallback to enforce noreturn
+}
+
+// CheckHandler is the class constructed by failing CHECK macros. CheckHandler
+// will log information about the failures and abort when it is destructed.
+class CheckHandler {
+public:
+  CheckHandler(const char* check, const char* file, const char* func, int line)
+    : log_(GetErrorLogInstance())
+  {
+    log_ << file << ":" << line << ": " << func << ": Check `"
+          << check << "' failed. ";
+  }
+
+  std::ostream& GetLog() {
+    return log_;
+  }
+
+  BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
+      log_ << std::endl;
+      CallAbortHandler();
+  }
+
+  CheckHandler & operator=(const CheckHandler&) = delete;
+  CheckHandler(const CheckHandler&) = delete;
+  CheckHandler() = delete;
+private:
+  std::ostream& log_;
+};
+
+} // end namespace internal
+} // end namespace benchmark
+
+// The CHECK macro returns a std::ostream object that can have extra information
+// written to it.
+#ifndef NDEBUG
+# define CHECK(b)  (b ? ::benchmark::internal::GetNullLogInstance()        \
+                      : ::benchmark::internal::CheckHandler(               \
+                          #b, __FILE__, __func__, __LINE__).GetLog())
+#else
+# define CHECK(b) ::benchmark::internal::GetNullLogInstance()
+#endif
+
+#define CHECK_EQ(a, b) CHECK((a) == (b))
+#define CHECK_NE(a, b) CHECK((a) != (b))
+#define CHECK_GE(a, b) CHECK((a) >= (b))
+#define CHECK_LE(a, b) CHECK((a) <= (b))
+#define CHECK_GT(a, b) CHECK((a) > (b))
+#define CHECK_LT(a, b) CHECK((a) < (b))
+
+#endif  // CHECK_H_
Index: utils/google-benchmark/src/colorprint.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/colorprint.h
@@ -0,0 +1,27 @@
+#ifndef BENCHMARK_COLORPRINT_H_
+#define BENCHMARK_COLORPRINT_H_
+
+#include <cstdarg>
+#include <string>
+#include <iostream>
+
+namespace benchmark {
+enum LogColor {
+  COLOR_DEFAULT,
+  COLOR_RED,
+  COLOR_GREEN,
+  COLOR_YELLOW,
+  COLOR_BLUE,
+  COLOR_MAGENTA,
+  COLOR_CYAN,
+  COLOR_WHITE
+};
+
+std::string FormatString(const char* msg, va_list args);
+std::string FormatString(const char* msg, ...);
+
+void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...);
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_COLORPRINT_H_
Index: utils/google-benchmark/src/colorprint.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/colorprint.cc
@@ -0,0 +1,158 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "colorprint.h"
+
+#include <cstdarg>
+#include <cstdio>
+#include <cstdarg>
+#include <string>
+#include <memory>
+
+#include "commandlineflags.h"
+#include "check.h"
+#include "internal_macros.h"
+
+#ifdef BENCHMARK_OS_WINDOWS
+#include <Windows.h>
+#endif
+
+DECLARE_bool(color_print);
+
+namespace benchmark {
+namespace {
+#ifdef BENCHMARK_OS_WINDOWS
+typedef WORD PlatformColorCode;
+#else
+typedef const char* PlatformColorCode;
+#endif
+
+PlatformColorCode GetPlatformColorCode(LogColor color) {
+#ifdef BENCHMARK_OS_WINDOWS
+  switch (color) {
+    case COLOR_RED:
+      return FOREGROUND_RED;
+    case COLOR_GREEN:
+      return FOREGROUND_GREEN;
+    case COLOR_YELLOW:
+      return FOREGROUND_RED | FOREGROUND_GREEN;
+    case COLOR_BLUE:
+      return FOREGROUND_BLUE;
+    case COLOR_MAGENTA:
+      return FOREGROUND_BLUE | FOREGROUND_RED;
+    case COLOR_CYAN:
+      return FOREGROUND_BLUE | FOREGROUND_GREEN;
+    case COLOR_WHITE:  // fall through to default
+    default:
+      return 0;
+  }
+#else
+  switch (color) {
+    case COLOR_RED:
+      return "1";
+    case COLOR_GREEN:
+      return "2";
+    case COLOR_YELLOW:
+      return "3";
+    case COLOR_BLUE:
+      return "4";
+    case COLOR_MAGENTA:
+      return "5";
+    case COLOR_CYAN:
+      return "6";
+    case COLOR_WHITE:
+      return "7";
+    default:
+      return nullptr;
+  };
+#endif
+}
+
+}  // end namespace
+
+std::string FormatString(const char *msg, va_list args) {
+  // we might need a second shot at this, so pre-emptivly make a copy
+  va_list args_cp;
+  va_copy(args_cp, args);
+
+  std::size_t size = 256;
+  char local_buff[256];
+  auto ret = std::vsnprintf(local_buff, size, msg, args_cp);
+
+  va_end(args_cp);
+
+  // currently there is no error handling for failure, so this is hack.
+  CHECK(ret >= 0);
+
+  if (ret == 0) // handle empty expansion
+    return {};
+  else if (static_cast<size_t>(ret) < size)
+    return local_buff;
+  else {
+    // we did not provide a long enough buffer on our first attempt.
+    size = (size_t)ret + 1; // + 1 for the null byte
+    std::unique_ptr<char[]> buff(new char[size]);
+    ret = std::vsnprintf(buff.get(), size, msg, args);
+    CHECK(ret > 0 && ((size_t)ret) < size);
+    return buff.get();
+  }
+}
+
+std::string FormatString(const char *msg, ...) {
+  va_list args;
+  va_start(args, msg);
+  auto tmp = FormatString(msg, args);
+  va_end(args);
+  return tmp;
+}
+
+void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+
+  if (!FLAGS_color_print) {
+    out << FormatString(fmt, args);
+    va_end(args);
+    return;
+  }
+
+#ifdef BENCHMARK_OS_WINDOWS
+  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
+
+  // Gets the current text color.
+  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
+  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
+  const WORD old_color_attrs = buffer_info.wAttributes;
+
+  // We need to flush the stream buffers into the console before each
+  // SetConsoleTextAttribute call lest it affect the text that is already
+  // printed but has not yet reached the console.
+  fflush(stdout);
+  SetConsoleTextAttribute(stdout_handle,
+                          GetPlatformColorCode(color) | FOREGROUND_INTENSITY);
+  vprintf(fmt, args);
+
+  fflush(stdout);
+  // Restores the text color.
+  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+#else
+  const char* color_code = GetPlatformColorCode(color);
+  if (color_code) out << FormatString("\033[0;3%sm", color_code);
+  out << FormatString(fmt, args) << "\033[m";
+#endif
+
+  va_end(args);
+}
+
+}  // end namespace benchmark
Index: utils/google-benchmark/src/commandlineflags.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/commandlineflags.h
@@ -0,0 +1,76 @@
+#ifndef BENCHMARK_COMMANDLINEFLAGS_H_
+#define BENCHMARK_COMMANDLINEFLAGS_H_
+
+#include <cstdint>
+#include <string>
+
+// Macro for referencing flags.
+#define FLAG(name) FLAGS_##name
+
+// Macros for declaring flags.
+#define DECLARE_bool(name) extern bool FLAG(name)
+#define DECLARE_int32(name) extern int32_t FLAG(name)
+#define DECLARE_int64(name) extern int64_t FLAG(name)
+#define DECLARE_double(name) extern double FLAG(name)
+#define DECLARE_string(name) extern std::string FLAG(name)
+
+// Macros for defining flags.
+#define DEFINE_bool(name, default_val, doc) bool FLAG(name) = (default_val)
+#define DEFINE_int32(name, default_val, doc) int32_t FLAG(name) = (default_val)
+#define DEFINE_int64(name, default_val, doc) int64_t FLAG(name) = (default_val)
+#define DEFINE_double(name, default_val, doc) double FLAG(name) = (default_val)
+#define DEFINE_string(name, default_val, doc) \
+  std::string FLAG(name) = (default_val)
+
+namespace benchmark {
+// Parses 'str' for a 32-bit signed integer.  If successful, writes the result
+// to *value and returns true; otherwise leaves *value unchanged and returns
+// false.
+bool ParseInt32(const std::string& src_text, const char* str, int32_t* value);
+
+// Parses a bool/Int32/string from the environment variable
+// corresponding to the given Google Test flag.
+bool BoolFromEnv(const char* flag, bool default_val);
+int32_t Int32FromEnv(const char* flag, int32_t default_val);
+double DoubleFromEnv(const char* flag, double default_val);
+const char* StringFromEnv(const char* flag, const char* default_val);
+
+// Parses a string for a bool flag, in the form of either
+// "--flag=value" or "--flag".
+//
+// In the former case, the value is taken as true as long as it does
+// not start with '0', 'f', or 'F'.
+//
+// In the latter case, the value is taken as true.
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseBoolFlag(const char* str, const char* flag, bool* value);
+
+// Parses a string for an Int32 flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseInt32Flag(const char* str, const char* flag, int32_t* value);
+
+// Parses a string for a Double flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseDoubleFlag(const char* str, const char* flag, double* value);
+
+// Parses a string for a string flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseStringFlag(const char* str, const char* flag, std::string* value);
+
+// Returns true if the string matches the flag.
+bool IsFlag(const char* str, const char* flag);
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_COMMANDLINEFLAGS_H_
Index: utils/google-benchmark/src/commandlineflags.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/commandlineflags.cc
@@ -0,0 +1,220 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "commandlineflags.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+
+namespace benchmark {
+// Parses 'str' for a 32-bit signed integer.  If successful, writes
+// the result to *value and returns true; otherwise leaves *value
+// unchanged and returns false.
+bool ParseInt32(const std::string& src_text, const char* str, int32_t* value) {
+  // Parses the environment variable as a decimal integer.
+  char* end = nullptr;
+  const long long_value = strtol(str, &end, 10);  // NOLINT
+
+  // Has strtol() consumed all characters in the string?
+  if (*end != '\0') {
+    // No - an invalid character was encountered.
+    std::cerr << src_text << " is expected to be a 32-bit integer, "
+              << "but actually has value \"" << str << "\".\n";
+    return false;
+  }
+
+  // Is the parsed value in the range of an Int32?
+  const int32_t result = static_cast<int32_t>(long_value);
+  if (long_value == std::numeric_limits<long>::max() ||
+      long_value == std::numeric_limits<long>::min() ||
+      // The parsed value overflows as a long.  (strtol() returns
+      // LONG_MAX or LONG_MIN when the input overflows.)
+      result != long_value
+          // The parsed value overflows as an Int32.
+      ) {
+    std::cerr << src_text << " is expected to be a 32-bit integer, "
+              << "but actually has value \"" << str << "\", "
+              << "which overflows.\n";
+    return false;
+  }
+
+  *value = result;
+  return true;
+}
+
+// Parses 'str' for a double.  If successful, writes the result to *value and
+// returns true; otherwise leaves *value unchanged and returns false.
+bool ParseDouble(const std::string& src_text, const char* str, double* value) {
+  // Parses the environment variable as a decimal integer.
+  char* end = nullptr;
+  const double double_value = strtod(str, &end);  // NOLINT
+
+  // Has strtol() consumed all characters in the string?
+  if (*end != '\0') {
+    // No - an invalid character was encountered.
+    std::cerr << src_text << " is expected to be a double, "
+              << "but actually has value \"" << str << "\".\n";
+    return false;
+  }
+
+  *value = double_value;
+  return true;
+}
+
+inline const char* GetEnv(const char* name) {
+#if defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
+  // Environment variables which we programmatically clear will be set to the
+  // empty string rather than unset (nullptr).  Handle that case.
+  const char* const env = getenv(name);
+  return (env != nullptr && env[0] != '\0') ? env : nullptr;
+#else
+  return getenv(name);
+#endif
+}
+
+// Returns the name of the environment variable corresponding to the
+// given flag.  For example, FlagToEnvVar("foo") will return
+// "BENCHMARK_FOO" in the open-source version.
+static std::string FlagToEnvVar(const char* flag) {
+  const std::string flag_str(flag);
+
+  std::string env_var;
+  for (size_t i = 0; i != flag_str.length(); ++i)
+    env_var += static_cast<char>(::toupper(flag_str.c_str()[i]));
+
+  return "BENCHMARK_" + env_var;
+}
+
+// Reads and returns the Boolean environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+//
+// The value is considered true iff it's not "0".
+bool BoolFromEnv(const char* flag, bool default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const string_value = GetEnv(env_var.c_str());
+  return string_value == nullptr ? default_value : strcmp(string_value, "0") != 0;
+}
+
+// Reads and returns a 32-bit integer stored in the environment
+// variable corresponding to the given flag; if it isn't set or
+// doesn't represent a valid 32-bit integer, returns default_value.
+int32_t Int32FromEnv(const char* flag, int32_t default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const string_value = GetEnv(env_var.c_str());
+  if (string_value == nullptr) {
+    // The environment variable is not set.
+    return default_value;
+  }
+
+  int32_t result = default_value;
+  if (!ParseInt32(std::string("Environment variable ") + env_var, string_value,
+                  &result)) {
+    std::cout << "The default value " << default_value << " is used.\n";
+    return default_value;
+  }
+
+  return result;
+}
+
+// Reads and returns the string environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+const char* StringFromEnv(const char* flag, const char* default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value = GetEnv(env_var.c_str());
+  return value == nullptr ? default_value : value;
+}
+
+// Parses a string as a command line flag.  The string should have
+// the format "--flag=value".  When def_optional is true, the "=value"
+// part can be omitted.
+//
+// Returns the value of the flag, or nullptr if the parsing failed.
+const char* ParseFlagValue(const char* str, const char* flag,
+                           bool def_optional) {
+  // str and flag must not be nullptr.
+  if (str == nullptr || flag == nullptr) return nullptr;
+
+  // The flag must start with "--".
+  const std::string flag_str = std::string("--") + std::string(flag);
+  const size_t flag_len = flag_str.length();
+  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
+
+  // Skips the flag name.
+  const char* flag_end = str + flag_len;
+
+  // When def_optional is true, it's OK to not have a "=value" part.
+  if (def_optional && (flag_end[0] == '\0')) return flag_end;
+
+  // If def_optional is true and there are more characters after the
+  // flag name, or if def_optional is false, there must be a '=' after
+  // the flag name.
+  if (flag_end[0] != '=') return nullptr;
+
+  // Returns the string after "=".
+  return flag_end + 1;
+}
+
+bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, true);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Converts the string value to a bool.
+  *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
+  return true;
+}
+
+bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Sets *value to the value of the flag.
+  return ParseInt32(std::string("The value of flag --") + flag, value_str,
+                    value);
+}
+
+bool ParseDoubleFlag(const char* str, const char* flag, double* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Sets *value to the value of the flag.
+  return ParseDouble(std::string("The value of flag --") + flag, value_str,
+                     value);
+}
+
+bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  *value = value_str;
+  return true;
+}
+
+bool IsFlag(const char* str, const char* flag) {
+  return (ParseFlagValue(str, flag, true) != nullptr);
+}
+}  // end namespace benchmark
Index: utils/google-benchmark/src/complexity.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/complexity.h
@@ -0,0 +1,64 @@
+// Copyright 2016 Ismael Jimenez Martinez. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Source project : https://github.com/ismaelJimenez/cpp.leastsq
+// Adapted to be used with google benchmark
+
+#ifndef COMPLEXITY_H_
+#define COMPLEXITY_H_
+
+#include <string>
+#include <vector>
+
+#include "benchmark/benchmark_api.h"
+#include "benchmark/reporter.h"
+
+namespace benchmark {
+
+// Return a vector containing the mean and standard devation information for
+// the specified list of reports. If 'reports' contains less than two
+// non-errored runs an empty vector is returned
+std::vector<BenchmarkReporter::Run> ComputeStats(
+    const std::vector<BenchmarkReporter::Run>& reports);
+
+// Return a vector containing the bigO and RMS information for the specified
+// list of reports. If 'reports.size() < 2' an empty vector is returned.
+std::vector<BenchmarkReporter::Run> ComputeBigO(
+    const std::vector<BenchmarkReporter::Run>& reports);
+
+// This data structure will contain the result returned by MinimalLeastSq
+//   - coef        : Estimated coeficient for the high-order term as
+//                   interpolated from data.
+//   - rms         : Normalized Root Mean Squared Error.
+//   - complexity  : Scalability form (e.g. oN, oNLogN). In case a scalability
+//                   form has been provided to MinimalLeastSq this will return
+//                   the same value. In case BigO::oAuto has been selected, this
+//                   parameter will return the best fitting curve detected.
+
+struct LeastSq {
+  LeastSq() :
+    coef(0.0),
+    rms(0.0),
+    complexity(oNone) {}
+
+  double coef;
+  double rms;
+  BigO complexity;
+};
+
+// Function to return an string for the calculated complexity
+std::string GetBigOString(BigO complexity);
+
+} // end namespace benchmark
+#endif // COMPLEXITY_H_
Index: utils/google-benchmark/src/complexity.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/complexity.cc
@@ -0,0 +1,283 @@
+// Copyright 2016 Ismael Jimenez Martinez. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Source project : https://github.com/ismaelJimenez/cpp.leastsq
+// Adapted to be used with google benchmark
+
+#include "benchmark/benchmark_api.h"
+
+#include <algorithm>
+#include <cmath>
+#include "check.h"
+#include "complexity.h"
+#include "stat.h"
+
+namespace benchmark {
+
+// Internal function to calculate the different scalability forms
+BigOFunc* FittingCurve(BigO complexity) {
+  switch (complexity) {
+    case oN:
+      return [](int n) -> double { return n; };
+    case oNSquared:
+      return [](int n) -> double { return n * n; };
+    case oNCubed:
+      return [](int n) -> double { return n * n * n; };
+    case oLogN:
+      return [](int n) { return std::log2(n); };
+    case oNLogN:
+      return [](int n) { return n * std::log2(n); };
+    case o1:
+    default:
+      return [](int) { return 1.0; };
+  }
+}
+
+// Function to return an string for the calculated complexity
+std::string GetBigOString(BigO complexity) {
+  switch (complexity) {
+    case oN:
+      return "N";
+    case oNSquared:
+      return "N^2";
+    case oNCubed:
+      return "N^3";
+    case oLogN:
+      return "lgN";
+    case oNLogN:
+      return "NlgN";
+    case o1:
+      return "(1)";
+    default:
+      return "f(N)";
+  }
+}
+
+// Find the coefficient for the high-order term in the running time, by
+// minimizing the sum of squares of relative error, for the fitting curve
+// given by the lambda expresion.
+//   - n             : Vector containing the size of the benchmark tests.
+//   - time          : Vector containing the times for the benchmark tests.
+//   - fitting_curve : lambda expresion (e.g. [](int n) {return n; };).
+
+// For a deeper explanation on the algorithm logic, look the README file at
+// http://github.com/ismaelJimenez/Minimal-Cpp-Least-Squared-Fit
+
+LeastSq MinimalLeastSq(const std::vector<int>& n,
+                       const std::vector<double>& time,
+                       BigOFunc* fitting_curve) {
+  double sigma_gn = 0.0;
+  double sigma_gn_squared = 0.0;
+  double sigma_time = 0.0;
+  double sigma_time_gn = 0.0;
+
+  // Calculate least square fitting parameter
+  for (size_t i = 0; i < n.size(); ++i) {
+    double gn_i = fitting_curve(n[i]);
+    sigma_gn += gn_i;
+    sigma_gn_squared += gn_i * gn_i;
+    sigma_time += time[i];
+    sigma_time_gn += time[i] * gn_i;
+  }
+
+  LeastSq result;
+  result.complexity = oLambda;
+
+  // Calculate complexity.
+  result.coef = sigma_time_gn / sigma_gn_squared;
+
+  // Calculate RMS
+  double rms = 0.0;
+  for (size_t i = 0; i < n.size(); ++i) {
+    double fit = result.coef * fitting_curve(n[i]);
+    rms += pow((time[i] - fit), 2);
+  }
+
+  // Normalized RMS by the mean of the observed values
+  double mean = sigma_time / n.size();
+  result.rms = sqrt(rms / n.size()) / mean;
+
+  return result;
+}
+
+// Find the coefficient for the high-order term in the running time, by
+// minimizing the sum of squares of relative error.
+//   - n          : Vector containing the size of the benchmark tests.
+//   - time       : Vector containing the times for the benchmark tests.
+//   - complexity : If different than oAuto, the fitting curve will stick to
+//                  this one. If it is oAuto, it will be calculated the best
+//                  fitting curve.
+LeastSq MinimalLeastSq(const std::vector<int>& n,
+                       const std::vector<double>& time,
+                       const BigO complexity) {
+  CHECK_EQ(n.size(), time.size());
+  CHECK_GE(n.size(), 2);  // Do not compute fitting curve is less than two
+                          // benchmark runs are given
+  CHECK_NE(complexity, oNone);
+
+  LeastSq best_fit;
+
+  if (complexity == oAuto) {
+    std::vector<BigO> fit_curves = {oLogN, oN, oNLogN, oNSquared, oNCubed};
+
+    // Take o1 as default best fitting curve
+    best_fit = MinimalLeastSq(n, time, FittingCurve(o1));
+    best_fit.complexity = o1;
+
+    // Compute all possible fitting curves and stick to the best one
+    for (const auto& fit : fit_curves) {
+      LeastSq current_fit = MinimalLeastSq(n, time, FittingCurve(fit));
+      if (current_fit.rms < best_fit.rms) {
+        best_fit = current_fit;
+        best_fit.complexity = fit;
+      }
+    }
+  } else {
+    best_fit = MinimalLeastSq(n, time, FittingCurve(complexity));
+    best_fit.complexity = complexity;
+  }
+
+  return best_fit;
+}
+
+std::vector<BenchmarkReporter::Run> ComputeStats(
+    const std::vector<BenchmarkReporter::Run>& reports) {
+  typedef BenchmarkReporter::Run Run;
+  std::vector<Run> results;
+
+  auto error_count =
+      std::count_if(reports.begin(), reports.end(),
+                    [](Run const& run) { return run.error_occurred; });
+
+  if (reports.size() - error_count < 2) {
+    // We don't report aggregated data if there was a single run.
+    return results;
+  }
+  // Accumulators.
+  Stat1_d real_accumulated_time_stat;
+  Stat1_d cpu_accumulated_time_stat;
+  Stat1_d bytes_per_second_stat;
+  Stat1_d items_per_second_stat;
+  // All repetitions should be run with the same number of iterations so we
+  // can take this information from the first benchmark.
+  int64_t const run_iterations = reports.front().iterations;
+
+  // Populate the accumulators.
+  for (Run const& run : reports) {
+    CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
+    CHECK_EQ(run_iterations, run.iterations);
+    if (run.error_occurred) continue;
+    real_accumulated_time_stat +=
+        Stat1_d(run.real_accumulated_time / run.iterations, run.iterations);
+    cpu_accumulated_time_stat +=
+        Stat1_d(run.cpu_accumulated_time / run.iterations, run.iterations);
+    items_per_second_stat += Stat1_d(run.items_per_second, run.iterations);
+    bytes_per_second_stat += Stat1_d(run.bytes_per_second, run.iterations);
+  }
+
+  // Get the data from the accumulator to BenchmarkReporter::Run's.
+  Run mean_data;
+  mean_data.benchmark_name = reports[0].benchmark_name + "_mean";
+  mean_data.iterations = run_iterations;
+  mean_data.real_accumulated_time =
+      real_accumulated_time_stat.Mean() * run_iterations;
+  mean_data.cpu_accumulated_time =
+      cpu_accumulated_time_stat.Mean() * run_iterations;
+  mean_data.bytes_per_second = bytes_per_second_stat.Mean();
+  mean_data.items_per_second = items_per_second_stat.Mean();
+
+  // Only add label to mean/stddev if it is same for all runs
+  mean_data.report_label = reports[0].report_label;
+  for (std::size_t i = 1; i < reports.size(); i++) {
+    if (reports[i].report_label != reports[0].report_label) {
+      mean_data.report_label = "";
+      break;
+    }
+  }
+
+  Run stddev_data;
+  stddev_data.benchmark_name = reports[0].benchmark_name + "_stddev";
+  stddev_data.report_label = mean_data.report_label;
+  stddev_data.iterations = 0;
+  stddev_data.real_accumulated_time = real_accumulated_time_stat.StdDev();
+  stddev_data.cpu_accumulated_time = cpu_accumulated_time_stat.StdDev();
+  stddev_data.bytes_per_second = bytes_per_second_stat.StdDev();
+  stddev_data.items_per_second = items_per_second_stat.StdDev();
+
+  results.push_back(mean_data);
+  results.push_back(stddev_data);
+  return results;
+}
+
+std::vector<BenchmarkReporter::Run> ComputeBigO(
+    const std::vector<BenchmarkReporter::Run>& reports) {
+  typedef BenchmarkReporter::Run Run;
+  std::vector<Run> results;
+
+  if (reports.size() < 2) return results;
+
+  // Accumulators.
+  std::vector<int> n;
+  std::vector<double> real_time;
+  std::vector<double> cpu_time;
+
+  // Populate the accumulators.
+  for (const Run& run : reports) {
+    CHECK_GT(run.complexity_n, 0) << "Did you forget to call SetComplexityN?";
+    n.push_back(run.complexity_n);
+    real_time.push_back(run.real_accumulated_time / run.iterations);
+    cpu_time.push_back(run.cpu_accumulated_time / run.iterations);
+  }
+
+  LeastSq result_cpu;
+  LeastSq result_real;
+
+  if (reports[0].complexity == oLambda) {
+    result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity_lambda);
+    result_real = MinimalLeastSq(n, real_time, reports[0].complexity_lambda);
+  } else {
+    result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity);
+    result_real = MinimalLeastSq(n, real_time, result_cpu.complexity);
+  }
+  std::string benchmark_name =
+      reports[0].benchmark_name.substr(0, reports[0].benchmark_name.find('/'));
+
+  // Get the data from the accumulator to BenchmarkReporter::Run's.
+  Run big_o;
+  big_o.benchmark_name = benchmark_name + "_BigO";
+  big_o.iterations = 0;
+  big_o.real_accumulated_time = result_real.coef;
+  big_o.cpu_accumulated_time = result_cpu.coef;
+  big_o.report_big_o = true;
+  big_o.complexity = result_cpu.complexity;
+
+  double multiplier = GetTimeUnitMultiplier(reports[0].time_unit);
+
+  // Only add label to mean/stddev if it is same for all runs
+  Run rms;
+  big_o.report_label = reports[0].report_label;
+  rms.benchmark_name = benchmark_name + "_RMS";
+  rms.report_label = big_o.report_label;
+  rms.iterations = 0;
+  rms.real_accumulated_time = result_real.rms / multiplier;
+  rms.cpu_accumulated_time = result_cpu.rms / multiplier;
+  rms.report_rms = true;
+  rms.complexity = result_cpu.complexity;
+
+  results.push_back(big_o);
+  results.push_back(rms);
+  return results;
+}
+
+}  // end namespace benchmark
Index: utils/google-benchmark/src/console_reporter.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/console_reporter.cc
@@ -0,0 +1,124 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/reporter.h"
+#include "complexity.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <iostream>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "check.h"
+#include "colorprint.h"
+#include "commandlineflags.h"
+#include "internal_macros.h"
+#include "string_util.h"
+#include "walltime.h"
+
+DECLARE_bool(color_print);
+
+namespace benchmark {
+
+bool ConsoleReporter::ReportContext(const Context& context) {
+  name_field_width_ = context.name_field_width;
+
+  PrintBasicContext(&GetErrorStream(), context);
+
+#ifdef BENCHMARK_OS_WINDOWS
+  if (FLAGS_color_print && &std::cout != &GetOutputStream()) {
+      GetErrorStream() << "Color printing is only supported for stdout on windows."
+                          " Disabling color printing\n";
+      FLAGS_color_print = false;
+  }
+#endif
+  std::string str = FormatString("%-*s %13s %13s %10s\n",
+                             static_cast<int>(name_field_width_), "Benchmark",
+                             "Time", "CPU", "Iterations");
+  GetOutputStream() << str << std::string(str.length() - 1, '-') << "\n";
+
+  return true;
+}
+
+void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
+  for (const auto& run : reports)
+    PrintRunData(run);
+}
+
+void ConsoleReporter::PrintRunData(const Run& result) {
+  auto& Out = GetOutputStream();
+
+  auto name_color =
+      (result.report_big_o || result.report_rms) ? COLOR_BLUE : COLOR_GREEN;
+  ColorPrintf(Out, name_color, "%-*s ", name_field_width_,
+              result.benchmark_name.c_str());
+
+  if (result.error_occurred) {
+    ColorPrintf(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'",
+                result.error_message.c_str());
+    ColorPrintf(Out, COLOR_DEFAULT, "\n");
+    return;
+  }
+  // Format bytes per second
+  std::string rate;
+  if (result.bytes_per_second > 0) {
+    rate = StrCat(" ", HumanReadableNumber(result.bytes_per_second), "B/s");
+  }
+
+  // Format items per second
+  std::string items;
+  if (result.items_per_second > 0) {
+    items = StrCat(" ", HumanReadableNumber(result.items_per_second),
+                   " items/s");
+ }
+
+  const double real_time = result.GetAdjustedRealTime();
+  const double cpu_time = result.GetAdjustedCPUTime();
+
+  if (result.report_big_o) {
+    std::string big_o = GetBigOString(result.complexity);
+    ColorPrintf(Out, COLOR_YELLOW, "%10.2f %s %10.2f %s ", real_time,
+                big_o.c_str(), cpu_time, big_o.c_str());
+  } else if (result.report_rms) {
+    ColorPrintf(Out, COLOR_YELLOW, "%10.0f %% %10.0f %% ", real_time * 100,
+                cpu_time * 100);
+  } else {
+    const char* timeLabel = GetTimeUnitString(result.time_unit);
+    ColorPrintf(Out, COLOR_YELLOW, "%10.0f %s %10.0f %s ", real_time, timeLabel,
+                cpu_time, timeLabel);
+  }
+
+  if (!result.report_big_o && !result.report_rms) {
+    ColorPrintf(Out, COLOR_CYAN, "%10lld", result.iterations);
+  }
+
+  if (!rate.empty()) {
+    ColorPrintf(Out, COLOR_DEFAULT, " %*s", 13, rate.c_str());
+  }
+
+  if (!items.empty()) {
+    ColorPrintf(Out, COLOR_DEFAULT, " %*s", 18, items.c_str());
+  }
+
+  if (!result.report_label.empty()) {
+    ColorPrintf(Out, COLOR_DEFAULT, " %s", result.report_label.c_str());
+  }
+
+  ColorPrintf(Out, COLOR_DEFAULT, "\n");
+}
+
+}  // end namespace benchmark
Index: utils/google-benchmark/src/csv_reporter.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/csv_reporter.cc
@@ -0,0 +1,118 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/reporter.h"
+#include "complexity.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <iostream>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "string_util.h"
+#include "walltime.h"
+
+// File format reference: http://edoceo.com/utilitas/csv-file-format.
+
+namespace benchmark {
+
+namespace {
+std::vector<std::string> elements = {
+  "name",
+  "iterations",
+  "real_time",
+  "cpu_time",
+  "time_unit",
+  "bytes_per_second",
+  "items_per_second",
+  "label",
+  "error_occurred",
+  "error_message"
+};
+}
+
+bool CSVReporter::ReportContext(const Context& context) {
+  PrintBasicContext(&GetErrorStream(), context);
+
+  std::ostream& Out = GetOutputStream();
+  for (auto B = elements.begin(); B != elements.end(); ) {
+    Out << *B++;
+    if (B != elements.end())
+      Out << ",";
+  }
+  Out << "\n";
+  return true;
+}
+
+void CSVReporter::ReportRuns(const std::vector<Run> & reports) {
+  for (const auto& run : reports)
+    PrintRunData(run);
+}
+
+void CSVReporter::PrintRunData(const Run & run) {
+  std::ostream& Out = GetOutputStream();
+
+  // Field with embedded double-quote characters must be doubled and the field
+  // delimited with double-quotes.
+  std::string name = run.benchmark_name;
+  ReplaceAll(&name, "\"", "\"\"");
+  Out << '"' << name << "\",";
+  if (run.error_occurred) {
+    Out << std::string(elements.size() - 3, ',');
+    Out << "true,";
+    std::string msg = run.error_message;
+    ReplaceAll(&msg, "\"", "\"\"");
+    Out << '"' << msg << "\"\n";
+    return;
+  }
+
+  // Do not print iteration on bigO and RMS report
+  if (!run.report_big_o && !run.report_rms) {
+    Out << run.iterations;
+  }
+  Out << ",";
+
+  Out << run.GetAdjustedRealTime() << ",";
+  Out << run.GetAdjustedCPUTime() << ",";
+
+  // Do not print timeLabel on bigO and RMS report
+  if (run.report_big_o) {
+    Out << GetBigOString(run.complexity);
+  } else if (!run.report_rms) {
+    Out << GetTimeUnitString(run.time_unit);
+  }
+  Out << ",";
+
+  if (run.bytes_per_second > 0.0) {
+    Out << run.bytes_per_second;
+  }
+  Out << ",";
+  if (run.items_per_second > 0.0) {
+    Out << run.items_per_second;
+  }
+  Out << ",";
+  if (!run.report_label.empty()) {
+    // Field with embedded double-quote characters must be doubled and the field
+    // delimited with double-quotes.
+    std::string label = run.report_label;
+    ReplaceAll(&label, "\"", "\"\"");
+    Out << "\"" << label << "\"";
+  }
+  Out << ",,";  // for error_occurred and error_message
+  Out << '\n';
+}
+
+}  // end namespace benchmark
Index: utils/google-benchmark/src/cycleclock.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/cycleclock.h
@@ -0,0 +1,145 @@
+// ----------------------------------------------------------------------
+// CycleClock
+//    A CycleClock tells you the current time in Cycles.  The "time"
+//    is actually time since power-on.  This is like time() but doesn't
+//    involve a system call and is much more precise.
+//
+// NOTE: Not all cpu/platform/kernel combinations guarantee that this
+// clock increments at a constant rate or is synchronized across all logical
+// cpus in a system.
+//
+// If you need the above guarantees, please consider using a different
+// API. There are efforts to provide an interface which provides a millisecond
+// granularity and implemented as a memory read. A memory read is generally
+// cheaper than the CycleClock for many architectures.
+//
+// Also, in some out of order CPU implementations, the CycleClock is not
+// serializing. So if you're trying to count at cycles granularity, your
+// data might be inaccurate due to out of order instruction execution.
+// ----------------------------------------------------------------------
+
+#ifndef BENCHMARK_CYCLECLOCK_H_
+#define BENCHMARK_CYCLECLOCK_H_
+
+#include <cstdint>
+
+#include "benchmark/macros.h"
+#include "internal_macros.h"
+
+#if defined(BENCHMARK_OS_MACOSX)
+#include <mach/mach_time.h>
+#endif
+// For MSVC, we want to use '_asm rdtsc' when possible (since it works
+// with even ancient MSVC compilers), and when not possible the
+// __rdtsc intrinsic, declared in <intrin.h>.  Unfortunately, in some
+// environments, <windows.h> and <intrin.h> have conflicting
+// declarations of some other intrinsics, breaking compilation.
+// Therefore, we simply declare __rdtsc ourselves. See also
+// http://connect.microsoft.com/VisualStudio/feedback/details/262047
+#if defined(COMPILER_MSVC) && !defined(_M_IX86)
+extern "C" uint64_t __rdtsc();
+#pragma intrinsic(__rdtsc)
+#endif
+
+#ifndef BENCHMARK_OS_WINDOWS
+#include <sys/time.h>
+#endif
+
+namespace benchmark {
+// NOTE: only i386 and x86_64 have been well tested.
+// PPC, sparc, alpha, and ia64 are based on
+//    http://peter.kuscsik.com/wordpress/?p=14
+// with modifications by m3b.  See also
+//    https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h
+namespace cycleclock {
+// This should return the number of cycles since power-on.  Thread-safe.
+inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
+#if defined(BENCHMARK_OS_MACOSX)
+  // this goes at the top because we need ALL Macs, regardless of
+  // architecture, to return the number of "mach time units" that
+  // have passed since startup.  See sysinfo.cc where
+  // InitializeSystemInfo() sets the supposed cpu clock frequency of
+  // macs to the number of mach time units per second, not actual
+  // CPU clock frequency (which can change in the face of CPU
+  // frequency scaling).  Also note that when the Mac sleeps, this
+  // counter pauses; it does not continue counting, nor does it
+  // reset to zero.
+  return mach_absolute_time();
+#elif defined(__i386__)
+  int64_t ret;
+  __asm__ volatile("rdtsc" : "=A"(ret));
+  return ret;
+#elif defined(__x86_64__) || defined(__amd64__)
+  uint64_t low, high;
+  __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
+  return (high << 32) | low;
+#elif defined(__powerpc__) || defined(__ppc__)
+  // This returns a time-base, which is not always precisely a cycle-count.
+  int64_t tbl, tbu0, tbu1;
+  asm("mftbu %0" : "=r"(tbu0));
+  asm("mftb  %0" : "=r"(tbl));
+  asm("mftbu %0" : "=r"(tbu1));
+  tbl &= -static_cast<int64>(tbu0 == tbu1);
+  // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is garbage)
+  return (tbu1 << 32) | tbl;
+#elif defined(__sparc__)
+  int64_t tick;
+  asm(".byte 0x83, 0x41, 0x00, 0x00");
+  asm("mov   %%g1, %0" : "=r"(tick));
+  return tick;
+#elif defined(__ia64__)
+  int64_t itc;
+  asm("mov %0 = ar.itc" : "=r"(itc));
+  return itc;
+#elif defined(COMPILER_MSVC) && defined(_M_IX86)
+  // Older MSVC compilers (like 7.x) don't seem to support the
+  // __rdtsc intrinsic properly, so I prefer to use _asm instead
+  // when I know it will work.  Otherwise, I'll use __rdtsc and hope
+  // the code is being compiled with a non-ancient compiler.
+  _asm rdtsc
+#elif defined(COMPILER_MSVC)
+  return __rdtsc();
+#elif defined(__aarch64__)
+  // System timer of ARMv8 runs at a different frequency than the CPU's.
+  // The frequency is fixed, typically in the range 1-50MHz.  It can be
+  // read at CNTFRQ special register.  We assume the OS has set up
+  // the virtual timer properly.
+  int64_t virtual_timer_value;
+  asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));
+  return virtual_timer_value;
+#elif defined(__ARM_ARCH)
+#if (__ARM_ARCH >= 6)  // V6 is the earliest arch that has a standard cyclecount
+  uint32_t pmccntr;
+  uint32_t pmuseren;
+  uint32_t pmcntenset;
+  // Read the user mode perf monitor counter access permissions.
+  asm("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
+  if (pmuseren & 1) {  // Allows reading perfmon counters for user mode code.
+    asm("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
+    if (pmcntenset & 0x80000000ul) {  // Is it counting?
+      asm("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
+      // The counter is set up to count every 64th cycle
+      return static_cast<int64_t>(pmccntr) * 64;  // Should optimize to << 6
+    }
+  }
+#endif
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__mips__)
+  // mips apparently only allows rdtsc for superusers, so we fall
+  // back to gettimeofday.  It's possible clock_gettime would be better.
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#else
+// The soft failover to a generic implementation is automatic only for ARM.
+// For other platforms the developer is expected to make an attempt to create
+// a fast implementation and use generic version if nothing better is available.
+#error You need to define CycleTimer for your OS and CPU
+#endif
+}
+}  // end namespace cycleclock
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_CYCLECLOCK_H_
Index: utils/google-benchmark/src/internal_macros.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/internal_macros.h
@@ -0,0 +1,40 @@
+#ifndef BENCHMARK_INTERNAL_MACROS_H_
+#define BENCHMARK_INTERNAL_MACROS_H_
+
+#include "benchmark/macros.h"
+
+#ifndef __has_feature
+# define __has_feature(x) 0
+#endif
+
+#if __has_feature(cxx_attributes)
+# define BENCHMARK_NORETURN [[noreturn]]
+#elif defined(__GNUC__)
+# define BENCHMARK_NORETURN __attribute__((noreturn))
+#else
+# define BENCHMARK_NORETURN
+#endif
+
+#if defined(__CYGWIN__)
+# define BENCHMARK_OS_CYGWIN 1
+#elif defined(_WIN32)
+# define BENCHMARK_OS_WINDOWS 1
+#elif defined(__APPLE__)
+// TODO(ericwf) This doesn't actually check that it is a Mac OSX system. Just
+// that it is an apple system.
+# define BENCHMARK_OS_MACOSX 1
+#elif defined(__FreeBSD__)
+# define BENCHMARK_OS_FREEBSD 1
+#elif defined(__linux__)
+# define BENCHMARK_OS_LINUX 1
+#endif
+
+#if defined(__clang__)
+# define COMPILER_CLANG
+#elif defined(_MSC_VER)
+# define COMPILER_MSVC
+#elif defined(__GNUC__)
+# define COMPILER_GCC
+#endif
+
+#endif // BENCHMARK_INTERNAL_MACROS_H_
Index: utils/google-benchmark/src/json_reporter.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/json_reporter.cc
@@ -0,0 +1,178 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/reporter.h"
+#include "complexity.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <iostream>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "string_util.h"
+#include "walltime.h"
+
+namespace benchmark {
+
+namespace {
+
+std::string FormatKV(std::string const& key, std::string const& value) {
+  return StringPrintF("\"%s\": \"%s\"", key.c_str(), value.c_str());
+}
+
+std::string FormatKV(std::string const& key, const char* value) {
+  return StringPrintF("\"%s\": \"%s\"", key.c_str(), value);
+}
+
+std::string FormatKV(std::string const& key, bool value) {
+  return StringPrintF("\"%s\": %s", key.c_str(), value ? "true" : "false");
+}
+
+std::string FormatKV(std::string const& key, int64_t value) {
+  std::stringstream ss;
+  ss << '"' << key << "\": " << value;
+  return ss.str();
+}
+
+int64_t RoundDouble(double v) {
+    return static_cast<int64_t>(v + 0.5);
+}
+
+} // end namespace
+
+bool JSONReporter::ReportContext(const Context& context) {
+  std::ostream& out = GetOutputStream();
+
+  out << "{\n";
+  std::string inner_indent(2, ' ');
+
+  // Open context block and print context information.
+  out << inner_indent << "\"context\": {\n";
+  std::string indent(4, ' ');
+
+  std::string walltime_value = LocalDateTimeString();
+  out << indent << FormatKV("date", walltime_value) << ",\n";
+
+  out << indent
+      << FormatKV("num_cpus", static_cast<int64_t>(context.num_cpus))
+      << ",\n";
+  out << indent
+      << FormatKV("mhz_per_cpu", RoundDouble(context.mhz_per_cpu))
+      << ",\n";
+  out << indent
+      << FormatKV("cpu_scaling_enabled", context.cpu_scaling_enabled)
+      << ",\n";
+
+#if defined(NDEBUG)
+  const char build_type[] = "release";
+#else
+  const char build_type[] = "debug";
+#endif
+  out << indent << FormatKV("library_build_type", build_type) << "\n";
+  // Close context block and open the list of benchmarks.
+  out << inner_indent << "},\n";
+  out << inner_indent << "\"benchmarks\": [\n";
+  return true;
+}
+
+void JSONReporter::ReportRuns(std::vector<Run> const& reports) {
+  if (reports.empty()) {
+    return;
+  }
+  std::string indent(4, ' ');
+  std::ostream& out = GetOutputStream();
+  if (!first_report_) {
+    out << ",\n";
+  }
+  first_report_ = false;
+
+  for (auto it = reports.begin(); it != reports.end(); ++it) {
+    out << indent << "{\n";
+    PrintRunData(*it);
+    out << indent << '}';
+    auto it_cp = it;
+    if (++it_cp != reports.end()) {
+      out << ",\n";
+    }
+  }
+}
+
+void JSONReporter::Finalize() {
+  // Close the list of benchmarks and the top level object.
+  GetOutputStream() << "\n  ]\n}\n";
+}
+
+void JSONReporter::PrintRunData(Run const& run) {
+  std::string indent(6, ' ');
+  std::ostream& out = GetOutputStream();
+    out << indent
+        << FormatKV("name", run.benchmark_name)
+        << ",\n";
+    if (run.error_occurred) {
+        out << indent
+            << FormatKV("error_occurred", run.error_occurred)
+            << ",\n";
+        out << indent
+            << FormatKV("error_message", run.error_message)
+            << ",\n";
+    }
+  if (!run.report_big_o && !run.report_rms) {
+        out << indent
+            << FormatKV("iterations", run.iterations)
+            << ",\n";
+        out << indent
+            << FormatKV("real_time", RoundDouble(run.GetAdjustedRealTime()))
+            << ",\n";
+        out << indent
+            << FormatKV("cpu_time", RoundDouble(run.GetAdjustedCPUTime()));
+        out << ",\n" << indent
+            << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
+  } else if (run.report_big_o) {
+    out << indent
+        << FormatKV("cpu_coefficient", RoundDouble(run.GetAdjustedCPUTime()))
+        << ",\n";
+    out << indent
+        << FormatKV("real_coefficient", RoundDouble(run.GetAdjustedRealTime()))
+        << ",\n";
+    out << indent
+            << FormatKV("big_o", GetBigOString(run.complexity))
+            << ",\n";
+        out << indent
+            << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
+    } else if(run.report_rms) {
+        out << indent
+            << FormatKV("rms", RoundDouble(run.GetAdjustedCPUTime()*100))
+            << '%';
+  }
+  if (run.bytes_per_second > 0.0) {
+    out << ",\n"
+        << indent
+        << FormatKV("bytes_per_second", RoundDouble(run.bytes_per_second));
+  }
+  if (run.items_per_second > 0.0) {
+    out << ",\n"
+        << indent
+        << FormatKV("items_per_second", RoundDouble(run.items_per_second));
+  }
+  if (!run.report_label.empty()) {
+    out << ",\n"
+        << indent
+        << FormatKV("label", run.report_label);
+  }
+  out << '\n';
+}
+
+}  // end namespace benchmark
Index: utils/google-benchmark/src/log.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/log.h
@@ -0,0 +1,28 @@
+#ifndef BENCHMARK_LOG_H_
+#define BENCHMARK_LOG_H_
+
+#include <ostream>
+
+namespace benchmark {
+namespace internal {
+
+int GetLogLevel();
+void SetLogLevel(int level);
+
+std::ostream& GetNullLogInstance();
+std::ostream& GetErrorLogInstance();
+
+inline std::ostream& GetLogInstanceForLevel(int level) {
+  if (level <= GetLogLevel()) {
+    return GetErrorLogInstance();
+  }
+  return GetNullLogInstance();
+}
+
+} // end namespace internal
+} // end namespace benchmark
+
+#define VLOG(x) (::benchmark::internal::GetLogInstanceForLevel(x) \
+                 << "-- LOG(" << x << "): ")
+
+#endif
\ No newline at end of file
Index: utils/google-benchmark/src/log.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/log.cc
@@ -0,0 +1,40 @@
+#include "log.h"
+
+#include <iostream>
+
+namespace benchmark {
+namespace internal {
+
+int& LoggingLevelImp() {
+    static int level = 0;
+    return level;
+}
+
+void SetLogLevel(int value) {
+    LoggingLevelImp() = value;
+}
+
+int GetLogLevel() {
+    return LoggingLevelImp();
+}
+
+class NullLogBuffer : public std::streambuf
+{
+public:
+  int overflow(int c) {
+    return c;
+  }
+};
+
+std::ostream& GetNullLogInstance() {
+  static NullLogBuffer log_buff;
+  static std::ostream null_log(&log_buff);
+  return null_log;
+}
+
+std::ostream& GetErrorLogInstance() {
+  return std::clog;
+}
+
+} // end namespace internal
+} // end namespace benchmark
\ No newline at end of file
Index: utils/google-benchmark/src/mutex.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/mutex.h
@@ -0,0 +1,142 @@
+#ifndef BENCHMARK_MUTEX_H_
+#define BENCHMARK_MUTEX_H_
+
+#include <mutex>
+#include <condition_variable>
+
+// Enable thread safety attributes only with clang.
+// The attributes can be safely erased when compiling with other compilers.
+#if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
+#define THREAD_ANNOTATION_ATTRIBUTE__(x)   __attribute__((x))
+#else
+#define THREAD_ANNOTATION_ATTRIBUTE__(x)   // no-op
+#endif
+
+#define CAPABILITY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
+
+#define SCOPED_CAPABILITY \
+  THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+
+#define GUARDED_BY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
+
+#define PT_GUARDED_BY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
+
+#define ACQUIRED_BEFORE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
+
+#define ACQUIRED_AFTER(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
+
+#define REQUIRES(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
+
+#define REQUIRES_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
+
+#define ACQUIRE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
+
+#define ACQUIRE_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
+
+#define RELEASE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
+
+#define RELEASE_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
+
+#define TRY_ACQUIRE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
+
+#define TRY_ACQUIRE_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
+
+#define EXCLUDES(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
+
+#define ASSERT_CAPABILITY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
+
+#define ASSERT_SHARED_CAPABILITY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x))
+
+#define RETURN_CAPABILITY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
+
+#define NO_THREAD_SAFETY_ANALYSIS \
+  THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
+
+
+namespace benchmark {
+
+typedef std::condition_variable Condition;
+
+// NOTE: Wrappers for std::mutex and std::unique_lock are provided so that
+// we can annotate them with thread safety attributes and use the
+// -Wthread-safety warning with clang. The standard library types cannot be
+// used directly because they do not provided the required annotations.
+class CAPABILITY("mutex") Mutex
+{
+public:
+  Mutex() {}
+
+  void lock() ACQUIRE() { mut_.lock(); }
+  void unlock() RELEASE() { mut_.unlock(); }
+  std::mutex& native_handle() {
+    return mut_;
+  }
+private:
+  std::mutex mut_;
+};
+
+
+class SCOPED_CAPABILITY MutexLock
+{
+  typedef std::unique_lock<std::mutex> MutexLockImp;
+public:
+  MutexLock(Mutex& m) ACQUIRE(m) : ml_(m.native_handle())
+  { }
+  ~MutexLock() RELEASE() {}
+  MutexLockImp& native_handle() { return ml_; }
+private:
+  MutexLockImp ml_;
+};
+
+
+class Notification
+{
+public:
+  Notification() : notified_yet_(false) { }
+
+  void WaitForNotification() const EXCLUDES(mutex_) {
+    MutexLock m_lock(mutex_);
+    auto notified_fn = [this]() REQUIRES(mutex_) {
+                            return this->HasBeenNotified();
+                        };
+    cv_.wait(m_lock.native_handle(), notified_fn);
+  }
+
+  void Notify() EXCLUDES(mutex_) {
+    {
+      MutexLock lock(mutex_);
+      notified_yet_ = 1;
+    }
+    cv_.notify_all();
+  }
+
+private:
+  bool HasBeenNotified() const REQUIRES(mutex_) {
+    return notified_yet_;
+  }
+
+  mutable Mutex mutex_;
+  mutable std::condition_variable cv_;
+  bool notified_yet_ GUARDED_BY(mutex_);
+};
+
+} // end namespace benchmark
+
+#endif // BENCHMARK_MUTEX_H_
Index: utils/google-benchmark/src/re.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/re.h
@@ -0,0 +1,60 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BENCHMARK_RE_H_
+#define BENCHMARK_RE_H_
+
+#if defined(HAVE_STD_REGEX)
+#include <regex>
+#elif defined(HAVE_GNU_POSIX_REGEX)
+#include <gnuregex.h>
+#elif defined(HAVE_POSIX_REGEX)
+#include <regex.h>
+#else
+#error No regular expression backend was found!
+#endif
+#include <string>
+
+namespace benchmark {
+
+// A wrapper around the POSIX regular expression API that provides automatic
+// cleanup
+class Regex {
+ public:
+  Regex();
+  ~Regex();
+
+  // Compile a regular expression matcher from spec.  Returns true on success.
+  //
+  // On failure (and if error is not nullptr), error is populated with a human
+  // readable error message if an error occurs.
+  bool Init(const std::string& spec, std::string* error);
+
+  // Returns whether str matches the compiled regular expression.
+  bool Match(const std::string& str);
+ private:
+  bool init_;
+  // Underlying regular expression object
+#if defined(HAVE_STD_REGEX)
+  std::regex re_;
+#elif defined(HAVE_POSIX_REGEX) || defined(HAVE_GNU_POSIX_REGEX)
+  regex_t re_;
+#else
+# error No regular expression backend implementation available
+#endif
+};
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_RE_H_
Index: utils/google-benchmark/src/re_posix.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/re_posix.cc
@@ -0,0 +1,59 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "check.h"
+#include "re.h"
+
+namespace benchmark {
+
+Regex::Regex() : init_(false) { }
+
+bool Regex::Init(const std::string& spec, std::string* error) {
+  int ec = regcomp(&re_, spec.c_str(), REG_EXTENDED | REG_NOSUB);
+  if (ec != 0) {
+    if (error) {
+      size_t needed = regerror(ec, &re_, nullptr, 0);
+      char* errbuf = new char[needed];
+      regerror(ec, &re_, errbuf, needed);
+
+      // regerror returns the number of bytes necessary to null terminate
+      // the string, so we move that when assigning to error.
+      CHECK_NE(needed, 0);
+      error->assign(errbuf, needed - 1);
+
+      delete[] errbuf;
+    }
+
+    return false;
+  }
+
+  init_ = true;
+  return true;
+}
+
+Regex::~Regex() {
+  if (init_) {
+    regfree(&re_);
+  }
+}
+
+bool Regex::Match(const std::string& str) {
+  if (!init_) {
+    return false;
+  }
+
+  return regexec(&re_, str.c_str(), 0, nullptr, 0) == 0;
+}
+
+}  // end namespace benchmark
Index: utils/google-benchmark/src/re_std.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/re_std.cc
@@ -0,0 +1,44 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "re.h"
+
+namespace benchmark {
+
+Regex::Regex() : init_(false) { }
+
+bool Regex::Init(const std::string& spec, std::string* error) {
+  try {
+    re_ = std::regex(spec, std::regex_constants::extended);
+
+    init_ = true;
+  } catch (const std::regex_error& e) {
+    if (error) {
+      *error = e.what();
+    }
+  }
+  return init_;
+}
+
+Regex::~Regex() { }
+
+bool Regex::Match(const std::string& str) {
+  if (!init_) {
+    return false;
+  }
+
+  return std::regex_search(str, re_);
+}
+
+}  // end namespace benchmark
Index: utils/google-benchmark/src/reporter.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/reporter.cc
@@ -0,0 +1,75 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/reporter.h"
+#include "walltime.h"
+
+#include <cstdlib>
+
+#include <iostream>
+#include <vector>
+#include <tuple>
+
+#include "check.h"
+#include "stat.h"
+
+namespace benchmark {
+
+BenchmarkReporter::BenchmarkReporter()
+    : output_stream_(&std::cout), error_stream_(&std::cerr)
+{
+}
+
+BenchmarkReporter::~BenchmarkReporter() {
+}
+
+void BenchmarkReporter::PrintBasicContext(std::ostream *out_ptr,
+                                          Context const &context) {
+  CHECK(out_ptr) << "cannot be null";
+  auto& Out = *out_ptr;
+
+  Out << "Run on (" << context.num_cpus << " X " << context.mhz_per_cpu
+            << " MHz CPU " << ((context.num_cpus > 1) ? "s" : "") << ")\n";
+
+  Out << LocalDateTimeString() << "\n";
+
+  if (context.cpu_scaling_enabled) {
+    Out << "***WARNING*** CPU scaling is enabled, the benchmark "
+                 "real time measurements may be noisy and will incur extra "
+                 "overhead.\n";
+  }
+
+#ifndef NDEBUG
+  Out << "***WARNING*** Library was built as DEBUG. Timings may be "
+               "affected.\n";
+#endif
+}
+
+double BenchmarkReporter::Run::GetAdjustedRealTime() const {
+  double new_time = real_accumulated_time * GetTimeUnitMultiplier(time_unit);
+  if (iterations != 0)
+    new_time /= static_cast<double>(iterations);
+  return new_time;
+}
+
+double BenchmarkReporter::Run::GetAdjustedCPUTime() const {
+  double new_time = cpu_accumulated_time * GetTimeUnitMultiplier(time_unit);
+  if (iterations != 0)
+    new_time /= static_cast<double>(iterations);
+  return new_time;
+}
+
+
+
+} // end namespace benchmark
Index: utils/google-benchmark/src/sleep.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/sleep.h
@@ -0,0 +1,17 @@
+#ifndef BENCHMARK_SLEEP_H_
+#define BENCHMARK_SLEEP_H_
+
+#include <cstdint>
+
+namespace benchmark {
+const int64_t kNumMillisPerSecond = 1000LL;
+const int64_t kNumMicrosPerMilli = 1000LL;
+const int64_t kNumMicrosPerSecond = kNumMillisPerSecond * 1000LL;
+const int64_t kNumNanosPerMicro = 1000LL;
+const int64_t kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;
+
+void SleepForMilliseconds(int milliseconds);
+void SleepForSeconds(double seconds);
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_SLEEP_H_
Index: utils/google-benchmark/src/sleep.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/sleep.cc
@@ -0,0 +1,50 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "sleep.h"
+
+#include <cerrno>
+#include <ctime>
+
+#include "internal_macros.h"
+
+#ifdef BENCHMARK_OS_WINDOWS
+#include <Windows.h>
+#endif
+
+namespace benchmark {
+#ifdef BENCHMARK_OS_WINDOWS
+// Window's Sleep takes milliseconds argument.
+void SleepForMilliseconds(int milliseconds) { Sleep(milliseconds); }
+void SleepForSeconds(double seconds) {
+  SleepForMilliseconds(static_cast<int>(kNumMillisPerSecond * seconds));
+}
+#else   // BENCHMARK_OS_WINDOWS
+void SleepForMicroseconds(int microseconds) {
+  struct timespec sleep_time;
+  sleep_time.tv_sec = microseconds / kNumMicrosPerSecond;
+  sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro;
+  while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR)
+    ;  // Ignore signals and wait for the full interval to elapse.
+}
+
+void SleepForMilliseconds(int milliseconds) {
+  SleepForMicroseconds(static_cast<int>(milliseconds) * kNumMicrosPerMilli);
+}
+
+void SleepForSeconds(double seconds) {
+  SleepForMicroseconds(static_cast<int>(seconds * kNumMicrosPerSecond));
+}
+#endif  // BENCHMARK_OS_WINDOWS
+}  // end namespace benchmark
Index: utils/google-benchmark/src/stat.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/stat.h
@@ -0,0 +1,307 @@
+#ifndef BENCHMARK_STAT_H_
+#define BENCHMARK_STAT_H_
+
+#include <cmath>
+#include <limits>
+#include <ostream>
+#include <type_traits>
+
+
+namespace benchmark {
+
+template <typename VType, typename NumType>
+class Stat1;
+
+template <typename VType, typename NumType>
+class Stat1MinMax;
+
+typedef Stat1<float, int64_t> Stat1_f;
+typedef Stat1<double, int64_t> Stat1_d;
+typedef Stat1MinMax<float, int64_t> Stat1MinMax_f;
+typedef Stat1MinMax<double, int64_t> Stat1MinMax_d;
+
+template <typename VType>
+class Vector2;
+template <typename VType>
+class Vector3;
+template <typename VType>
+class Vector4;
+
+template <typename VType, typename NumType>
+class Stat1 {
+ public:
+  typedef Stat1<VType, NumType> Self;
+
+  Stat1() { Clear(); }
+  // Create a sample of value dat and weight 1
+  explicit Stat1(const VType &dat) {
+    sum_ = dat;
+    sum_squares_ = Sqr(dat);
+    numsamples_ = 1;
+  }
+  // Create statistics for all the samples between begin (included)
+  // and end(excluded)
+  explicit Stat1(const VType *begin, const VType *end) {
+    Clear();
+    for (const VType *item = begin; item < end; ++item) {
+      (*this) += Stat1(*item);
+    }
+  }
+  // Create a sample of value dat and weight w
+  Stat1(const VType &dat, const NumType &w) {
+    sum_ = w * dat;
+    sum_squares_ = w * Sqr(dat);
+    numsamples_ = w;
+  }
+  // Copy operator
+  Stat1(const Self &stat) {
+    sum_ = stat.sum_;
+    sum_squares_ = stat.sum_squares_;
+    numsamples_ = stat.numsamples_;
+  }
+
+  void Clear() {
+    numsamples_ = NumType();
+    sum_squares_ = sum_ = VType();
+  }
+
+  Self &operator=(const Self &stat) {
+    sum_ = stat.sum_;
+    sum_squares_ = stat.sum_squares_;
+    numsamples_ = stat.numsamples_;
+    return (*this);
+  }
+  // Merge statistics from two sample sets.
+  Self &operator+=(const Self &stat) {
+    sum_ += stat.sum_;
+    sum_squares_ += stat.sum_squares_;
+    numsamples_ += stat.numsamples_;
+    return (*this);
+  }
+  // The operation opposite to +=
+  Self &operator-=(const Self &stat) {
+    sum_ -= stat.sum_;
+    sum_squares_ -= stat.sum_squares_;
+    numsamples_ -= stat.numsamples_;
+    return (*this);
+  }
+  // Multiply the weight of the set of samples by a factor k
+  Self &operator*=(const VType &k) {
+    sum_ *= k;
+    sum_squares_ *= k;
+    numsamples_ *= k;
+    return (*this);
+  }
+
+  // Merge statistics from two sample sets.
+  Self operator+(const Self &stat) const { return Self(*this) += stat; }
+
+  // The operation opposite to +
+  Self operator-(const Self &stat) const { return Self(*this) -= stat; }
+
+  // Multiply the weight of the set of samples by a factor k
+  Self operator*(const VType &k) const { return Self(*this) *= k; }
+
+  // Return the total weight of this sample set
+  NumType numSamples() const { return numsamples_; }
+
+  // Return the sum of this sample set
+  VType Sum() const { return sum_; }
+
+  // Return the mean of this sample set
+  VType Mean() const {
+    if (numsamples_ == 0) return VType();
+    return sum_ * (1.0 / numsamples_);
+  }
+
+  // Return the mean of this sample set and compute the standard deviation at
+  // the same time.
+  VType Mean(VType *stddev) const {
+    if (numsamples_ == 0) return VType();
+    VType mean = sum_ * (1.0 / numsamples_);
+    if (stddev) {
+      VType avg_squares = sum_squares_ * (1.0 / numsamples_);
+      *stddev = Sqrt(avg_squares - Sqr(mean));
+    }
+    return mean;
+  }
+
+  // Return the standard deviation of the sample set
+  VType StdDev() const {
+    if (numsamples_ == 0) return VType();
+    VType mean = Mean();
+    VType avg_squares = sum_squares_ * (1.0 / numsamples_);
+    return Sqrt(avg_squares - Sqr(mean));
+  }
+
+ private:
+  static_assert(std::is_integral<NumType>::value &&
+                !std::is_same<NumType, bool>::value,
+                "NumType must be an integral type that is not bool.");
+  // Let i be the index of the samples provided (using +=)
+  // and weight[i],value[i] be the data of sample #i
+  // then the variables have the following meaning:
+  NumType numsamples_;  // sum of weight[i];
+  VType sum_;           // sum of weight[i]*value[i];
+  VType sum_squares_;   // sum of weight[i]*value[i]^2;
+
+  // Template function used to square a number.
+  // For a vector we square all components
+  template <typename SType>
+  static inline SType Sqr(const SType &dat) {
+    return dat * dat;
+  }
+
+  template <typename SType>
+  static inline Vector2<SType> Sqr(const Vector2<SType> &dat) {
+    return dat.MulComponents(dat);
+  }
+
+  template <typename SType>
+  static inline Vector3<SType> Sqr(const Vector3<SType> &dat) {
+    return dat.MulComponents(dat);
+  }
+
+  template <typename SType>
+  static inline Vector4<SType> Sqr(const Vector4<SType> &dat) {
+    return dat.MulComponents(dat);
+  }
+
+  // Template function used to take the square root of a number.
+  // For a vector we square all components
+  template <typename SType>
+  static inline SType Sqrt(const SType &dat) {
+    // Avoid NaN due to imprecision in the calculations
+    if (dat < 0) return 0;
+    return sqrt(dat);
+  }
+
+  template <typename SType>
+  static inline Vector2<SType> Sqrt(const Vector2<SType> &dat) {
+    // Avoid NaN due to imprecision in the calculations
+    return Max(dat, Vector2<SType>()).Sqrt();
+  }
+
+  template <typename SType>
+  static inline Vector3<SType> Sqrt(const Vector3<SType> &dat) {
+    // Avoid NaN due to imprecision in the calculations
+    return Max(dat, Vector3<SType>()).Sqrt();
+  }
+
+  template <typename SType>
+  static inline Vector4<SType> Sqrt(const Vector4<SType> &dat) {
+    // Avoid NaN due to imprecision in the calculations
+    return Max(dat, Vector4<SType>()).Sqrt();
+  }
+};
+
+// Useful printing function
+template <typename VType, typename NumType>
+std::ostream &operator<<(std::ostream &out, const Stat1<VType, NumType> &s) {
+  out << "{ avg = " << s.Mean() << " std = " << s.StdDev()
+      << " nsamples = " << s.NumSamples() << "}";
+  return out;
+}
+
+// Stat1MinMax: same as Stat1, but it also
+// keeps the Min and Max values; the "-"
+// operator is disabled because it cannot be implemented
+// efficiently
+template <typename VType, typename NumType>
+class Stat1MinMax : public Stat1<VType, NumType> {
+ public:
+  typedef Stat1MinMax<VType, NumType> Self;
+
+  Stat1MinMax() { Clear(); }
+  // Create a sample of value dat and weight 1
+  explicit Stat1MinMax(const VType &dat) : Stat1<VType, NumType>(dat) {
+    max_ = dat;
+    min_ = dat;
+  }
+  // Create statistics for all the samples between begin (included)
+  // and end(excluded)
+  explicit Stat1MinMax(const VType *begin, const VType *end) {
+    Clear();
+    for (const VType *item = begin; item < end; ++item) {
+      (*this) += Stat1MinMax(*item);
+    }
+  }
+  // Create a sample of value dat and weight w
+  Stat1MinMax(const VType &dat, const NumType &w)
+      : Stat1<VType, NumType>(dat, w) {
+    max_ = dat;
+    min_ = dat;
+  }
+  // Copy operator
+  Stat1MinMax(const Self &stat) : Stat1<VType, NumType>(stat) {
+    max_ = stat.max_;
+    min_ = stat.min_;
+  }
+
+  void Clear() {
+    Stat1<VType, NumType>::Clear();
+    if (std::numeric_limits<VType>::has_infinity) {
+      min_ = std::numeric_limits<VType>::infinity();
+      max_ = -std::numeric_limits<VType>::infinity();
+    } else {
+      min_ = std::numeric_limits<VType>::max();
+      max_ = std::numeric_limits<VType>::min();
+    }
+  }
+
+  Self &operator=(const Self &stat) {
+    this->Stat1<VType, NumType>::operator=(stat);
+    max_ = stat.max_;
+    min_ = stat.min_;
+    return (*this);
+  }
+  // Merge statistics from two sample sets.
+  Self &operator+=(const Self &stat) {
+    this->Stat1<VType, NumType>::operator+=(stat);
+    if (stat.max_ > max_) max_ = stat.max_;
+    if (stat.min_ < min_) min_ = stat.min_;
+    return (*this);
+  }
+  // Multiply the weight of the set of samples by a factor k
+  Self &operator*=(const VType &stat) {
+    this->Stat1<VType, NumType>::operator*=(stat);
+    return (*this);
+  }
+  // Merge statistics from two sample sets.
+  Self operator+(const Self &stat) const { return Self(*this) += stat; }
+  // Multiply the weight of the set of samples by a factor k
+  Self operator*(const VType &k) const { return Self(*this) *= k; }
+
+  // Return the maximal value in this sample set
+  VType Max() const { return max_; }
+  // Return the minimal value in this sample set
+  VType Min() const { return min_; }
+
+ private:
+  // The - operation makes no sense with Min/Max
+  // unless we keep the full list of values (but we don't)
+  // make it private, and let it undefined so nobody can call it
+  Self &operator-=(const Self &stat);  // senseless. let it undefined.
+
+  // The operation opposite to -
+  Self operator-(const Self &stat) const;  // senseless. let it undefined.
+
+  // Let i be the index of the samples provided (using +=)
+  // and weight[i],value[i] be the data of sample #i
+  // then the variables have the following meaning:
+  VType max_;  // max of value[i]
+  VType min_;  // min of value[i]
+};
+
+// Useful printing function
+template <typename VType, typename NumType>
+std::ostream &operator<<(std::ostream &out,
+                         const Stat1MinMax<VType, NumType> &s) {
+  out << "{ avg = " << s.Mean() << " std = " << s.StdDev()
+      << " nsamples = " << s.NumSamples() << " min = " << s.Min()
+      << " max = " << s.Max() << "}";
+  return out;
+}
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_STAT_H_
Index: utils/google-benchmark/src/string_util.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/string_util.h
@@ -0,0 +1,44 @@
+#ifndef BENCHMARK_STRING_UTIL_H_
+#define BENCHMARK_STRING_UTIL_H_
+
+#include <string>
+#include <sstream>
+#include <utility>
+#include "internal_macros.h"
+
+namespace benchmark {
+
+void AppendHumanReadable(int n, std::string* str);
+
+std::string HumanReadableNumber(double n);
+
+std::string StringPrintF(const char* format, ...);
+
+inline std::ostream&
+StringCatImp(std::ostream& out) BENCHMARK_NOEXCEPT
+{
+  return out;
+}
+
+template <class First, class ...Rest>
+inline std::ostream&
+StringCatImp(std::ostream& out, First&& f, Rest&&... rest)
+{
+  out << std::forward<First>(f);
+  return StringCatImp(out, std::forward<Rest>(rest)...);
+}
+
+template<class ...Args>
+inline std::string StrCat(Args&&... args)
+{
+  std::ostringstream ss;
+  StringCatImp(ss, std::forward<Args>(args)...);
+  return ss.str();
+}
+
+void ReplaceAll(std::string* str, const std::string& from,
+                const std::string& to);
+
+} // end namespace benchmark
+
+#endif // BENCHMARK_STRING_UTIL_H_
Index: utils/google-benchmark/src/string_util.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/string_util.cc
@@ -0,0 +1,169 @@
+#include "string_util.h"
+
+#include <cmath>
+#include <cstdarg>
+#include <array>
+#include <memory>
+#include <sstream>
+#include <stdio.h>
+
+#include "arraysize.h"
+
+namespace benchmark {
+namespace {
+
+// kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta.
+const char kBigSIUnits[] = "kMGTPEZY";
+// Kibi, Mebi, Gibi, Tebi, Pebi, Exbi, Zebi, Yobi.
+const char kBigIECUnits[] = "KMGTPEZY";
+// milli, micro, nano, pico, femto, atto, zepto, yocto.
+const char kSmallSIUnits[] = "munpfazy";
+
+// We require that all three arrays have the same size.
+static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits),
+              "SI and IEC unit arrays must be the same size");
+static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits),
+              "Small SI and Big SI unit arrays must be the same size");
+
+static const int64_t kUnitsSize = arraysize(kBigSIUnits);
+
+} // end anonymous namespace
+
+void ToExponentAndMantissa(double val, double thresh, int precision,
+                           double one_k, std::string* mantissa,
+                           int64_t* exponent) {
+  std::stringstream mantissa_stream;
+
+  if (val < 0) {
+    mantissa_stream << "-";
+    val = -val;
+  }
+
+  // Adjust threshold so that it never excludes things which can't be rendered
+  // in 'precision' digits.
+  const double adjusted_threshold =
+      std::max(thresh, 1.0 / std::pow(10.0, precision));
+  const double big_threshold = adjusted_threshold * one_k;
+  const double small_threshold = adjusted_threshold;
+
+  if (val > big_threshold) {
+    // Positive powers
+    double scaled = val;
+    for (size_t i = 0; i < arraysize(kBigSIUnits); ++i) {
+      scaled /= one_k;
+      if (scaled <= big_threshold) {
+        mantissa_stream << scaled;
+        *exponent = i + 1;
+        *mantissa = mantissa_stream.str();
+        return;
+      }
+    }
+    mantissa_stream << val;
+    *exponent = 0;
+  } else if (val < small_threshold) {
+    // Negative powers
+    double scaled = val;
+    for (size_t i = 0; i < arraysize(kSmallSIUnits); ++i) {
+      scaled *= one_k;
+      if (scaled >= small_threshold) {
+        mantissa_stream << scaled;
+        *exponent = -static_cast<int64_t>(i + 1);
+        *mantissa = mantissa_stream.str();
+        return;
+      }
+    }
+    mantissa_stream << val;
+    *exponent = 0;
+  } else {
+    mantissa_stream << val;
+    *exponent = 0;
+  }
+  *mantissa = mantissa_stream.str();
+}
+
+std::string ExponentToPrefix(int64_t exponent, bool iec) {
+  if (exponent == 0) return "";
+
+  const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1);
+  if (index >= kUnitsSize) return "";
+
+  const char* array =
+      (exponent > 0 ? (iec ? kBigIECUnits : kBigSIUnits) : kSmallSIUnits);
+  if (iec)
+    return array[index] + std::string("i");
+  else
+    return std::string(1, array[index]);
+}
+
+std::string ToBinaryStringFullySpecified(double value, double threshold,
+                                         int precision) {
+  std::string mantissa;
+  int64_t exponent;
+  ToExponentAndMantissa(value, threshold, precision, 1024.0, &mantissa,
+                        &exponent);
+  return mantissa + ExponentToPrefix(exponent, false);
+}
+
+void AppendHumanReadable(int n, std::string* str) {
+  std::stringstream ss;
+  // Round down to the nearest SI prefix.
+  ss << "/" << ToBinaryStringFullySpecified(n, 1.0, 0);
+  *str += ss.str();
+}
+
+std::string HumanReadableNumber(double n) {
+  // 1.1 means that figures up to 1.1k should be shown with the next unit down;
+  // this softens edge effects.
+  // 1 means that we should show one decimal place of precision.
+  return ToBinaryStringFullySpecified(n, 1.1, 1);
+}
+
+std::string StringPrintFImp(const char *msg, va_list args)
+{
+  // we might need a second shot at this, so pre-emptivly make a copy
+  va_list args_cp;
+  va_copy(args_cp, args);
+
+  // TODO(ericwf): use std::array for first attempt to avoid one memory
+  // allocation guess what the size might be
+  std::array<char, 256> local_buff;
+  std::size_t size = local_buff.size();
+  // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation in the android-ndk
+  auto ret = vsnprintf(local_buff.data(), size, msg, args_cp);
+
+  va_end(args_cp);
+
+  // handle empty expansion
+  if (ret == 0)
+    return std::string{};
+  if (static_cast<std::size_t>(ret) < size)
+    return std::string(local_buff.data());
+
+  // we did not provide a long enough buffer on our first attempt.
+  // add 1 to size to account for null-byte in size cast to prevent overflow
+  size = static_cast<std::size_t>(ret) + 1;
+  auto buff_ptr = std::unique_ptr<char[]>(new char[size]);
+  // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation in the android-ndk
+  ret = vsnprintf(buff_ptr.get(), size, msg, args);
+  return std::string(buff_ptr.get());
+}
+
+std::string StringPrintF(const char* format, ...)
+{
+  va_list args;
+  va_start(args, format);
+  std::string tmp = StringPrintFImp(format, args);
+  va_end(args);
+  return tmp;
+}
+
+void ReplaceAll(std::string* str, const std::string& from,
+                const std::string& to) {
+  std::size_t start = 0;
+  while((start = str->find(from, start)) != std::string::npos) {
+    str->replace(start, from.length(), to);
+    start += to.length();
+  }
+}
+
+} // end namespace benchmark
Index: utils/google-benchmark/src/sysinfo.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/sysinfo.h
@@ -0,0 +1,12 @@
+#ifndef BENCHMARK_SYSINFO_H_
+#define BENCHMARK_SYSINFO_H_
+
+namespace benchmark {
+double MyCPUUsage();
+double ChildrenCPUUsage();
+int NumCPUs();
+double CyclesPerSecond();
+bool CpuScalingEnabled();
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_SYSINFO_H_
Index: utils/google-benchmark/src/sysinfo.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/sysinfo.cc
@@ -0,0 +1,420 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "sysinfo.h"
+#include "internal_macros.h"
+
+#ifdef BENCHMARK_OS_WINDOWS
+#include <Shlwapi.h>
+#include <Windows.h>
+#include <VersionHelpers.h>
+#else
+#include <fcntl.h>
+#include <sys/resource.h>
+#include <sys/types.h> // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
+#include <sys/time.h>
+#include <unistd.h>
+#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX
+#include <sys/sysctl.h>
+#endif
+#endif
+
+#include <cerrno>
+#include <cstdio>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <mutex>
+
+#include "arraysize.h"
+#include "check.h"
+#include "cycleclock.h"
+#include "internal_macros.h"
+#include "log.h"
+#include "sleep.h"
+#include "string_util.h"
+
+namespace benchmark {
+namespace {
+std::once_flag cpuinfo_init;
+double cpuinfo_cycles_per_second = 1.0;
+int cpuinfo_num_cpus = 1;  // Conservative guess
+std::mutex cputimens_mutex;
+
+#if !defined BENCHMARK_OS_MACOSX
+const int64_t estimate_time_ms = 1000;
+
+// Helper function estimates cycles/sec by observing cycles elapsed during
+// sleep(). Using small sleep time decreases accuracy significantly.
+int64_t EstimateCyclesPerSecond() {
+  const int64_t start_ticks = cycleclock::Now();
+  SleepForMilliseconds(estimate_time_ms);
+  return cycleclock::Now() - start_ticks;
+}
+#endif
+
+#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
+// Helper function for reading an int from a file. Returns true if successful
+// and the memory location pointed to by value is set to the value read.
+bool ReadIntFromFile(const char* file, long* value) {
+  bool ret = false;
+  int fd = open(file, O_RDONLY);
+  if (fd != -1) {
+    char line[1024];
+    char* err;
+    memset(line, '\0', sizeof(line));
+    CHECK(read(fd, line, sizeof(line) - 1));
+    const long temp_value = strtol(line, &err, 10);
+    if (line[0] != '\0' && (*err == '\n' || *err == '\0')) {
+      *value = temp_value;
+      ret = true;
+    }
+    close(fd);
+  }
+  return ret;
+}
+#endif
+
+void InitializeSystemInfo() {
+#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
+  char line[1024];
+  char* err;
+  long freq;
+
+  bool saw_mhz = false;
+
+  // If the kernel is exporting the tsc frequency use that. There are issues
+  // where cpuinfo_max_freq cannot be relied on because the BIOS may be
+  // exporintg an invalid p-state (on x86) or p-states may be used to put the
+  // processor in a new mode (turbo mode). Essentially, those frequencies
+  // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as
+  // well.
+  if (!saw_mhz &&
+      ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)) {
+    // The value is in kHz (as the file name suggests).  For example, on a
+    // 2GHz warpstation, the file contains the value "2000000".
+    cpuinfo_cycles_per_second = freq * 1000.0;
+    saw_mhz = true;
+  }
+
+  // If CPU scaling is in effect, we want to use the *maximum* frequency,
+  // not whatever CPU speed some random processor happens to be using now.
+  if (!saw_mhz &&
+      ReadIntFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
+                      &freq)) {
+    // The value is in kHz.  For example, on a 2GHz warpstation, the file
+    // contains the value "2000000".
+    cpuinfo_cycles_per_second = freq * 1000.0;
+    saw_mhz = true;
+  }
+
+  // Read /proc/cpuinfo for other values, and if there is no cpuinfo_max_freq.
+  const char* pname = "/proc/cpuinfo";
+  int fd = open(pname, O_RDONLY);
+  if (fd == -1) {
+    perror(pname);
+    if (!saw_mhz) {
+      cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
+    }
+    return;
+  }
+
+  double bogo_clock = 1.0;
+  bool saw_bogo = false;
+  long max_cpu_id = 0;
+  int num_cpus = 0;
+  line[0] = line[1] = '\0';
+  size_t chars_read = 0;
+  do {  // we'll exit when the last read didn't read anything
+    // Move the next line to the beginning of the buffer
+    const size_t oldlinelen = strlen(line);
+    if (sizeof(line) == oldlinelen + 1)  // oldlinelen took up entire line
+      line[0] = '\0';
+    else  // still other lines left to save
+      memmove(line, line + oldlinelen + 1, sizeof(line) - (oldlinelen + 1));
+    // Terminate the new line, reading more if we can't find the newline
+    char* newline = strchr(line, '\n');
+    if (newline == nullptr) {
+      const size_t linelen = strlen(line);
+      const size_t bytes_to_read = sizeof(line) - 1 - linelen;
+      CHECK(bytes_to_read > 0);  // because the memmove recovered >=1 bytes
+      chars_read = read(fd, line + linelen, bytes_to_read);
+      line[linelen + chars_read] = '\0';
+      newline = strchr(line, '\n');
+    }
+    if (newline != nullptr) *newline = '\0';
+
+    // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only
+    // accept postive values. Some environments (virtual machines) report zero,
+    // which would cause infinite looping in WallTime_Init.
+    if (!saw_mhz && strncasecmp(line, "cpu MHz", sizeof("cpu MHz") - 1) == 0) {
+      const char* freqstr = strchr(line, ':');
+      if (freqstr) {
+        cpuinfo_cycles_per_second = strtod(freqstr + 1, &err) * 1000000.0;
+        if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0)
+          saw_mhz = true;
+      }
+    } else if (strncasecmp(line, "bogomips", sizeof("bogomips") - 1) == 0) {
+      const char* freqstr = strchr(line, ':');
+      if (freqstr) {
+        bogo_clock = strtod(freqstr + 1, &err) * 1000000.0;
+        if (freqstr[1] != '\0' && *err == '\0' && bogo_clock > 0)
+          saw_bogo = true;
+      }
+    } else if (strncmp(line, "processor", sizeof("processor") - 1) == 0) {
+      // The above comparison is case-sensitive because ARM kernels often
+      // include a "Processor" line that tells you about the CPU, distinct
+      // from the usual "processor" lines that give you CPU ids. No current
+      // Linux architecture is using "Processor" for CPU ids.
+      num_cpus++;  // count up every time we see an "processor :" entry
+      const char* id_str = strchr(line, ':');
+      if (id_str) {
+        const long cpu_id = strtol(id_str + 1, &err, 10);
+        if (id_str[1] != '\0' && *err == '\0' && max_cpu_id < cpu_id)
+          max_cpu_id = cpu_id;
+      }
+    }
+  } while (chars_read > 0);
+  close(fd);
+
+  if (!saw_mhz) {
+    if (saw_bogo) {
+      // If we didn't find anything better, we'll use bogomips, but
+      // we're not happy about it.
+      cpuinfo_cycles_per_second = bogo_clock;
+    } else {
+      // If we don't even have bogomips, we'll use the slow estimation.
+      cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
+    }
+  }
+  if (num_cpus == 0) {
+    fprintf(stderr, "Failed to read num. CPUs correctly from /proc/cpuinfo\n");
+  } else {
+    if ((max_cpu_id + 1) != num_cpus) {
+      fprintf(stderr,
+              "CPU ID assignments in /proc/cpuinfo seem messed up."
+              " This is usually caused by a bad BIOS.\n");
+    }
+    cpuinfo_num_cpus = num_cpus;
+  }
+
+#elif defined BENCHMARK_OS_FREEBSD
+// For this sysctl to work, the machine must be configured without
+// SMP, APIC, or APM support.  hz should be 64-bit in freebsd 7.0
+// and later.  Before that, it's a 32-bit quantity (and gives the
+// wrong answer on machines faster than 2^32 Hz).  See
+//  http://lists.freebsd.org/pipermail/freebsd-i386/2004-November/001846.html
+// But also compare FreeBSD 7.0:
+//  http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG70#L223
+//  231         error = sysctl_handle_quad(oidp, &freq, 0, req);
+// To FreeBSD 6.3 (it's the same in 6-STABLE):
+//  http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG6#L131
+//  139         error = sysctl_handle_int(oidp, &freq, sizeof(freq), req);
+#if __FreeBSD__ >= 7
+  uint64_t hz = 0;
+#else
+  unsigned int hz = 0;
+#endif
+  size_t sz = sizeof(hz);
+  const char* sysctl_path = "machdep.tsc_freq";
+  if (sysctlbyname(sysctl_path, &hz, &sz, nullptr, 0) != 0) {
+    fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n",
+            sysctl_path, strerror(errno));
+    cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
+  } else {
+    cpuinfo_cycles_per_second = hz;
+  }
+// TODO: also figure out cpuinfo_num_cpus
+
+#elif defined BENCHMARK_OS_WINDOWS
+  // In NT, read MHz from the registry. If we fail to do so or we're in win9x
+  // then make a crude estimate.
+  DWORD data, data_size = sizeof(data);
+  if (IsWindowsXPOrGreater() &&
+      SUCCEEDED(
+          SHGetValueA(HKEY_LOCAL_MACHINE,
+                      "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
+                      "~MHz", nullptr, &data, &data_size)))
+    cpuinfo_cycles_per_second = static_cast<double>((int64_t)data * (int64_t)(1000 * 1000));  // was mhz
+  else
+    cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
+// TODO: also figure out cpuinfo_num_cpus
+
+#elif defined BENCHMARK_OS_MACOSX
+  // returning "mach time units" per second. the current number of elapsed
+  // mach time units can be found by calling uint64 mach_absolute_time();
+  // while not as precise as actual CPU cycles, it is accurate in the face
+  // of CPU frequency scaling and multi-cpu/core machines.
+  // Our mac users have these types of machines, and accuracy
+  // (i.e. correctness) trumps precision.
+  // See cycleclock.h: CycleClock::Now(), which returns number of mach time
+  // units on Mac OS X.
+  mach_timebase_info_data_t timebase_info;
+  mach_timebase_info(&timebase_info);
+  double mach_time_units_per_nanosecond =
+      static_cast<double>(timebase_info.denom) /
+      static_cast<double>(timebase_info.numer);
+  cpuinfo_cycles_per_second = mach_time_units_per_nanosecond * 1e9;
+
+  int num_cpus = 0;
+  size_t size = sizeof(num_cpus);
+  int numcpus_name[] = {CTL_HW, HW_NCPU};
+  if (::sysctl(numcpus_name, arraysize(numcpus_name), &num_cpus, &size, nullptr, 0) ==
+          0 &&
+      (size == sizeof(num_cpus)))
+    cpuinfo_num_cpus = num_cpus;
+
+#else
+  // Generic cycles per second counter
+  cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
+#endif
+}
+}  // end namespace
+
+// getrusage() based implementation of MyCPUUsage
+static double MyCPUUsageRUsage() {
+#ifndef BENCHMARK_OS_WINDOWS
+  struct rusage ru;
+  if (getrusage(RUSAGE_SELF, &ru) == 0) {
+    return (static_cast<double>(ru.ru_utime.tv_sec) +
+            static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
+            static_cast<double>(ru.ru_stime.tv_sec) +
+            static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
+  } else {
+    return 0.0;
+  }
+#else
+  HANDLE proc = GetCurrentProcess();
+  FILETIME creation_time;
+  FILETIME exit_time;
+  FILETIME kernel_time;
+  FILETIME user_time;
+  ULARGE_INTEGER kernel;
+  ULARGE_INTEGER user;
+  GetProcessTimes(proc, &creation_time, &exit_time, &kernel_time, &user_time);
+  kernel.HighPart = kernel_time.dwHighDateTime;
+  kernel.LowPart = kernel_time.dwLowDateTime;
+  user.HighPart = user_time.dwHighDateTime;
+  user.LowPart = user_time.dwLowDateTime;
+  return (static_cast<double>(kernel.QuadPart) +
+          static_cast<double>(user.QuadPart)) * 1e-7;
+#endif  // OS_WINDOWS
+}
+
+#ifndef BENCHMARK_OS_WINDOWS
+static bool MyCPUUsageCPUTimeNsLocked(double* cputime) {
+  static int cputime_fd = -1;
+  if (cputime_fd == -1) {
+    cputime_fd = open("/proc/self/cputime_ns", O_RDONLY);
+    if (cputime_fd < 0) {
+      cputime_fd = -1;
+      return false;
+    }
+  }
+  char buff[64];
+  memset(buff, 0, sizeof(buff));
+  if (pread(cputime_fd, buff, sizeof(buff) - 1, 0) <= 0) {
+    close(cputime_fd);
+    cputime_fd = -1;
+    return false;
+  }
+  unsigned long long result = strtoull(buff, nullptr, 0);
+  if (result == (std::numeric_limits<unsigned long long>::max)()) {
+    close(cputime_fd);
+    cputime_fd = -1;
+    return false;
+  }
+  *cputime = static_cast<double>(result) / 1e9;
+  return true;
+}
+#endif  // OS_WINDOWS
+
+double MyCPUUsage() {
+#ifndef BENCHMARK_OS_WINDOWS
+  {
+    std::lock_guard<std::mutex> l(cputimens_mutex);
+    static bool use_cputime_ns = true;
+    if (use_cputime_ns) {
+      double value;
+      if (MyCPUUsageCPUTimeNsLocked(&value)) {
+        return value;
+      }
+      // Once MyCPUUsageCPUTimeNsLocked fails once fall back to getrusage().
+      VLOG(1) << "Reading /proc/self/cputime_ns failed. Using getrusage().\n";
+      use_cputime_ns = false;
+    }
+  }
+#endif  // OS_WINDOWS
+  return MyCPUUsageRUsage();
+}
+
+double ChildrenCPUUsage() {
+#ifndef BENCHMARK_OS_WINDOWS
+  struct rusage ru;
+  if (getrusage(RUSAGE_CHILDREN, &ru) == 0) {
+    return (static_cast<double>(ru.ru_utime.tv_sec) +
+            static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
+            static_cast<double>(ru.ru_stime.tv_sec) +
+            static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
+  } else {
+    return 0.0;
+  }
+#else
+  // TODO: Not sure what this even means on Windows
+  return 0.0;
+#endif  // OS_WINDOWS
+}
+
+double CyclesPerSecond(void) {
+  std::call_once(cpuinfo_init, InitializeSystemInfo);
+  return cpuinfo_cycles_per_second;
+}
+
+int NumCPUs(void) {
+  std::call_once(cpuinfo_init, InitializeSystemInfo);
+  return cpuinfo_num_cpus;
+}
+
+// The ""'s catch people who don't pass in a literal for "str"
+#define strliterallen(str) (sizeof("" str "") - 1)
+
+// Must use a string literal for prefix.
+#define memprefix(str, len, prefix)                       \
+  ((((len) >= strliterallen(prefix)) &&                   \
+    std::memcmp(str, prefix, strliterallen(prefix)) == 0) \
+       ? str + strliterallen(prefix)                      \
+       : nullptr)
+
+bool CpuScalingEnabled() {
+#ifndef BENCHMARK_OS_WINDOWS
+  // On Linux, the CPUfreq subsystem exposes CPU information as files on the
+  // local file system. If reading the exported files fails, then we may not be
+  // running on Linux, so we silently ignore all the read errors.
+  for (int cpu = 0, num_cpus = NumCPUs(); cpu < num_cpus; ++cpu) {
+    std::string governor_file = StrCat("/sys/devices/system/cpu/cpu", cpu,
+                                       "/cpufreq/scaling_governor");
+    FILE* file = fopen(governor_file.c_str(), "r");
+    if (!file) break;
+    char buff[16];
+    size_t bytes_read = fread(buff, 1, sizeof(buff), file);
+    fclose(file);
+    if (memprefix(buff, bytes_read, "performance") == nullptr) return true;
+  }
+#endif
+  return false;
+}
+
+}  // end namespace benchmark
Index: utils/google-benchmark/src/walltime.h
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/walltime.h
@@ -0,0 +1,17 @@
+#ifndef BENCHMARK_WALLTIME_H_
+#define BENCHMARK_WALLTIME_H_
+
+#include <string>
+
+namespace benchmark {
+typedef double WallTime;
+
+namespace walltime {
+WallTime Now();
+}  // end namespace walltime
+
+std::string LocalDateTimeString();
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_WALLTIME_H_
Index: utils/google-benchmark/src/walltime.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/src/walltime.cc
@@ -0,0 +1,263 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/macros.h"
+#include "internal_macros.h"
+#include "walltime.h"
+
+#if defined(BENCHMARK_OS_WINDOWS)
+#include <time.h>
+#include <winsock.h> // for timeval
+#else
+#include <sys/time.h>
+#endif
+
+#include <cstdio>
+#include <cstdint>
+#include <cstring>
+#include <ctime>
+
+#include <atomic>
+#include <chrono>
+#include <limits>
+
+#include "arraysize.h"
+#include "check.h"
+#include "cycleclock.h"
+#include "log.h"
+#include "sysinfo.h"
+
+namespace benchmark {
+namespace walltime {
+
+namespace {
+
+#if defined(HAVE_STEADY_CLOCK)
+template <bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
+struct ChooseSteadyClock {
+    typedef std::chrono::high_resolution_clock type;
+};
+
+template <>
+struct ChooseSteadyClock<false> {
+    typedef std::chrono::steady_clock type;
+};
+#endif
+
+struct ChooseClockType {
+#if defined(HAVE_STEADY_CLOCK)
+  typedef ChooseSteadyClock<>::type type;
+#else
+  typedef std::chrono::high_resolution_clock type;
+#endif
+};
+
+class WallTimeImp
+{
+public:
+  WallTime Now();
+
+  static WallTimeImp& GetWallTimeImp() {
+    static WallTimeImp* imp = new WallTimeImp();
+    return *imp;
+  }
+
+private:
+  WallTimeImp();
+  // Helper routines to load/store a float from an AtomicWord. Required because
+  // g++ < 4.7 doesn't support std::atomic<float> correctly. I cannot wait to
+  // get rid of this horror show.
+  void SetDrift(float f) {
+    int32_t w;
+    memcpy(&w, &f, sizeof(f));
+    std::atomic_store(&drift_adjust_, w);
+  }
+
+  float GetDrift() const {
+    float f;
+    int32_t w = std::atomic_load(&drift_adjust_);
+    memcpy(&f, &w, sizeof(f));
+    return f;
+  }
+
+  WallTime Slow() const {
+    struct timeval tv;
+#if defined(BENCHMARK_OS_WINDOWS)
+    FILETIME    file_time;
+    SYSTEMTIME  system_time;
+    ULARGE_INTEGER ularge;
+    const unsigned __int64 epoch = 116444736000000000LL;
+
+    GetSystemTime(&system_time);
+    SystemTimeToFileTime(&system_time, &file_time);
+    ularge.LowPart = file_time.dwLowDateTime;
+    ularge.HighPart = file_time.dwHighDateTime;
+
+    tv.tv_sec = (long)((ularge.QuadPart - epoch) / (10L * 1000 * 1000));
+    tv.tv_usec = (long)(system_time.wMilliseconds * 1000);
+#else
+    gettimeofday(&tv, nullptr);
+#endif
+    return tv.tv_sec + tv.tv_usec * 1e-6;
+  }
+
+private:
+  static_assert(sizeof(float) <= sizeof(int32_t),
+               "type sizes don't allow the drift_adjust hack");
+
+  WallTime base_walltime_;
+  int64_t base_cycletime_;
+  int64_t cycles_per_second_;
+  double seconds_per_cycle_;
+  uint32_t last_adjust_time_;
+  std::atomic<int32_t> drift_adjust_;
+  int64_t max_interval_cycles_;
+
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(WallTimeImp);
+};
+
+
+WallTime WallTimeImp::Now() {
+  WallTime now = 0.0;
+  WallTime result = 0.0;
+  int64_t ct = 0;
+  uint32_t top_bits = 0;
+  do {
+    ct = cycleclock::Now();
+    int64_t cycle_delta = ct - base_cycletime_;
+    result = base_walltime_ + cycle_delta * seconds_per_cycle_;
+
+    top_bits = static_cast<uint32_t>(uint64_t(ct) >> 32);
+    // Recompute drift no more often than every 2^32 cycles.
+    // I.e., @2GHz, ~ every two seconds
+    if (top_bits == last_adjust_time_) {  // don't need to recompute drift
+      return result + GetDrift();
+    }
+
+    now = Slow();
+  } while (cycleclock::Now() - ct > max_interval_cycles_);
+  // We are now sure that "now" and "result" were produced within
+  // kMaxErrorInterval of one another.
+
+  SetDrift(static_cast<float>(now - result));
+  last_adjust_time_ = top_bits;
+  return now;
+}
+
+
+WallTimeImp::WallTimeImp()
+    : base_walltime_(0.0), base_cycletime_(0),
+      cycles_per_second_(0), seconds_per_cycle_(0.0),
+      last_adjust_time_(0), drift_adjust_(0),
+      max_interval_cycles_(0) {
+  const double kMaxErrorInterval = 100e-6;
+  cycles_per_second_ = static_cast<int64_t>(CyclesPerSecond());
+  CHECK(cycles_per_second_ != 0);
+  seconds_per_cycle_ = 1.0 / cycles_per_second_;
+  max_interval_cycles_ =
+      static_cast<int64_t>(cycles_per_second_ * kMaxErrorInterval);
+  do {
+    base_cycletime_ = cycleclock::Now();
+    base_walltime_ = Slow();
+  } while (cycleclock::Now() - base_cycletime_ > max_interval_cycles_);
+  // We are now sure that "base_walltime" and "base_cycletime" were produced
+  // within kMaxErrorInterval of one another.
+
+  SetDrift(0.0);
+  last_adjust_time_ = static_cast<uint32_t>(uint64_t(base_cycletime_) >> 32);
+}
+
+WallTime CPUWalltimeNow() {
+  static WallTimeImp& imp = WallTimeImp::GetWallTimeImp();
+  return imp.Now();
+}
+
+WallTime ChronoWalltimeNow() {
+  typedef ChooseClockType::type Clock;
+  typedef std::chrono::duration<WallTime, std::chrono::seconds::period>
+          FPSeconds;
+  static_assert(std::chrono::treat_as_floating_point<WallTime>::value,
+                "This type must be treated as a floating point type.");
+  auto now = Clock::now().time_since_epoch();
+  return std::chrono::duration_cast<FPSeconds>(now).count();
+}
+
+bool UseCpuCycleClock() {
+    bool useWallTime = !CpuScalingEnabled();
+    if (useWallTime) {
+        VLOG(1) << "Using the CPU cycle clock to provide walltime::Now().\n";
+    } else {
+        VLOG(1) << "Using std::chrono to provide walltime::Now().\n";
+    }
+    return useWallTime;
+}
+
+
+} // end anonymous namespace
+
+// WallTimeImp doesn't work when CPU Scaling is enabled. If CPU Scaling is
+// enabled at the start of the program then std::chrono::system_clock is used
+// instead.
+WallTime Now()
+{
+  static bool useCPUClock = UseCpuCycleClock();
+  if (useCPUClock) {
+    return CPUWalltimeNow();
+  } else {
+    return ChronoWalltimeNow();
+  }
+}
+
+}  // end namespace walltime
+
+
+namespace {
+
+std::string DateTimeString(bool local) {
+  typedef std::chrono::system_clock Clock;
+  std::time_t now = Clock::to_time_t(Clock::now());
+  char storage[128];
+  std::size_t written;
+
+  if (local) {
+#if defined(BENCHMARK_OS_WINDOWS)
+    written = std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now));
+#else
+    std::tm timeinfo;
+    std::memset(&timeinfo, 0, sizeof(std::tm));
+    ::localtime_r(&now, &timeinfo);
+    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
+#endif
+  } else {
+#if defined(BENCHMARK_OS_WINDOWS)
+    written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now));
+#else
+    std::tm timeinfo;
+    std::memset(&timeinfo, 0, sizeof(std::tm));
+    ::gmtime_r(&now, &timeinfo);
+    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
+#endif
+  }
+  CHECK(written < arraysize(storage));
+  ((void)written); // prevent unused variable in optimized mode.
+  return std::string(storage);
+}
+
+} // end namespace
+
+std::string LocalDateTimeString() {
+  return DateTimeString(true);
+}
+
+}  // end namespace benchmark
Index: utils/google-benchmark/test/CMakeLists.txt
===================================================================
--- /dev/null
+++ utils/google-benchmark/test/CMakeLists.txt
@@ -0,0 +1,111 @@
+# Enable the tests
+
+find_package(Threads REQUIRED)
+
+macro(compile_benchmark_test name)
+  add_executable(${name} "${name}.cc")
+  target_link_libraries(${name} benchmark ${CMAKE_THREAD_LIBS_INIT})
+endmacro(compile_benchmark_test)
+
+# Demonstration executable
+compile_benchmark_test(benchmark_test)
+add_test(benchmark benchmark_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(filter_test)
+macro(add_filter_test name filter expect)
+  add_test(${name} filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect})
+  add_test(${name}_list_only filter_test --benchmark_list_tests --benchmark_filter=${filter} ${expect})
+endmacro(add_filter_test)
+
+add_filter_test(filter_simple "Foo" 3)
+add_filter_test(filter_suffix "BM_.*" 4)
+add_filter_test(filter_regex_all ".*" 5)
+add_filter_test(filter_regex_blank "" 5)
+add_filter_test(filter_regex_none "monkey" 0)
+add_filter_test(filter_regex_wildcard ".*Foo.*" 3)
+add_filter_test(filter_regex_begin "^BM_.*" 4)
+add_filter_test(filter_regex_begin2 "^N" 1)
+add_filter_test(filter_regex_end ".*Ba$" 1)
+
+compile_benchmark_test(options_test)
+add_test(options_benchmarks options_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(basic_test)
+add_test(basic_benchmark basic_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(diagnostics_test)
+add_test(diagnostics_test diagnostics_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(skip_with_error_test)
+add_test(skip_with_error_test skip_with_error_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(donotoptimize_test)
+add_test(donotoptimize_test donotoptimize_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(fixture_test)
+add_test(fixture_test fixture_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(map_test)
+add_test(map_test map_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(reporter_output_test)
+add_test(reporter_output_test reporter_output_test --benchmark_min_time=0.01)
+
+check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG)
+if (BENCHMARK_HAS_CXX03_FLAG)
+  set(CXX03_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-std=c++11" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}")
+  string(REPLACE "-std=c++0x" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}")
+
+  compile_benchmark_test(cxx03_test)
+  set_target_properties(cxx03_test
+      PROPERTIES COMPILE_FLAGS "${CXX03_FLAGS}")
+  add_test(cxx03 cxx03_test --benchmark_min_time=0.01)
+endif()
+
+compile_benchmark_test(complexity_test)
+add_test(complexity_benchmark complexity_test --benchmark_min_time=0.01)
+
+# Add the coverage command(s)
+if(CMAKE_BUILD_TYPE)
+  string(TOLOWER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_LOWER)
+endif()
+if (${CMAKE_BUILD_TYPE_LOWER} MATCHES "coverage")
+  find_program(GCOV gcov)
+  find_program(LCOV lcov)
+  find_program(GENHTML genhtml)
+  find_program(CTEST ctest)
+  if (GCOV AND LCOV AND GENHTML AND CTEST AND HAVE_CXX_FLAG_COVERAGE)
+    add_custom_command(
+      OUTPUT ${CMAKE_BINARY_DIR}/lcov/index.html
+      COMMAND ${LCOV} -q -z -d .
+      COMMAND ${LCOV} -q --no-external -c -b "${CMAKE_SOURCE_DIR}" -d . -o before.lcov -i
+      COMMAND ${CTEST} --force-new-ctest-process
+      COMMAND ${LCOV} -q --no-external -c -b "${CMAKE_SOURCE_DIR}" -d . -o after.lcov
+      COMMAND ${LCOV} -q -a before.lcov -a after.lcov --output-file final.lcov
+      COMMAND ${LCOV} -q -r final.lcov "'${CMAKE_SOURCE_DIR}/test/*'" -o final.lcov
+      COMMAND ${GENHTML} final.lcov -o lcov --demangle-cpp --sort -p "${CMAKE_BINARY_DIR}" -t benchmark
+      DEPENDS filter_test benchmark_test options_test basic_test fixture_test cxx03_test complexity_test
+      WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+      COMMENT "Running LCOV"
+    )
+    add_custom_target(coverage
+      DEPENDS ${CMAKE_BINARY_DIR}/lcov/index.html
+      COMMENT "LCOV report at lcov/index.html"
+    )
+    message(STATUS "Coverage command added")
+  else()
+    if (HAVE_CXX_FLAG_COVERAGE)
+      set(CXX_FLAG_COVERAGE_MESSAGE supported)
+    else()
+      set(CXX_FLAG_COVERAGE_MESSAGE unavailable)
+    endif()
+    message(WARNING
+      "Coverage not available:\n"
+      "  gcov: ${GCOV}\n"
+      "  lcov: ${LCOV}\n"
+      "  genhtml: ${GENHTML}\n"
+      "  ctest: ${CTEST}\n"
+      "  --coverage flag: ${CXX_FLAG_COVERAGE_MESSAGE}")
+  endif()
+endif()
Index: utils/google-benchmark/test/basic_test.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/test/basic_test.cc
@@ -0,0 +1,102 @@
+
+#include "benchmark/benchmark_api.h"
+
+#define BASIC_BENCHMARK_TEST(x) \
+    BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)
+
+void BM_empty(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+}
+BENCHMARK(BM_empty);
+BENCHMARK(BM_empty)->ThreadPerCpu();
+
+void BM_spin_empty(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    for (int x = 0; x < state.range_x(); ++x) {
+      benchmark::DoNotOptimize(x);
+    }
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_empty);
+BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu();
+
+void BM_spin_pause_before(benchmark::State& state) {
+  for (int i = 0; i < state.range_x(); ++i) {
+    benchmark::DoNotOptimize(i);
+  }
+  while(state.KeepRunning()) {
+    for (int i = 0; i < state.range_x(); ++i) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_before);
+BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu();
+
+
+void BM_spin_pause_during(benchmark::State& state) {
+  while(state.KeepRunning()) {
+    state.PauseTiming();
+    for (int i = 0; i < state.range_x(); ++i) {
+      benchmark::DoNotOptimize(i);
+    }
+    state.ResumeTiming();
+    for (int i = 0; i < state.range_x(); ++i) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_during);
+BASIC_BENCHMARK_TEST(BM_spin_pause_during)->ThreadPerCpu();
+
+void BM_pause_during(benchmark::State& state) {
+  while(state.KeepRunning()) {
+    state.PauseTiming();
+    state.ResumeTiming();
+  }
+}
+BENCHMARK(BM_pause_during);
+BENCHMARK(BM_pause_during)->ThreadPerCpu();
+BENCHMARK(BM_pause_during)->UseRealTime();
+BENCHMARK(BM_pause_during)->UseRealTime()->ThreadPerCpu();
+
+void BM_spin_pause_after(benchmark::State& state) {
+  while(state.KeepRunning()) {
+    for (int i = 0; i < state.range_x(); ++i) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+  for (int i = 0; i < state.range_x(); ++i) {
+    benchmark::DoNotOptimize(i);
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_after);
+BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu();
+
+
+void BM_spin_pause_before_and_after(benchmark::State& state) {
+  for (int i = 0; i < state.range_x(); ++i) {
+    benchmark::DoNotOptimize(i);
+  }
+  while(state.KeepRunning()) {
+    for (int i = 0; i < state.range_x(); ++i) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+  for (int i = 0; i < state.range_x(); ++i) {
+    benchmark::DoNotOptimize(i);
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after);
+BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after)->ThreadPerCpu();
+
+
+void BM_empty_stop_start(benchmark::State& state) {
+  while (state.KeepRunning()) { }
+}
+BENCHMARK(BM_empty_stop_start);
+BENCHMARK(BM_empty_stop_start)->ThreadPerCpu();
+
+BENCHMARK_MAIN()
Index: utils/google-benchmark/test/benchmark_test.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/test/benchmark_test.cc
@@ -0,0 +1,224 @@
+#include "benchmark/benchmark.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <list>
+#include <map>
+#include <mutex>
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <chrono>
+#include <thread>
+#include <utility>
+
+#if defined(__GNUC__)
+# define BENCHMARK_NOINLINE __attribute__((noinline))
+#else
+# define BENCHMARK_NOINLINE
+#endif
+
+namespace {
+
+int BENCHMARK_NOINLINE Factorial(uint32_t n) {
+  return (n == 1) ? 1 : n * Factorial(n - 1);
+}
+
+double CalculatePi(int depth) {
+  double pi = 0.0;
+  for (int i = 0; i < depth; ++i) {
+    double numerator = static_cast<double>(((i % 2) * 2) - 1);
+    double denominator = static_cast<double>((2 * i) - 1);
+    pi += numerator / denominator;
+  }
+  return (pi - 1.0) * 4;
+}
+
+std::set<int> ConstructRandomSet(int size) {
+  std::set<int> s;
+  for (int i = 0; i < size; ++i)
+    s.insert(i);
+  return s;
+}
+
+std::mutex test_vector_mu;
+std::vector<int>* test_vector = nullptr;
+
+}  // end namespace
+
+static void BM_Factorial(benchmark::State& state) {
+  int fac_42 = 0;
+  while (state.KeepRunning())
+    fac_42 = Factorial(8);
+  // Prevent compiler optimizations
+  std::stringstream ss;
+  ss << fac_42;
+  state.SetLabel(ss.str());
+}
+BENCHMARK(BM_Factorial);
+BENCHMARK(BM_Factorial)->UseRealTime();
+
+static void BM_CalculatePiRange(benchmark::State& state) {
+  double pi = 0.0;
+  while (state.KeepRunning())
+    pi = CalculatePi(state.range_x());
+  std::stringstream ss;
+  ss << pi;
+  state.SetLabel(ss.str());
+}
+BENCHMARK_RANGE(BM_CalculatePiRange, 1, 1024 * 1024);
+
+static void BM_CalculatePi(benchmark::State& state) {
+  static const int depth = 1024;
+  while (state.KeepRunning()) {
+    benchmark::DoNotOptimize(CalculatePi(depth));
+  }
+}
+BENCHMARK(BM_CalculatePi)->Threads(8);
+BENCHMARK(BM_CalculatePi)->ThreadRange(1, 32);
+BENCHMARK(BM_CalculatePi)->ThreadPerCpu();
+
+static void BM_SetInsert(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    state.PauseTiming();
+    std::set<int> data = ConstructRandomSet(state.range_x());
+    state.ResumeTiming();
+    for (int j = 0; j < state.range_y(); ++j)
+      data.insert(rand());
+  }
+  state.SetItemsProcessed(state.iterations() * state.range_y());
+  state.SetBytesProcessed(state.iterations() * state.range_y() * sizeof(int));
+}
+BENCHMARK(BM_SetInsert)->RangePair(1<<10,8<<10, 1,10);
+
+template<typename Container, typename ValueType = typename Container::value_type>
+static void BM_Sequential(benchmark::State& state) {
+  ValueType v = 42;
+  while (state.KeepRunning()) {
+    Container c;
+    for (int i = state.range_x(); --i; )
+      c.push_back(v);
+  }
+  const size_t items_processed = state.iterations() * state.range_x();
+  state.SetItemsProcessed(items_processed);
+  state.SetBytesProcessed(items_processed * sizeof(v));
+}
+BENCHMARK_TEMPLATE2(BM_Sequential, std::vector<int>, int)->Range(1 << 0, 1 << 10);
+BENCHMARK_TEMPLATE(BM_Sequential, std::list<int>)->Range(1 << 0, 1 << 10);
+// Test the variadic version of BENCHMARK_TEMPLATE in C++11 and beyond.
+#if __cplusplus >= 201103L
+BENCHMARK_TEMPLATE(BM_Sequential, std::vector<int>, int)->Arg(512);
+#endif
+
+static void BM_StringCompare(benchmark::State& state) {
+  std::string s1(state.range_x(), '-');
+  std::string s2(state.range_x(), '-');
+  while (state.KeepRunning())
+    benchmark::DoNotOptimize(s1.compare(s2));
+}
+BENCHMARK(BM_StringCompare)->Range(1, 1<<20);
+
+static void BM_SetupTeardown(benchmark::State& state) {
+  if (state.thread_index == 0) {
+    // No need to lock test_vector_mu here as this is running single-threaded.
+    test_vector = new std::vector<int>();
+  }
+  int i = 0;
+  while (state.KeepRunning()) {
+    std::lock_guard<std::mutex> l(test_vector_mu);
+    if (i%2 == 0)
+      test_vector->push_back(i);
+    else
+      test_vector->pop_back();
+    ++i;
+  }
+  if (state.thread_index == 0) {
+    delete test_vector;
+  }
+}
+BENCHMARK(BM_SetupTeardown)->ThreadPerCpu();
+
+static void BM_LongTest(benchmark::State& state) {
+  double tracker = 0.0;
+  while (state.KeepRunning()) {
+    for (int i = 0; i < state.range_x(); ++i)
+      benchmark::DoNotOptimize(tracker += i);
+  }
+}
+BENCHMARK(BM_LongTest)->Range(1<<16,1<<28);
+
+static void BM_ParallelMemset(benchmark::State& state) {
+  int size = state.range_x() / sizeof(int);
+  int thread_size = size / state.threads;
+  int from = thread_size * state.thread_index;
+  int to = from + thread_size;
+
+  if (state.thread_index == 0) {
+    test_vector = new std::vector<int>(size);
+  }
+
+  while (state.KeepRunning()) {
+    for (int i = from; i < to; i++) {
+      // No need to lock test_vector_mu as ranges
+      // do not overlap between threads.
+      benchmark::DoNotOptimize(test_vector->at(i) = 1);
+    }
+  }
+
+  if (state.thread_index == 0) {
+    delete test_vector;
+  }
+}
+BENCHMARK(BM_ParallelMemset)->Arg(10 << 20)->ThreadRange(1, 4);
+
+static void BM_ManualTiming(benchmark::State& state) {
+  size_t slept_for = 0;
+  int microseconds = state.range_x();
+  std::chrono::duration<double, std::micro> sleep_duration {
+    static_cast<double>(microseconds)
+  };
+
+  while (state.KeepRunning()) {
+    auto start   = std::chrono::high_resolution_clock::now();
+    // Simulate some useful workload with a sleep
+    std::this_thread::sleep_for(std::chrono::duration_cast<
+      std::chrono::nanoseconds>(sleep_duration));
+    auto end     = std::chrono::high_resolution_clock::now();
+
+    auto elapsed =
+      std::chrono::duration_cast<std::chrono::duration<double>>(
+        end - start);
+
+    state.SetIterationTime(elapsed.count());
+    slept_for += microseconds;
+  }
+  state.SetItemsProcessed(slept_for);
+}
+BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseRealTime();
+BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseManualTime();
+
+#if __cplusplus >= 201103L
+
+template <class ...Args>
+void BM_with_args(benchmark::State& state, Args&&...) {
+  while (state.KeepRunning()) {}
+}
+BENCHMARK_CAPTURE(BM_with_args, int_test, 42, 43, 44);
+BENCHMARK_CAPTURE(BM_with_args, string_and_pair_test,
+                  std::string("abc"), std::pair<int, double>(42, 3.8));
+
+void BM_non_template_args(benchmark::State& state, int, double) {
+  while(state.KeepRunning()) {}
+}
+BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);
+
+#endif // __cplusplus >= 201103L
+
+BENCHMARK_MAIN()
+
Index: utils/google-benchmark/test/complexity_test.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/test/complexity_test.cc
@@ -0,0 +1,297 @@
+
+#undef NDEBUG
+#include "benchmark/benchmark.h"
+#include "../src/check.h" // NOTE: check.h is for internal use only!
+#include "../src/re.h"    // NOTE: re.h is for internal use only
+#include <cassert>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <utility>
+#include <algorithm>
+#include <cmath>
+
+namespace {
+
+// ========================================================================= //
+// -------------------------- Testing Case --------------------------------- //
+// ========================================================================= //
+
+enum MatchRules {
+  MR_Default, // Skip non-matching lines until a match is found.
+  MR_Next    // Match must occur on the next line.
+};
+
+struct TestCase {
+  std::string regex;
+  int match_rule;
+
+  TestCase(std::string re, int rule = MR_Default) : regex(re), match_rule(rule) {}
+
+  void Check(std::stringstream& remaining_output) const {
+    benchmark::Regex r;
+    std::string err_str;
+    r.Init(regex, &err_str);
+    CHECK(err_str.empty()) << "Could not construct regex \"" << regex << "\""
+                           << " got Error: " << err_str;
+
+    std::string line;
+    while (remaining_output.eof() == false) {
+        CHECK(remaining_output.good());
+        std::getline(remaining_output, line);
+        if (r.Match(line)) return;
+        CHECK(match_rule != MR_Next) << "Expected line \"" << line
+                                     << "\" to match regex \"" << regex << "\"";
+    }
+
+    CHECK(remaining_output.eof() == false)
+        << "End of output reached before match for regex \"" << regex
+        << "\" was found";
+  }
+};
+
+std::vector<TestCase> ConsoleOutputTests;
+std::vector<TestCase> JSONOutputTests;
+std::vector<TestCase> CSVOutputTests;
+
+// ========================================================================= //
+// -------------------------- Test Helpers --------------------------------- //
+// ========================================================================= //
+
+class TestReporter : public benchmark::BenchmarkReporter {
+public:
+  TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
+      : reporters_(reps)  {}
+
+  virtual bool ReportContext(const Context& context) {
+    bool last_ret = false;
+    bool first = true;
+    for (auto rep : reporters_) {
+      bool new_ret = rep->ReportContext(context);
+      CHECK(first || new_ret == last_ret)
+          << "Reports return different values for ReportContext";
+      first = false;
+      last_ret = new_ret;
+    }
+    return last_ret;
+  }
+
+  virtual void ReportRuns(const std::vector<Run>& report) {
+    for (auto rep : reporters_)
+      rep->ReportRuns(report);
+  }
+
+  virtual void Finalize() {
+      for (auto rep : reporters_)
+        rep->Finalize();
+  }
+
+private:
+  std::vector<benchmark::BenchmarkReporter*> reporters_;
+};
+
+
+#define CONCAT2(x, y) x##y
+#define CONCAT(x, y) CONCAT2(x, y)
+
+#define ADD_CASES(...) \
+    int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
+
+int AddCases(std::vector<TestCase>* out, std::initializer_list<TestCase> const& v) {
+  for (auto const& TC : v)
+    out->push_back(TC);
+  return 0;
+}
+
+template <class First>
+std::string join(First f) { return f; }
+
+template <class First, class ...Args>
+std::string join(First f, Args&&... args) {
+    return std::string(std::move(f)) + "[ ]+" + join(std::forward<Args>(args)...);
+}
+
+std::string dec_re = "[0-9]+\\.[0-9]+";
+
+#define ADD_COMPLEXITY_CASES(...) \
+    int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
+
+int AddComplexityTest(std::vector<TestCase>* console_out, std::vector<TestCase>* json_out,
+                      std::vector<TestCase>* csv_out, std::string big_o_test_name, 
+                      std::string rms_test_name, std::string big_o) {
+  std::string big_o_str = dec_re + " " + big_o;
+  AddCases(console_out, {
+    {join("^" + big_o_test_name + "", big_o_str, big_o_str) + "[ ]*$"},
+    {join("^" + rms_test_name + "", "[0-9]+ %", "[0-9]+ %") + "[ ]*$"}
+  });
+  AddCases(json_out, {
+    {"\"name\": \"" + big_o_test_name + "\",$"},
+    {"\"cpu_coefficient\": [0-9]+,$", MR_Next},
+    {"\"real_coefficient\": [0-9]{1,5},$", MR_Next},
+    {"\"big_o\": \"" + big_o + "\",$", MR_Next},
+    {"\"time_unit\": \"ns\"$", MR_Next},
+    {"}", MR_Next},
+    {"\"name\": \"" + rms_test_name + "\",$"},
+    {"\"rms\": [0-9]+%$", MR_Next},
+    {"}", MR_Next}
+  });
+  AddCases(csv_out, {
+    {"^\"" + big_o_test_name + "\",," + dec_re + "," + dec_re + "," + big_o + ",,,,,$"},
+    {"^\"" + rms_test_name + "\",," + dec_re + "," + dec_re + ",,,,,,$"}
+  });
+  return 0;
+}
+
+}  // end namespace
+
+// ========================================================================= //
+// --------------------------- Testing BigO O(1) --------------------------- //
+// ========================================================================= //
+
+void BM_Complexity_O1(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  state.SetComplexityN(state.range_x());
+}
+BENCHMARK(BM_Complexity_O1) -> Range(1, 1<<18) -> Complexity(benchmark::o1);
+BENCHMARK(BM_Complexity_O1) -> Range(1, 1<<18) -> Complexity([](int){return 1.0; });
+BENCHMARK(BM_Complexity_O1) -> Range(1, 1<<18) -> Complexity();
+
+const char* big_o_1_test_name = "BM_Complexity_O1_BigO";
+const char* rms_o_1_test_name = "BM_Complexity_O1_RMS";
+const char* enum_auto_big_o_1 = "\\([0-9]+\\)";
+const char* lambda_big_o_1 = "f\\(N\\)";
+
+// Add enum tests
+ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
+                     big_o_1_test_name, rms_o_1_test_name, enum_auto_big_o_1);
+
+// Add lambda tests
+ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
+                     big_o_1_test_name, rms_o_1_test_name, lambda_big_o_1);
+
+// ========================================================================= //
+// --------------------------- Testing BigO O(N) --------------------------- //
+// ========================================================================= //
+
+std::vector<int> ConstructRandomVector(int size) {
+  std::vector<int> v;
+  v.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    v.push_back(rand() % size);
+  }
+  return v;
+}
+
+void BM_Complexity_O_N(benchmark::State& state) {
+  auto v = ConstructRandomVector(state.range_x());
+  const int item_not_in_vector = state.range_x()*2; // Test worst case scenario (item not in vector)
+  while (state.KeepRunning()) {
+      benchmark::DoNotOptimize(std::find(v.begin(), v.end(), item_not_in_vector));
+  }
+  state.SetComplexityN(state.range_x());
+}
+BENCHMARK(BM_Complexity_O_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity(benchmark::oN);
+BENCHMARK(BM_Complexity_O_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity([](int n) -> double{return n; });
+BENCHMARK(BM_Complexity_O_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity();
+
+const char* big_o_n_test_name = "BM_Complexity_O_N_BigO";
+const char* rms_o_n_test_name = "BM_Complexity_O_N_RMS";
+const char* enum_auto_big_o_n = "N";
+const char* lambda_big_o_n = "f\\(N\\)";
+
+// Add enum tests
+ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
+                     big_o_n_test_name, rms_o_n_test_name, enum_auto_big_o_n);
+
+// Add lambda tests
+ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
+                     big_o_n_test_name, rms_o_n_test_name, lambda_big_o_n);
+
+// ========================================================================= //
+// ------------------------- Testing BigO O(N*lgN) ------------------------- //
+// ========================================================================= //
+
+static void BM_Complexity_O_N_log_N(benchmark::State& state) {
+  auto v = ConstructRandomVector(state.range_x());
+  while (state.KeepRunning()) {
+      std::sort(v.begin(), v.end());
+  }
+  state.SetComplexityN(state.range_x());
+}
+BENCHMARK(BM_Complexity_O_N_log_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity(benchmark::oNLogN);
+BENCHMARK(BM_Complexity_O_N_log_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity([](int n) {return n * std::log2(n); });
+BENCHMARK(BM_Complexity_O_N_log_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity();
+
+const char* big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_BigO";
+const char* rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_RMS";
+const char* enum_auto_big_o_n_lg_n = "NlgN";
+const char* lambda_big_o_n_lg_n = "f\\(N\\)";
+
+// Add enum tests
+ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
+                     big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n);
+
+// Add lambda tests
+ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
+                     big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n);
+
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+
+int main(int argc, char* argv[]) {
+  // Add --color_print=false to argv since we don't want to match color codes.
+  char new_arg[64];
+  char* new_argv[64];
+  std::copy(argv, argv + argc, new_argv);
+  new_argv[argc++] = std::strcpy(new_arg, "--color_print=false");
+  benchmark::Initialize(&argc, new_argv);
+
+  benchmark::ConsoleReporter CR;
+  benchmark::JSONReporter JR;
+  benchmark::CSVReporter CSVR;
+  struct ReporterTest {
+    const char* name;
+    std::vector<TestCase>& output_cases;
+    benchmark::BenchmarkReporter& reporter;
+    std::stringstream out_stream;
+    std::stringstream err_stream;
+
+    ReporterTest(const char* n,
+                 std::vector<TestCase>& out_tc,
+                 benchmark::BenchmarkReporter& br)
+        : name(n), output_cases(out_tc), reporter(br) {
+        reporter.SetOutputStream(&out_stream);
+        reporter.SetErrorStream(&err_stream);
+    }
+  } TestCases[] = {
+      {"ConsoleReporter", ConsoleOutputTests, CR},
+      {"JSONReporter", JSONOutputTests, JR},
+      {"CSVReporter", CSVOutputTests, CSVR}
+  };
+
+  // Create the test reporter and run the benchmarks.
+  std::cout << "Running benchmarks...\n";
+  TestReporter test_rep({&CR, &JR, &CSVR});
+  benchmark::RunSpecifiedBenchmarks(&test_rep);
+
+  for (auto& rep_test : TestCases) {
+      std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n";
+      std::string banner(msg.size() - 1, '-');
+      std::cout << banner << msg << banner << "\n";
+
+      std::cerr << rep_test.err_stream.str();
+      std::cout << rep_test.out_stream.str();
+
+      for (const auto& TC : rep_test.output_cases)
+        TC.Check(rep_test.out_stream);
+
+      std::cout << "\n";
+  }
+  return 0;
+}
+
Index: utils/google-benchmark/test/cxx03_test.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/test/cxx03_test.cc
@@ -0,0 +1,31 @@
+
+#include <cstddef>
+
+#include "benchmark/benchmark.h"
+
+#if __cplusplus >= 201103L
+#error C++11 or greater detected. Should be C++03.
+#endif
+
+void BM_empty(benchmark::State& state) {
+    while (state.KeepRunning()) {
+        volatile std::size_t x = state.iterations();
+        ((void)x);
+    }
+}
+BENCHMARK(BM_empty);
+
+template <class T, class U>
+void BM_template2(benchmark::State& state) {
+    BM_empty(state);
+}
+BENCHMARK_TEMPLATE2(BM_template2, int, long);
+
+template <class T>
+void BM_template1(benchmark::State& state) {
+    BM_empty(state);
+}
+BENCHMARK_TEMPLATE(BM_template1, long);
+BENCHMARK_TEMPLATE1(BM_template1, int);
+
+BENCHMARK_MAIN()
Index: utils/google-benchmark/test/diagnostics_test.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/test/diagnostics_test.cc
@@ -0,0 +1,61 @@
+// Testing:
+//   State::PauseTiming()
+//   State::ResumeTiming()
+// Test that CHECK's within these function diagnose when they are called
+// outside of the KeepRunning() loop.
+//
+// NOTE: Users should NOT include or use src/check.h. This is only done in
+// order to test library internals.
+
+#include "benchmark/benchmark_api.h"
+#include "../src/check.h"
+#include <stdexcept>
+#include <cstdlib>
+
+#if defined(__GNUC__) && !defined(__EXCEPTIONS)
+#define TEST_HAS_NO_EXCEPTIONS
+#endif
+
+void TestHandler() {
+#ifndef TEST_HAS_NO_EXCEPTIONS
+  throw std::logic_error("");
+#else
+  std::abort();
+#endif
+}
+
+void try_invalid_pause_resume(benchmark::State& state) {
+#if !defined(NDEBUG) && !defined(TEST_HAS_NO_EXCEPTIONS)
+  try {
+    state.PauseTiming();
+    std::abort();
+  } catch (std::logic_error const&) {}
+  try {
+    state.ResumeTiming();
+    std::abort();
+  } catch (std::logic_error const&) {}
+#else
+  (void)state; // avoid unused warning
+#endif
+}
+
+void BM_diagnostic_test(benchmark::State& state) {
+  static bool called_once = false;
+
+  if (called_once == false) try_invalid_pause_resume(state);
+
+  while (state.KeepRunning()) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+
+  if (called_once == false) try_invalid_pause_resume(state);
+
+  called_once = true;
+}
+BENCHMARK(BM_diagnostic_test);
+
+int main(int argc, char** argv) {
+  benchmark::internal::GetAbortHandler() = &TestHandler;
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+}
Index: utils/google-benchmark/test/donotoptimize_test.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/test/donotoptimize_test.cc
@@ -0,0 +1,36 @@
+#include "benchmark/benchmark.h"
+
+#include <cstdint>
+
+namespace {
+#if defined(__GNUC__)
+  std::uint64_t double_up(const std::uint64_t x) __attribute__ ((const));
+#endif
+  std::uint64_t double_up(const std::uint64_t x) {
+    return x * 2;
+  }
+}
+
+int main(int, char*[]) {
+
+  // this test verifies compilation of DoNotOptimize() for some types
+
+  char buffer8[8];
+  benchmark::DoNotOptimize(buffer8);
+
+  char buffer20[20];
+  benchmark::DoNotOptimize(buffer20);
+
+  char buffer1024[1024];
+  benchmark::DoNotOptimize(buffer1024);
+  benchmark::DoNotOptimize(&buffer1024[0]);
+
+  int x = 123;
+  benchmark::DoNotOptimize(x);
+  benchmark::DoNotOptimize(&x);
+  benchmark::DoNotOptimize(x += 42);
+
+  benchmark::DoNotOptimize(double_up(x));
+
+  return 0;
+}
Index: utils/google-benchmark/test/filter_test.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/test/filter_test.cc
@@ -0,0 +1,105 @@
+#include "benchmark/benchmark.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+
+#include <iostream>
+#include <limits>
+#include <sstream>
+#include <string>
+
+namespace {
+
+class TestReporter : public benchmark::ConsoleReporter {
+ public:
+  virtual bool ReportContext(const Context& context) {
+    return ConsoleReporter::ReportContext(context);
+  };
+
+  virtual void ReportRuns(const std::vector<Run>& report) {
+    ++count_;
+    ConsoleReporter::ReportRuns(report);
+  };
+
+  TestReporter() : count_(0) {}
+
+  virtual ~TestReporter() {}
+
+  size_t GetCount() const {
+    return count_;
+  }
+
+ private:
+  mutable size_t count_;
+};
+
+}  // end namespace
+
+
+static void NoPrefix(benchmark::State& state) {
+  while (state.KeepRunning()) {}
+}
+BENCHMARK(NoPrefix);
+
+static void BM_Foo(benchmark::State& state) {
+  while (state.KeepRunning()) {}
+}
+BENCHMARK(BM_Foo);
+
+
+static void BM_Bar(benchmark::State& state) {
+  while (state.KeepRunning()) {}
+}
+BENCHMARK(BM_Bar);
+
+
+static void BM_FooBar(benchmark::State& state) {
+  while (state.KeepRunning()) {}
+}
+BENCHMARK(BM_FooBar);
+
+
+static void BM_FooBa(benchmark::State& state) {
+  while (state.KeepRunning()) {}
+}
+BENCHMARK(BM_FooBa);
+
+
+
+int main(int argc, char** argv) {
+  bool list_only = false;
+  for (int i=0; i < argc; ++i)
+    list_only |= std::string(argv[i]).find("--benchmark_list_tests") != std::string::npos;
+
+  benchmark::Initialize(&argc, argv);
+
+  TestReporter test_reporter;
+  const size_t returned_count = benchmark::RunSpecifiedBenchmarks(&test_reporter);
+
+  if (argc == 2) {
+    // Make sure we ran all of the tests
+    std::stringstream ss(argv[1]);
+    size_t expected_return;
+    ss >> expected_return;
+
+    if (returned_count != expected_return) {
+      std::cerr << "ERROR: Expected " << expected_return
+                << " tests to match the filter but returned_count = "
+                << returned_count << std::endl;
+      return -1;
+    }
+
+    const size_t expected_reports = list_only ? 0 : expected_return;
+    const size_t reports_count = test_reporter.GetCount();
+    if (reports_count != expected_reports) {
+      std::cerr << "ERROR: Expected " << expected_reports
+                << " tests to be run but reported_count = " << reports_count
+                << std::endl;
+      return -1;
+    }
+  }
+
+  return 0;
+}
Index: utils/google-benchmark/test/fixture_test.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/test/fixture_test.cc
@@ -0,0 +1,52 @@
+
+#include "benchmark/benchmark.h"
+
+#include <cassert>
+#include <memory>
+
+class MyFixture : public ::benchmark::Fixture {
+ public:
+  void SetUp(const ::benchmark::State& state) {
+    if (state.thread_index == 0) {
+      assert(data.get() == nullptr);
+      data.reset(new int(42));
+    }
+  }
+
+  void TearDown(const ::benchmark::State& state) {
+    if (state.thread_index == 0) {
+      assert(data.get() != nullptr);
+      data.reset();
+    }
+  }
+
+  ~MyFixture() {
+    assert(data == nullptr);
+  }
+
+  std::unique_ptr<int> data;
+};
+
+
+BENCHMARK_F(MyFixture, Foo)(benchmark::State& st) {
+  assert(data.get() != nullptr);
+  assert(*data == 42);
+  while (st.KeepRunning()) {
+  }
+}
+
+BENCHMARK_DEFINE_F(MyFixture, Bar)(benchmark::State& st) {
+  if (st.thread_index == 0) {
+    assert(data.get() != nullptr);
+    assert(*data == 42);
+  }
+  while (st.KeepRunning()) {
+    assert(data.get() != nullptr);
+    assert(*data == 42);
+  }
+  st.SetItemsProcessed(st.range_x());
+}
+BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42);
+BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42)->ThreadPerCpu();
+
+BENCHMARK_MAIN()
Index: utils/google-benchmark/test/map_test.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/test/map_test.cc
@@ -0,0 +1,58 @@
+#include "benchmark/benchmark.h"
+
+#include <cstdlib>
+#include <map>
+
+namespace {
+
+std::map<int, int> ConstructRandomMap(int size) {
+  std::map<int, int> m;
+  for (int i = 0; i < size; ++i) {
+    m.insert(std::make_pair(rand() % size, rand() % size));
+  }
+  return m;
+}
+
+}  // namespace
+
+// Basic version.
+static void BM_MapLookup(benchmark::State& state) {
+  const int size = state.range_x();
+  while (state.KeepRunning()) {
+    state.PauseTiming();
+    std::map<int, int> m = ConstructRandomMap(size);
+    state.ResumeTiming();
+    for (int i = 0; i < size; ++i) {
+      benchmark::DoNotOptimize(m.find(rand() % size));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * size);
+}
+BENCHMARK(BM_MapLookup)->Range(1 << 3, 1 << 12);
+
+// Using fixtures.
+class MapFixture : public ::benchmark::Fixture {
+ public:
+  void SetUp(const ::benchmark::State& st) {
+    m = ConstructRandomMap(st.range_x());
+  }
+
+  void TearDown(const ::benchmark::State&) {
+    m.clear();
+  }
+
+  std::map<int, int> m;
+};
+
+BENCHMARK_DEFINE_F(MapFixture, Lookup)(benchmark::State& state) {
+  const int size = state.range_x();
+  while (state.KeepRunning()) {
+    for (int i = 0; i < size; ++i) {
+      benchmark::DoNotOptimize(m.find(rand() % size));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * size);
+}
+BENCHMARK_REGISTER_F(MapFixture, Lookup)->Range(1<<3, 1<<12);
+
+BENCHMARK_MAIN()
Index: utils/google-benchmark/test/options_test.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/test/options_test.cc
@@ -0,0 +1,44 @@
+#include "benchmark/benchmark_api.h"
+
+#include <chrono>
+#include <thread>
+
+void BM_basic(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+}
+
+void BM_basic_slow(benchmark::State& state) {
+  std::chrono::milliseconds sleep_duration(state.range_x());
+  while (state.KeepRunning()) {
+    std::this_thread::sleep_for(
+      std::chrono::duration_cast<std::chrono::nanoseconds>(sleep_duration)
+      );
+  }
+}
+
+BENCHMARK(BM_basic);
+BENCHMARK(BM_basic)->Arg(42);
+BENCHMARK(BM_basic_slow)->Arg(10)->Unit(benchmark::kNanosecond);
+BENCHMARK(BM_basic_slow)->Arg(100)->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_basic_slow)->Arg(1000)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_basic)->Range(1, 8);
+BENCHMARK(BM_basic)->RangeMultiplier(2)->Range(1, 8);
+BENCHMARK(BM_basic)->DenseRange(10, 15);
+BENCHMARK(BM_basic)->ArgPair(42, 42);
+BENCHMARK(BM_basic)->RangePair(64, 512, 64, 512);
+BENCHMARK(BM_basic)->MinTime(0.7);
+BENCHMARK(BM_basic)->UseRealTime();
+BENCHMARK(BM_basic)->ThreadRange(2, 4);
+BENCHMARK(BM_basic)->ThreadPerCpu();
+BENCHMARK(BM_basic)->Repetitions(3);
+
+void CustomArgs(benchmark::internal::Benchmark* b) {
+  for (int i = 0; i < 10; ++i) {
+    b->Arg(i);
+  }
+}
+
+BENCHMARK(BM_basic)->Apply(CustomArgs);
+
+BENCHMARK_MAIN()
Index: utils/google-benchmark/test/reporter_output_test.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/test/reporter_output_test.cc
@@ -0,0 +1,259 @@
+
+#undef NDEBUG
+#include "benchmark/benchmark.h"
+#include "../src/check.h" // NOTE: check.h is for internal use only!
+#include "../src/re.h" // NOTE: re.h is for internal use only
+#include <cassert>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <utility>
+
+namespace {
+
+// ========================================================================= //
+// -------------------------- Testing Case --------------------------------- //
+// ========================================================================= //
+
+enum MatchRules {
+  MR_Default, // Skip non-matching lines until a match is found.
+  MR_Next    // Match must occur on the next line.
+};
+
+struct TestCase {
+  std::string regex;
+  int match_rule;
+
+  TestCase(std::string re, int rule = MR_Default) : regex(re), match_rule(rule) {}
+
+  void Check(std::stringstream& remaining_output) const {
+    benchmark::Regex r;
+    std::string err_str;
+    r.Init(regex, &err_str);
+    CHECK(err_str.empty()) << "Could not construct regex \"" << regex << "\""
+                           << " got Error: " << err_str;
+
+    std::string line;
+    while (remaining_output.eof() == false) {
+        CHECK(remaining_output.good());
+        std::getline(remaining_output, line);
+        if (r.Match(line)) return;
+        CHECK(match_rule != MR_Next) << "Expected line \"" << line
+                                     << "\" to match regex \"" << regex << "\"";
+    }
+
+    CHECK(remaining_output.eof() == false)
+        << "End of output reached before match for regex \"" << regex
+        << "\" was found";
+  }
+};
+
+std::vector<TestCase> ConsoleOutputTests;
+std::vector<TestCase> JSONOutputTests;
+std::vector<TestCase> CSVOutputTests;
+
+std::vector<TestCase> ConsoleErrorTests;
+std::vector<TestCase> JSONErrorTests;
+std::vector<TestCase> CSVErrorTests;
+
+// ========================================================================= //
+// -------------------------- Test Helpers --------------------------------- //
+// ========================================================================= //
+
+class TestReporter : public benchmark::BenchmarkReporter {
+public:
+  TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
+      : reporters_(reps)  {}
+
+  virtual bool ReportContext(const Context& context) {
+    bool last_ret = false;
+    bool first = true;
+    for (auto rep : reporters_) {
+      bool new_ret = rep->ReportContext(context);
+      CHECK(first || new_ret == last_ret)
+          << "Reports return different values for ReportContext";
+      first = false;
+      last_ret = new_ret;
+    }
+    return last_ret;
+  }
+
+  virtual void ReportRuns(const std::vector<Run>& report) {
+    for (auto rep : reporters_)
+      rep->ReportRuns(report);
+  }
+
+  virtual void Finalize() {
+      for (auto rep : reporters_)
+        rep->Finalize();
+  }
+
+private:
+  std::vector<benchmark::BenchmarkReporter*> reporters_;
+};
+
+
+#define CONCAT2(x, y) x##y
+#define CONCAT(x, y) CONCAT2(x, y)
+
+#define ADD_CASES(...) \
+    int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
+
+int AddCases(std::vector<TestCase>* out, std::initializer_list<TestCase> const& v) {
+  for (auto const& TC : v)
+    out->push_back(TC);
+  return 0;
+}
+
+template <class First>
+std::string join(First f) { return f; }
+
+template <class First, class ...Args>
+std::string join(First f, Args&&... args) {
+    return std::string(std::move(f)) + "[ ]+" + join(std::forward<Args>(args)...);
+}
+
+std::string dec_re = "[0-9]+\\.[0-9]+";
+
+}  // end namespace
+
+// ========================================================================= //
+// ---------------------- Testing Prologue Output -------------------------- //
+// ========================================================================= //
+
+ADD_CASES(&ConsoleOutputTests, {
+    {join("^Benchmark", "Time", "CPU", "Iterations$"), MR_Next},
+    {"^[-]+$", MR_Next}
+});
+ADD_CASES(&CSVOutputTests, {
+  {"name,iterations,real_time,cpu_time,time_unit,bytes_per_second,items_per_second,"
+    "label,error_occurred,error_message"}
+});
+
+// ========================================================================= //
+// ------------------------ Testing Basic Output --------------------------- //
+// ========================================================================= //
+
+void BM_basic(benchmark::State& state) {
+  while (state.KeepRunning()) {}
+}
+BENCHMARK(BM_basic);
+
+ADD_CASES(&ConsoleOutputTests, {
+    {"^BM_basic[ ]+[0-9]{1,5} ns[ ]+[0-9]{1,5} ns[ ]+[0-9]+$"}
+});
+ADD_CASES(&JSONOutputTests, {
+    {"\"name\": \"BM_basic\",$"},
+    {"\"iterations\": [0-9]+,$", MR_Next},
+    {"\"real_time\": [0-9]{1,5},$", MR_Next},
+    {"\"cpu_time\": [0-9]{1,5},$", MR_Next},
+    {"\"time_unit\": \"ns\"$", MR_Next},
+    {"}", MR_Next}
+});
+ADD_CASES(&CSVOutputTests, {
+    {"^\"BM_basic\",[0-9]+," + dec_re + "," + dec_re + ",ns,,,,,$"}
+});
+
+// ========================================================================= //
+// ------------------------ Testing Error Output --------------------------- //
+// ========================================================================= //
+
+void BM_error(benchmark::State& state) {
+    state.SkipWithError("message");
+    while(state.KeepRunning()) {}
+}
+BENCHMARK(BM_error);
+ADD_CASES(&ConsoleOutputTests, {
+    {"^BM_error[ ]+ERROR OCCURRED: 'message'$"}
+});
+ADD_CASES(&JSONOutputTests, {
+    {"\"name\": \"BM_error\",$"},
+    {"\"error_occurred\": true,$", MR_Next},
+    {"\"error_message\": \"message\",$", MR_Next}
+});
+
+ADD_CASES(&CSVOutputTests, {
+    {"^\"BM_error\",,,,,,,,true,\"message\"$"}
+});
+
+
+// ========================================================================= //
+// ----------------------- Testing Complexity Output ----------------------- //
+// ========================================================================= //
+
+void BM_Complexity_O1(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  state.SetComplexityN(state.range_x());
+}
+BENCHMARK(BM_Complexity_O1)->Range(1, 1<<18)->Complexity(benchmark::o1);
+
+std::string bigOStr = "[0-9]+\\.[0-9]+ \\([0-9]+\\)";
+
+ADD_CASES(&ConsoleOutputTests, {
+   {join("^BM_Complexity_O1_BigO", bigOStr, bigOStr) + "[ ]*$"},
+   {join("^BM_Complexity_O1_RMS", "[0-9]+ %", "[0-9]+ %") + "[ ]*$"}
+});
+
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+
+int main(int argc, char* argv[]) {
+  // Add --color_print=false to argv since we don't want to match color codes.
+  char new_arg[64];
+  char* new_argv[64];
+  std::copy(argv, argv + argc, new_argv);
+  new_argv[argc++] = std::strcpy(new_arg, "--color_print=false");
+  benchmark::Initialize(&argc, new_argv);
+
+  benchmark::ConsoleReporter CR;
+  benchmark::JSONReporter JR;
+  benchmark::CSVReporter CSVR;
+  struct ReporterTest {
+    const char* name;
+    std::vector<TestCase>& output_cases;
+    std::vector<TestCase>& error_cases;
+    benchmark::BenchmarkReporter& reporter;
+    std::stringstream out_stream;
+    std::stringstream err_stream;
+
+    ReporterTest(const char* n,
+                 std::vector<TestCase>& out_tc,
+                 std::vector<TestCase>& err_tc,
+                 benchmark::BenchmarkReporter& br)
+        : name(n), output_cases(out_tc), error_cases(err_tc), reporter(br) {
+        reporter.SetOutputStream(&out_stream);
+        reporter.SetErrorStream(&err_stream);
+    }
+  } TestCases[] = {
+      {"ConsoleReporter", ConsoleOutputTests, ConsoleErrorTests, CR},
+      {"JSONReporter", JSONOutputTests, JSONErrorTests, JR},
+      {"CSVReporter", CSVOutputTests, CSVErrorTests, CSVR}
+  };
+
+  // Create the test reporter and run the benchmarks.
+  std::cout << "Running benchmarks...\n";
+  TestReporter test_rep({&CR, &JR, &CSVR});
+  benchmark::RunSpecifiedBenchmarks(&test_rep);
+
+  for (auto& rep_test : TestCases) {
+      std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n";
+      std::string banner(msg.size() - 1, '-');
+      std::cout << banner << msg << banner << "\n";
+
+      std::cerr << rep_test.err_stream.str();
+      std::cout << rep_test.out_stream.str();
+
+      for (const auto& TC : rep_test.error_cases)
+        TC.Check(rep_test.err_stream);
+      for (const auto& TC : rep_test.output_cases)
+        TC.Check(rep_test.out_stream);
+
+      std::cout << "\n";
+  }
+  return 0;
+}
Index: utils/google-benchmark/test/skip_with_error_test.cc
===================================================================
--- /dev/null
+++ utils/google-benchmark/test/skip_with_error_test.cc
@@ -0,0 +1,161 @@
+
+#undef NDEBUG
+#include "benchmark/benchmark.h"
+#include "../src/check.h" // NOTE: check.h is for internal use only!
+#include <cassert>
+#include <vector>
+
+namespace {
+
+class TestReporter : public benchmark::ConsoleReporter {
+ public:
+  virtual bool ReportContext(const Context& context) {
+    return ConsoleReporter::ReportContext(context);
+  };
+
+  virtual void ReportRuns(const std::vector<Run>& report) {
+    all_runs_.insert(all_runs_.end(), begin(report), end(report));
+    ConsoleReporter::ReportRuns(report);
+  }
+
+  TestReporter()  {}
+  virtual ~TestReporter() {}
+
+  mutable std::vector<Run> all_runs_;
+};
+
+struct TestCase {
+  std::string name;
+  bool error_occurred;
+  std::string error_message;
+
+  typedef benchmark::BenchmarkReporter::Run Run;
+
+  void CheckRun(Run const& run) const {
+    CHECK(name == run.benchmark_name) << "expected " << name << " got " << run.benchmark_name;
+    CHECK(error_occurred == run.error_occurred);
+    CHECK(error_message == run.error_message);
+    if (error_occurred) {
+      //CHECK(run.iterations == 0);
+    } else {
+      CHECK(run.iterations != 0);
+    }
+  }
+};
+
+std::vector<TestCase> ExpectedResults;
+
+int AddCases(const char* base_name, std::initializer_list<TestCase> const& v) {
+  for (auto TC : v) {
+    TC.name = base_name + TC.name;
+    ExpectedResults.push_back(std::move(TC));
+  }
+  return 0;
+}
+
+#define CONCAT(x, y) CONCAT2(x, y)
+#define CONCAT2(x, y) x##y
+#define ADD_CASES(...) \
+int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
+
+}  // end namespace
+
+
+void BM_error_before_running(benchmark::State& state) {
+  state.SkipWithError("error message");
+  while (state.KeepRunning()) {
+    assert(false);
+  }
+}
+BENCHMARK(BM_error_before_running);
+ADD_CASES("BM_error_before_running",
+          {{"", true, "error message"}});
+
+void BM_error_during_running(benchmark::State& state) {
+  int first_iter = true;
+  while (state.KeepRunning()) {
+    if (state.range_x() == 1 && state.thread_index <= (state.threads / 2)) {
+      assert(first_iter);
+      first_iter = false;
+      state.SkipWithError("error message");
+    } else {
+      state.PauseTiming();
+      state.ResumeTiming();
+    }
+  }
+}
+BENCHMARK(BM_error_during_running)->Arg(1)->Arg(2)->ThreadRange(1, 8);
+ADD_CASES(
+    "BM_error_during_running",
+    {{"/1/threads:1", true, "error message"},
+    {"/1/threads:2", true, "error message"},
+    {"/1/threads:4", true, "error message"},
+    {"/1/threads:8", true, "error message"},
+    {"/2/threads:1", false, ""},
+    {"/2/threads:2", false, ""},
+    {"/2/threads:4", false, ""},
+    {"/2/threads:8", false, ""}}
+);
+
+void BM_error_after_running(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  if (state.thread_index <= (state.threads / 2))
+    state.SkipWithError("error message");
+}
+BENCHMARK(BM_error_after_running)->ThreadRange(1, 8);
+ADD_CASES(
+    "BM_error_after_running",
+    {{"/threads:1", true, "error message"},
+    {"/threads:2", true, "error message"},
+    {"/threads:4", true, "error message"},
+    {"/threads:8", true, "error message"}}
+);
+
+void BM_error_while_paused(benchmark::State& state) {
+  bool first_iter = true;
+  while (state.KeepRunning()) {
+    if (state.range_x() == 1 && state.thread_index <= (state.threads / 2)) {
+      assert(first_iter);
+      first_iter = false;
+      state.PauseTiming();
+      state.SkipWithError("error message");
+    } else {
+      state.PauseTiming();
+      state.ResumeTiming();
+    }
+  }
+}
+BENCHMARK(BM_error_while_paused)->Arg(1)->Arg(2)->ThreadRange(1, 8);
+ADD_CASES(
+    "BM_error_while_paused",
+    {{"/1/threads:1", true, "error message"},
+    {"/1/threads:2", true, "error message"},
+    {"/1/threads:4", true, "error message"},
+    {"/1/threads:8", true, "error message"},
+    {"/2/threads:1", false, ""},
+    {"/2/threads:2", false, ""},
+    {"/2/threads:4", false, ""},
+    {"/2/threads:8", false, ""}}
+);
+
+
+int main(int argc, char* argv[]) {
+  benchmark::Initialize(&argc, argv);
+
+  TestReporter test_reporter;
+  benchmark::RunSpecifiedBenchmarks(&test_reporter);
+
+  typedef benchmark::BenchmarkReporter::Run Run;
+  auto EB = ExpectedResults.begin();
+
+  for (Run const& run : test_reporter.all_runs_) {
+    assert(EB != ExpectedResults.end());
+    EB->CheckRun(run);
+    ++EB;
+  }
+  assert(EB == ExpectedResults.end());
+
+  return 0;
+}