diff --git a/compiler-rt/test/hwasan/TestCases/stack-uas.c b/compiler-rt/test/hwasan/TestCases/stack-uas.c
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/stack-uas.c
@@ -0,0 +1,69 @@
+// Tests use-after-scope detection and reporting.
+// RUN: %clang_hwasan -mllvm -hwasan-use-after-scope -g %s -o %t && not %run %t 2>&1 | FileCheck %s
+// RUN: %clang_hwasan -mllvm -hwasan-use-after-scope -g %s -o %t && not %env_hwasan_opts=symbolize=0 %run %t 2>&1 | FileCheck %s --check-prefix=NOSYM
+
+// RUN: %clang_hwasan -mllvm -hwasan-use-after-scope=false -g %s -o %t && %run %t 2>&1
+// Use after scope is turned off by default.
+// RUN: %clang_hwasan -g %s -o %t && %run %t 2>&1
+
+
+// RUN: %clang_hwasan -fexperimental-new-pass-manager -mllvm -hwasan-use-after-scope -g %s -o %t && not %run %t 2>&1 | FileCheck %s
+// RUN: %clang_hwasan -fno-experimental-new-pass-manager -mllvm -hwasan-use-after-scope -g %s -o %t && not %run %t 2>&1 | FileCheck %s
+
+// REQUIRES: stable-runtime
+
+// Stack histories currently are not recorded on x86.
+// XFAIL: x86_64
+
+void USE(void *x) { // pretends to use x: hands it to an empty asm statement
+  __asm__ __volatile__(""
+                       :
+                       : "r"(x) // x is the only operand: an input held in a register
+                       : "memory"); // "memory" is listed as clobbered
+}
+
+__attribute__((noinline)) void Unrelated1() { // marked noinline; called first by main()
+  int A[2]; // local array whose address is handed to USE()
+  USE(&A[0]);
+}
+__attribute__((noinline)) void Unrelated2() { // marked noinline; called second by main()
+  int BB[3]; // local array whose address is handed to USE()
+  USE(&BB[0]);
+}
+__attribute__((noinline)) void Unrelated3() { // marked noinline; called third by main()
+  int CCC[4]; // local array whose address is handed to USE()
+  USE(&CCC[0]);
+}
+
+__attribute__((noinline)) char buggy() { // reads zzz through p after zzz's block has closed
+  char *volatile p; // the pointer object itself is volatile
+  {
+    char zzz[0x1000]; // 4096-byte array that exists only inside this inner block
+    p = zzz;
+  }
+  return *p; // load through p once the block declaring zzz has ended
+}
+
+int main() { // calls the three Unrelated helpers, then buggy(), and returns its result
+  Unrelated1();
+  Unrelated2();
+  Unrelated3();
+  char p = buggy(); // buggy() performs the load that the expectations below name
+  return p;
+  // CHECK: READ of size 1 at
+  // CHECK: #0 {{.*}} in buggy{{.*}}stack-uas.c:[[@LINE-10]]
+  // CHECK: Cause: stack tag-mismatch
+  // CHECK: is located in stack of thread
+  // CHECK: Potentially referenced stack objects:
+  // CHECK-NEXT: zzz in buggy {{.*}}stack-uas.c:[[@LINE-17]]
+  // CHECK-NEXT: Memory tags around the buggy address
+
+  // NOSYM: Previously allocated frames:
+  // NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uas.c.tmp+0x{{.*}}){{$}}
+  // NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uas.c.tmp+0x{{.*}}){{$}}
+  // NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uas.c.tmp+0x{{.*}}){{$}}
+  // NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uas.c.tmp+0x{{.*}}){{$}}
+  // NOSYM-NEXT: Memory tags around the buggy address
+
+  // CHECK: SUMMARY: HWAddressSanitizer: tag-mismatch {{.*}} in buggy
+}
diff --git a/compiler-rt/test/hwasan/TestCases/use-after-scope-capture.cpp b/compiler-rt/test/hwasan/TestCases/use-after-scope-capture.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/use-after-scope-capture.cpp
@@ -0,0 +1,21 @@
+// This is the ASAN test of the same name ported to HWAsan.
+
+// RUN: %clangxx_hwasan -mllvm -hwasan-use-after-scope --std=c++11 -O1 %s -o %t && not %run %t 2>&1 | FileCheck %s
+
+// REQUIRES: aarch64-target-arch
+
+#include <functional>
+
+int main() {
+  std::function<int()> f; // declared before the inner block, so it outlives x
+  {
+    int x = 0;
+    f = [&x]() __attribute__((noinline)) { // x is captured by reference
+      return x; // BOOM: reads x through the captured reference
+      // CHECK: ERROR: HWAddressSanitizer: tag-mismatch
+      // CHECK: #0 0x{{.*}} in {{.*}}use-after-scope-capture.cpp:[[@LINE-2]]
+      // CHECK: Cause: stack tag-mismatch
+    };
+  }
+  return f(); // BOOM: f runs after the block that declared x has closed
+}
diff --git a/compiler-rt/test/hwasan/TestCases/use-after-scope-constructor-temp.cpp b/compiler-rt/test/hwasan/TestCases/use-after-scope-constructor-temp.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/use-after-scope-constructor-temp.cpp
@@ -0,0 +1,1616 @@
+// RUN: %clangxx_hwasan -mllvm -hwasan-use-after-scope -O1 %s -o %t && \
+// RUN:     %run %t 2>&1
+
+// REQUIRES: aarch64-target-arch
+
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <sstream>
+#include <type_traits>
+#include <vector>
+
+/*
+struct NoInit{};
+
+class Vec2 {
+	public:
+
+    union {
+        struct { float x, y; };
+        struct { float s, t; };
+        struct { float r, g; };
+    };
+	explicit Vec2(NoInit) {}
+
+	Vec2() : x(0), y(0) {}
+	Vec2(float x, float y, float z) : x(x), y(y) {}
+	Vec2(const Vec2&) = default;
+	float& operator[](size_t n) {
+		return (&x)[n];
+	}
+};
+
+
+class Vec4 {
+	public:
+
+    union {
+        struct { float x, y, z, w; };
+        struct { float s, t, p, q; };
+        struct { float r, g, b, a; };
+        Vec2 xy;
+    };
+	explicit Vec4(NoInit) {}
+
+	Vec4() : x(0), y(0), z(0), w(0) {}
+	Vec4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
+	Vec4(const Vec4&) = default;
+	float& operator[](size_t n) {
+		return (&x)[n];
+	}
+};
+
+class Mat {
+	public:
+	Mat() = default;
+	explicit Mat(NoInit) : cols{Vec4(NoInit{}), Vec4(NoInit{}), Vec4(NoInit{}), Vec4(NoInit{})} {}
+	Vec4& operator[](size_t n) {
+		return cols[n];
+	}
+	Vec4 cols[4];
+};
+
+Mat transpose(Mat m) {
+    Mat result(NoInit{});
+    for (size_t col = 0; col < 4; ++col) {
+        for (size_t row = 0; row < 4; ++row) {
+            result[col][row] = float(m[row][col]);
+        }
+    }
+    return result;
+}
+
+
+
+*/
+
+namespace details {
+#define PURE __attribute__((pure)) // shorthand for the "pure" function attribute
+#if __cplusplus >= 201402L // C++14 or newer
+#define CONSTEXPR constexpr
+#else
+#define CONSTEXPR // expands to nothing for older language versions
+#endif
+
+template <template<typename T> class VECTOR, typename T>
+class TVecAddOperators { // supplies +=, -=, + and - for VECTOR<T>
+public:
+    /* compound assignment from another vector of the same size but different
+     * element type.
+     */
+    template<typename OTHER>
+    VECTOR<T>& operator +=(const VECTOR<OTHER>& v) {
+        VECTOR<T>& lhs = static_cast<VECTOR<T>&>(*this); // *this viewed as the VECTOR<T> it belongs to
+        for (size_t i = 0; i < lhs.size(); i++) {
+            lhs[i] += v[i];
+        }
+        return lhs;
+    }
+    template<typename OTHER>
+    VECTOR<T>& operator -=(const VECTOR<OTHER>& v) {
+        VECTOR<T>& lhs = static_cast<VECTOR<T>&>(*this);
+        for (size_t i = 0; i < lhs.size(); i++) {
+            lhs[i] -= v[i];
+        }
+        return lhs;
+    }
+
+    /* compound assignment from another vector of the same type.
+     * These operators can be used for implicit conversion and handle operations
+     * like "vector += scalar" by letting the compiler implicitly convert a scalar
+     * to a vector (assuming VECTOR<T> allows it).
+     */
+    VECTOR<T>& operator +=(const VECTOR<T>& v) {
+        VECTOR<T>& lhs = static_cast<VECTOR<T>&>(*this);
+        for (size_t i = 0; i < lhs.size(); i++) {
+            lhs[i] += v[i];
+        }
+        return lhs;
+    }
+    VECTOR<T>& operator -=(const VECTOR<T>& v) {
+        VECTOR<T>& lhs = static_cast<VECTOR<T>&>(*this);
+        for (size_t i = 0; i < lhs.size(); i++) {
+            lhs[i] -= v[i];
+        }
+        return lhs;
+    }
+
+    /*
+     * NOTE: the functions below ARE NOT member methods. They are friend functions
+     * with their definition inlined with their declaration. This makes these
+     * template functions available to the compiler when (and only when) this class
+     * is instantiated, at which point they're only templated on the 2nd parameter
+     * (the first one, VECTOR<T> being known).
+     */
+
+    /* The operators below handle operation between vectors of the same size
+     * but of a different element type.
+     */
+    template<typename RT>
+    friend inline constexpr VECTOR<T> PURE operator +(VECTOR<T> lv, const VECTOR<RT>& rv) {
+        // don't pass lv by reference because we need a copy anyways
+        return lv += rv;
+    }
+    template<typename RT>
+    friend inline constexpr VECTOR<T> PURE operator -(VECTOR<T> lv, const VECTOR<RT>& rv) {
+        // don't pass lv by reference because we need a copy anyways
+        return lv -= rv;
+    }
+
+    /* The operators below (which are not templates once this class is instantiated,
+     * i.e.: VECTOR<T> is known) can be used for implicit conversion on both sides.
+     * These handle operations like "vector + scalar" and "scalar + vector" by
+     * letting the compiler implicitly convert a scalar to a vector (assuming
+     * VECTOR<T> allows it).
+     */
+    friend inline constexpr VECTOR<T> PURE operator +(VECTOR<T> lv, const VECTOR<T>& rv) {
+        // don't pass lv by reference because we need a copy anyways
+        return lv += rv;
+    }
+    friend inline constexpr VECTOR<T> PURE operator -(VECTOR<T> lv, const VECTOR<T>& rv) {
+        // don't pass lv by reference because we need a copy anyways
+        return lv -= rv;
+    }
+};
+
+template<template<typename T> class VECTOR, typename T>
+class TVecProductOperators { // supplies *=, /=, * and / for VECTOR<T>
+public:
+    /* compound assignment from another vector of the same size but different
+     * element type.
+     */
+    template<typename OTHER>
+    VECTOR<T>& operator *=(const VECTOR<OTHER>& v) {
+        VECTOR<T>& lhs = static_cast<VECTOR<T>&>(*this); // *this viewed as the VECTOR<T> it belongs to
+        for (size_t i = 0; i < lhs.size(); i++) {
+            lhs[i] *= v[i];
+        }
+        return lhs;
+    }
+    template<typename OTHER>
+    VECTOR<T>& operator /=(const VECTOR<OTHER>& v) {
+        VECTOR<T>& lhs = static_cast<VECTOR<T>&>(*this);
+        for (size_t i = 0; i < lhs.size(); i++) {
+            lhs[i] /= v[i];
+        }
+        return lhs;
+    }
+
+    /* compound assignment from another vector of the same type.
+     * These operators can be used for implicit conversion and handle operations
+     * like "vector *= scalar" by letting the compiler implicitly convert a scalar
+     * to a vector (assuming VECTOR<T> allows it).
+     */
+    VECTOR<T>& operator *=(const VECTOR<T>& v) {
+        VECTOR<T>& lhs = static_cast<VECTOR<T>&>(*this);
+        for (size_t i = 0; i < lhs.size(); i++) {
+            lhs[i] *= v[i];
+        }
+        return lhs;
+    }
+    VECTOR<T>& operator /=(const VECTOR<T>& v) {
+        VECTOR<T>& lhs = static_cast<VECTOR<T>&>(*this);
+        for (size_t i = 0; i < lhs.size(); i++) {
+            lhs[i] /= v[i];
+        }
+        return lhs;
+    }
+
+    /*
+     * NOTE: the functions below ARE NOT member methods. They are friend functions
+     * with their definition inlined with their declaration. This makes these
+     * template functions available to the compiler when (and only when) this class
+     * is instantiated, at which point they're only templated on the 2nd parameter
+     * (the first one, VECTOR<T> being known).
+     */
+
+    /* The operators below handle operation between vectors of the same size
+     * but of a different element type.
+     */
+    template<typename RT>
+    friend inline constexpr VECTOR<T> PURE operator *(VECTOR<T> lv, const VECTOR<RT>& rv) {
+        // don't pass lv by reference because we need a copy anyways
+        return lv *= rv;
+    }
+    template<typename RT>
+    friend inline constexpr VECTOR<T> PURE operator /(VECTOR<T> lv, const VECTOR<RT>& rv) {
+        // don't pass lv by reference because we need a copy anyways
+        return lv /= rv;
+    }
+
+    /* The operators below (which are not templates once this class is instantiated,
+     * i.e.: VECTOR<T> is known) can be used for implicit conversion on both sides.
+     * These handle operations like "vector * scalar" and "scalar * vector" by
+     * letting the compiler implicitly convert a scalar to a vector (assuming
+     * VECTOR<T> allows it).
+     */
+    friend inline constexpr VECTOR<T> PURE operator *(VECTOR<T> lv, const VECTOR<T>& rv) {
+        // don't pass lv by reference because we need a copy anyways
+        return lv *= rv;
+    }
+    friend inline constexpr VECTOR<T> PURE operator /(VECTOR<T> lv, const VECTOR<T>& rv) {
+        // don't pass lv by reference because we need a copy anyways
+        return lv /= rv;
+    }
+};
+
+/*
+ * TVecUnaryOperators implements unary operators on a vector of type VECTOR<T>.
+ *
+ * VECTOR only needs to implement operator[], size() and, for unary minus, a
+ * constructor taking VECTOR<T>::NO_INIT. By inheriting from
+ * TVecUnaryOperators<VECTOR, T>, VECTOR automatically gets the operators below.
+ *
+ * These operators are implemented as member functions of TVecUnaryOperators<VECTOR, T>.
+ */
+template<template<typename T> class VECTOR, typename T>
+class TVecUnaryOperators {
+public:
+    VECTOR<T>& operator ++() { // pre-increment of every element
+        VECTOR<T>& rhs = static_cast<VECTOR<T>&>(*this);
+        for (size_t i = 0; i < rhs.size(); i++) {
+            ++rhs[i];
+        }
+        return rhs;
+    }
+
+    VECTOR<T>& operator --() { // pre-decrement of every element
+        VECTOR<T>& rhs = static_cast<VECTOR<T>&>(*this);
+        for (size_t i = 0; i < rhs.size(); i++) {
+            --rhs[i];
+        }
+        return rhs;
+    }
+
+    CONSTEXPR VECTOR<T> operator -() const { // returns a negated copy; *this is not modified
+        VECTOR<T> r(VECTOR<T>::NO_INIT); // every element is assigned in the loop below
+        VECTOR<T> const& rv(static_cast<VECTOR<T> const&>(*this));
+        for (size_t i = 0; i < r.size(); i++) {
+            r[i] = -rv[i];
+        }
+        return r;
+    }
+};
+
+/*
+ * TVecComparisonOperators implements relational/comparison operators
+ * on a vector of type VECTOR<T>.
+ *
+ * VECTOR needs operator[] and size(); equal() and friends also default-construct VECTOR<bool>.
+ * By simply inheriting from TVecComparisonOperators<VECTOR, T> VECTOR will automatically
+ * get all the functionality here.
+ */
+template<template<typename T> class VECTOR, typename T>
+class TVecComparisonOperators {
+public:
+    /*
+     * NOTE: the functions below ARE NOT member methods. They are friend functions
+     * with their definition inlined with their declaration. This makes these
+     * template functions available to the compiler when (and only when) this class
+     * is instantiated, at which point they're only templated on the 2nd parameter
+     * (the first one, VECTOR<T> being known).
+     */
+    template<typename RT>
+    friend inline
+    bool PURE operator ==(const VECTOR<T>& lv, const VECTOR<RT>& rv) {
+        for (size_t i = 0; i < lv.size(); i++)
+            if (lv[i] != rv[i])
+                return false;
+        return true;
+    }
+
+    template<typename RT>
+    friend inline
+    bool PURE operator !=(const VECTOR<T>& lv, const VECTOR<RT>& rv) {
+        return !operator ==(lv, rv);
+    }
+
+    template<typename RT>
+    friend inline
+    bool PURE operator >(const VECTOR<T>& lv, const VECTOR<RT>& rv) {
+        for (size_t i = 0; i < lv.size(); i++) {
+            if (lv[i] == rv[i]) {
+                continue;
+            }
+            return lv[i] > rv[i]; // the first differing element decides
+        }
+        return false; // all elements compared equal
+    }
+
+    template<typename RT>
+    friend inline
+    constexpr bool PURE operator <=(const VECTOR<T>& lv, const VECTOR<RT>& rv) {
+        return !(lv > rv);
+    }
+
+    template<typename RT>
+    friend inline
+    bool PURE operator <(const VECTOR<T>& lv, const VECTOR<RT>& rv) {
+        for (size_t i = 0; i < lv.size(); i++) {
+            if (lv[i] == rv[i]) {
+                continue;
+            }
+            return lv[i] < rv[i]; // the first differing element decides
+        }
+        return false; // all elements compared equal
+    }
+
+    template<typename RT>
+    friend inline
+    constexpr bool PURE operator >=(const VECTOR<T>& lv, const VECTOR<RT>& rv) {
+        return !(lv < rv);
+    }
+
+    template<typename RT>
+    friend inline
+    CONSTEXPR VECTOR<bool> PURE equal(const VECTOR<T>& lv, const VECTOR<RT>& rv) {
+        VECTOR<bool> r; // one bool per element, filled in by the loop
+        for (size_t i = 0; i < lv.size(); i++) {
+            r[i] = lv[i] == rv[i];
+        }
+        return r;
+    }
+
+    template<typename RT>
+    friend inline
+    CONSTEXPR VECTOR<bool> PURE notEqual(const VECTOR<T>& lv, const VECTOR<RT>& rv) {
+        VECTOR<bool> r;
+        for (size_t i = 0; i < lv.size(); i++) {
+            r[i] = lv[i] != rv[i];
+        }
+        return r;
+    }
+
+    template<typename RT>
+    friend inline
+    CONSTEXPR VECTOR<bool> PURE lessThan(const VECTOR<T>& lv, const VECTOR<RT>& rv) {
+        VECTOR<bool> r;
+        for (size_t i = 0; i < lv.size(); i++) {
+            r[i] = lv[i] < rv[i];
+        }
+        return r;
+    }
+
+    template<typename RT>
+    friend inline
+    CONSTEXPR VECTOR<bool> PURE lessThanEqual(const VECTOR<T>& lv, const VECTOR<RT>& rv) {
+        VECTOR<bool> r;
+        for (size_t i = 0; i < lv.size(); i++) {
+            r[i] = lv[i] <= rv[i];
+        }
+        return r;
+    }
+
+    template<typename RT>
+    friend inline
+    CONSTEXPR VECTOR<bool> PURE greaterThan(const VECTOR<T>& lv, const VECTOR<RT>& rv) {
+        VECTOR<bool> r;
+        for (size_t i = 0; i < lv.size(); i++) {
+            r[i] = lv[i] > rv[i];
+        }
+        return r;
+    }
+
+    template<typename RT>
+    friend inline
+    CONSTEXPR VECTOR<bool> PURE greaterThanEqual(const VECTOR<T>& lv, const VECTOR<RT>& rv) {
+        VECTOR<bool> r;
+        for (size_t i = 0; i < lv.size(); i++) {
+            r[i] = lv[i] >= rv[i];
+        }
+        return r;
+    }
+};
+
+/*
+ * TVecFunctions implements functions on a vector of type VECTOR<T>.
+ *
+ * VECTOR only needs to implement operator[] and size().
+ * By simply inheriting from TVecFunctions<VECTOR, T> VECTOR will automatically
+ * get all the functionality here.
+ */
+template<template<typename T> class VECTOR, typename T>
+class TVecFunctions {
+public:
+    /*
+     * NOTE: the functions below ARE NOT member methods. They are friend functions
+     * with their definition inlined with their declaration. This makes these
+     * template functions available to the compiler when (and only when) this class
+     * is instantiated, at which point they're only templated on the 2nd parameter
+     * (the first one, VECTOR<T> being known).
+     */
+    template<typename RT>
+    friend inline CONSTEXPR T PURE dot(const VECTOR<T>& lv, const VECTOR<RT>& rv) {
+        T r(0);
+        for (size_t i = 0; i < lv.size(); i++) {
+            //r = std::fma(lv[i], rv[i], r);
+            r += lv[i] * rv[i];
+        }
+        return r;
+    }
+
+    friend inline constexpr T PURE norm(const VECTOR<T>& lv) {
+        return std::sqrt(dot(lv, lv));
+    }
+
+    friend inline constexpr T PURE length(const VECTOR<T>& lv) {
+        return norm(lv); // same value as norm()
+    }
+
+    friend inline constexpr T PURE norm2(const VECTOR<T>& lv) {
+        return dot(lv, lv); // dot product of lv with itself, no square root
+    }
+
+    friend inline constexpr T PURE length2(const VECTOR<T>& lv) {
+        return norm2(lv); // same value as norm2()
+    }
+
+    template<typename RT>
+    friend inline constexpr T PURE distance(const VECTOR<T>& lv, const VECTOR<RT>& rv) {
+        return length(rv - lv);
+    }
+
+    template<typename RT>
+    friend inline constexpr T PURE distance2(const VECTOR<T>& lv, const VECTOR<RT>& rv) {
+        return length2(rv - lv);
+    }
+
+    friend inline constexpr VECTOR<T> PURE normalize(const VECTOR<T>& lv) {
+        return lv * (T(1) / length(lv)); // no special handling when length(lv) is zero
+    }
+
+    friend inline constexpr VECTOR<T> PURE rcp(VECTOR<T> v) {
+        return T(1) / v; // element-wise 1/v via operator/ and the scalar-to-vector conversion
+    }
+
+    friend inline CONSTEXPR VECTOR<T> PURE abs(VECTOR<T> v) {
+        for (size_t i = 0; i < v.size(); i++) {
+            v[i] = std::abs(v[i]);
+        }
+        return v;
+    }
+
+    friend inline CONSTEXPR VECTOR<T> PURE floor(VECTOR<T> v) {
+        for (size_t i = 0; i < v.size(); i++) {
+            v[i] = std::floor(v[i]);
+        }
+        return v;
+    }
+
+    friend inline CONSTEXPR VECTOR<T> PURE ceil(VECTOR<T> v) {
+        for (size_t i = 0; i < v.size(); i++) {
+            v[i] = std::ceil(v[i]);
+        }
+        return v;
+    }
+
+    friend inline CONSTEXPR VECTOR<T> PURE round(VECTOR<T> v) {
+        for (size_t i = 0; i < v.size(); i++) {
+            v[i] = std::round(v[i]);
+        }
+        return v;
+    }
+
+    friend inline CONSTEXPR VECTOR<T> PURE inversesqrt(VECTOR<T> v) {
+        for (size_t i = 0; i < v.size(); i++) {
+            v[i] = T(1) / std::sqrt(v[i]);
+        }
+        return v;
+    }
+
+    friend inline CONSTEXPR VECTOR<T> PURE sqrt(VECTOR<T> v) {
+        for (size_t i = 0; i < v.size(); i++) {
+            v[i] = std::sqrt(v[i]);
+        }
+        return v;
+    }
+
+    friend inline CONSTEXPR VECTOR<T> PURE pow(VECTOR<T> v, T p) {
+        for (size_t i = 0; i < v.size(); i++) {
+            v[i] = std::pow(v[i], p);
+        }
+        return v;
+    }
+
+    friend inline CONSTEXPR VECTOR<T> PURE saturate(const VECTOR<T>& lv) {
+        return clamp(lv, T(0), T(1)); // clamps every element to the range [0, 1]
+    }
+
+    friend inline CONSTEXPR VECTOR<T> PURE clamp(VECTOR<T> v, T min, T max) {
+        for (size_t i = 0; i< v.size(); i++) {
+            v[i] = std::min(max, std::max(min, v[i])); // raise to min first, then cap at max
+        }
+        return v;
+    }
+
+    friend inline CONSTEXPR VECTOR<T> PURE fma(const VECTOR<T>& lv, const VECTOR<T>& rv, VECTOR<T> a) {
+        for (size_t i = 0; i<lv.size(); i++) {
+            //a[i] = std::fma(lv[i], rv[i], a[i]);
+            a[i] += (lv[i] * rv[i]);
+        }
+        return a;
+    }
+
+    friend inline CONSTEXPR VECTOR<T> PURE min(const VECTOR<T>& u, VECTOR<T> v) {
+        for (size_t i = 0; i < v.size(); i++) {
+            v[i] = std::min(u[i], v[i]);
+        }
+        return v;
+    }
+
+    friend inline CONSTEXPR VECTOR<T> PURE max(const VECTOR<T>& u, VECTOR<T> v) {
+        for (size_t i = 0; i < v.size(); i++) {
+            v[i] = std::max(u[i], v[i]);
+        }
+        return v;
+    }
+
+    friend inline CONSTEXPR T PURE max(const VECTOR<T>& v) {
+        T r(std::numeric_limits<T>::lowest()); // start below every representable value
+        for (size_t i = 0; i < v.size(); i++) {
+            r = std::max(r, v[i]);
+        }
+        return r;
+    }
+
+    friend inline CONSTEXPR T PURE min(const VECTOR<T>& v) {
+        T r(std::numeric_limits<T>::max()); // start at the largest representable value
+        for (size_t i = 0; i < v.size(); i++) {
+            r = std::min(r, v[i]);
+        }
+        return r;
+    }
+
+    friend inline CONSTEXPR VECTOR<T> PURE apply(VECTOR<T> v, const std::function<T(T)>& f) {
+        for (size_t i = 0; i < v.size(); i++) {
+            v[i] = f(v[i]);
+        }
+        return v;
+    }
+
+    friend inline CONSTEXPR bool PURE any(const VECTOR<T>& v) {
+        for (size_t i = 0; i < v.size(); i++) {
+            if (v[i] != T(0)) return true; // stops at the first non-zero element
+        }
+        return false;
+    }
+
+    friend inline CONSTEXPR bool PURE all(const VECTOR<T>& v) {
+        bool result = true;
+        for (size_t i = 0; i < v.size(); i++) {
+            result &= (v[i] != T(0)); // no early exit: every element is visited
+        }
+        return result;
+    }
+
+    template<typename R>
+    friend inline CONSTEXPR VECTOR<R> PURE map(VECTOR<T> v, const std::function<R(T)>& f) {
+        VECTOR<R> result;
+        for (size_t i = 0; i < v.size(); i++) {
+            result[i] = f(v[i]);
+        }
+        return result;
+    }
+};
+
+/*
+ * TVecDebug implements stream output (operator<<) for a vector of type VECTOR<T>.
+ *
+ * VECTOR only needs to implement operator[] and size().
+ * By simply inheriting from TVecDebug<VECTOR, T> VECTOR will automatically
+ * get all the functionality here.
+ */
+template<template<typename T> class VECTOR, typename T>
+class TVecDebug {
+public:
+    /*
+     * NOTE: the function below IS NOT a member method. It is a friend function
+     * with its definition inlined with its declaration. This makes the
+     * function available to the compiler when (and only when) this class
+     * is instantiated, at which point VECTOR<T> is known.
+     * The output has the form "< a, b, c >".
+     */
+    friend std::ostream& operator<<(std::ostream& stream, const VECTOR<T>& v) {
+        stream << "< ";
+        for (size_t i = 0; i < v.size() - 1; i++) { // every element except the last
+            stream << T(v[i]) << ", ";
+        }
+        stream << T(v[v.size() - 1]) << " >"; // last element, without a trailing comma
+        return stream;
+    }
+};
+
+
+template <typename T>
+class TVec2 :   public TVecProductOperators<TVec2, T>,
+                public TVecAddOperators<TVec2, T>,
+                public TVecUnaryOperators<TVec2, T>,
+                public TVecComparisonOperators<TVec2, T>,
+                public TVecFunctions<TVec2, T>,
+                public TVecDebug<TVec2, T> {
+public:
+    enum no_init { NO_INIT }; // tag for the constructor that leaves members unset
+    typedef T value_type;
+    typedef T& reference;
+    typedef T const& const_reference;
+    typedef size_t size_type;
+
+    union { // three sets of names for the same two components
+        struct { T x, y; };
+        struct { T s, t; };
+        struct { T r, g; };
+    };
+
+    static constexpr size_t SIZE = 2;
+    inline constexpr size_type size() const { return SIZE; }
+
+    // array access
+    inline constexpr T const& operator[](size_t i) const {
+#if __cplusplus >= 201402L
+        // assert inside a constexpr function is only possible from C++14 on
+        assert(i < SIZE);
+#endif
+        return (&x)[i]; // indexes the components starting from the address of x
+    }
+
+    inline T& operator[](size_t i) {
+        assert(i < SIZE);
+        return (&x)[i];
+    }
+
+    // -----------------------------------------------------------------------
+    // we want the compiler generated versions for these...
+    TVec2(const TVec2&) = default;
+    ~TVec2() = default;
+    TVec2& operator = (const TVec2&) = default;
+
+    // constructors
+
+    // leaves object uninitialized. use with caution.
+    explicit
+    constexpr TVec2(no_init) { }
+
+    // default constructor
+    constexpr TVec2() : x(0), y(0) { }
+
+    // handles implicit conversion from an arithmetic scalar to a TVec2. must not be explicit.
+    template<typename A, typename = typename std::enable_if<std::is_arithmetic<A>::value >::type>
+    constexpr TVec2(A v) : x(v), y(v) { }
+
+    template<typename A, typename B>
+    constexpr TVec2(A x, B y) : x(static_cast<T>(x)), y(static_cast<T>(y)) { }
+
+    template<typename A>
+    explicit
+    constexpr TVec2(const TVec2<A>& v) : x(v.x), y(v.y) { }
+
+    // cross product works only on vectors of size 2 or 3; for size 2 the result is a scalar
+    template<typename RT>
+    friend inline
+    constexpr value_type cross(const TVec2& u, const TVec2<RT>& v) {
+        return value_type(u.x*v.y - u.y*v.x);
+    }
+};
+
+
+template <typename T>
+class TVec3 :   public TVecProductOperators<TVec3, T>,
+                public TVecAddOperators<TVec3, T>,
+                public TVecUnaryOperators<TVec3, T>,
+                public TVecComparisonOperators<TVec3, T>,
+                public TVecFunctions<TVec3, T>,
+                public TVecDebug<TVec3, T> {
+public:
+    enum no_init { NO_INIT }; // tag for the constructor that leaves members unset
+    typedef T value_type;
+    typedef T& reference;
+    typedef T const& const_reference;
+    typedef size_t size_type;
+
+    union { // component names plus TVec2 views, all sharing the same storage
+        struct { T x, y, z; };
+        struct { T s, t, p; };
+        struct { T r, g, b; };
+        TVec2<T> xy;
+        TVec2<T> st;
+        TVec2<T> rg;
+    };
+
+    static constexpr size_t SIZE = 3;
+    inline constexpr size_type size() const { return SIZE; }
+
+    // array access
+    inline constexpr T const& operator[](size_t i) const {
+#if __cplusplus >= 201402L
+        // assert inside a constexpr function is only possible from C++14 on
+        assert(i < SIZE);
+#endif
+        return (&x)[i]; // indexes the components starting from the address of x
+    }
+
+    inline T& operator[](size_t i) {
+        assert(i < SIZE);
+        return (&x)[i];
+    }
+
+    // -----------------------------------------------------------------------
+    // we want the compiler generated versions for these...
+    TVec3(const TVec3&) = default;
+    ~TVec3() = default;
+    TVec3& operator = (const TVec3&) = default;
+
+    // constructors
+    // leaves object uninitialized. use with caution.
+    explicit
+    constexpr TVec3(no_init) { }
+
+    // default constructor
+    constexpr TVec3() : x(0), y(0), z(0) { }
+
+    // handles implicit conversion from an arithmetic scalar to a TVec3. must not be explicit.
+    template<typename A, typename = typename std::enable_if<std::is_arithmetic<A>::value >::type>
+    constexpr TVec3(A v) : x(static_cast<T>(v)), y(static_cast<T>(v)), z(static_cast<T>(v)) { }
+
+    template<typename A, typename B, typename C>
+    constexpr TVec3(A x, B y, C z) : x(static_cast<T>(x)), y(static_cast<T>(y)), z(static_cast<T>(z)) { }
+
+    template<typename A, typename B>
+    constexpr TVec3(const TVec2<A>& v, B z) : x(v.x), y(v.y), z(static_cast<T>(z)) { }
+
+    template<typename A>
+    explicit
+    constexpr TVec3(const TVec3<A>& v) : x(v.x), y(v.y), z(v.z) { }
+
+    // cross product works only on vectors of size 3
+    template <typename RT>
+    friend inline
+    constexpr TVec3 cross(const TVec3& u, const TVec3<RT>& v) {
+        return TVec3(
+                u.y*v.z - u.z*v.y,
+                u.z*v.x - u.x*v.z,
+                u.x*v.y - u.y*v.x);
+    }
+};
+
+template <typename T>
+class  TVec4 :  public TVecProductOperators<TVec4, T>,
+                public TVecAddOperators<TVec4, T>,
+                public TVecUnaryOperators<TVec4, T>,
+                public TVecComparisonOperators<TVec4, T>,
+                public TVecFunctions<TVec4, T>,
+                public TVecDebug<TVec4, T> {
+public:
+    enum no_init { NO_INIT }; // tag for the constructor that leaves members unset
+    typedef T value_type;
+    typedef T& reference;
+    typedef T const& const_reference;
+    typedef size_t size_type;
+
+    union { // component names plus TVec2/TVec3 views, all sharing the same storage
+        struct { T x, y, z, w; };
+        struct { T s, t, p, q; };
+        struct { T r, g, b, a; };
+        TVec2<T> xy;
+        TVec2<T> st;
+        TVec2<T> rg;
+        TVec3<T> xyz;
+        TVec3<T> stp;
+        TVec3<T> rgb;
+    };
+
+    static constexpr size_t SIZE = 4;
+    inline constexpr size_type size() const { return SIZE; }
+
+    // array access
+    inline constexpr T const& operator[](size_t i) const {
+#if __cplusplus >= 201402L
+        // assert inside a constexpr function is only possible from C++14 on
+        assert(i < SIZE);
+#endif
+        return (&x)[i]; // indexes the components starting from the address of x
+    }
+
+    inline T& operator[](size_t i) {
+        assert(i < SIZE);
+        return (&x)[i];
+    }
+
+    // -----------------------------------------------------------------------
+    // we want the compiler generated versions for these...
+    TVec4(const TVec4&) = default;
+    ~TVec4() = default;
+    TVec4& operator = (const TVec4&) = default;
+
+    // constructors
+
+    // leaves object uninitialized. use with caution.
+    explicit
+    constexpr TVec4(no_init) { }
+
+    // default constructor
+    constexpr TVec4() : x(0), y(0), z(0), w(0) { }
+
+    // handles implicit conversion from an arithmetic scalar to a TVec4. must not be explicit.
+    template<typename A, typename = typename std::enable_if<std::is_arithmetic<A>::value >::type>
+    constexpr TVec4(A v) : x(v), y(v), z(v), w(v) { }
+
+    template<typename A, typename B, typename C, typename D>
+    constexpr TVec4(A x, B y, C z, D w) : x(x), y(y), z(z), w(w) { }
+
+    template<typename A, typename B, typename C>
+    constexpr TVec4(const TVec2<A>& v, B z, C w) : x(v.x), y(v.y), z(z), w(w) { }
+
+    template<typename A, typename B>
+    constexpr TVec4(const TVec3<A>& v, B w) : x(v.x), y(v.y), z(v.z), w(w) { }
+
+    template<typename A>
+    explicit
+    constexpr TVec4(const TVec4<A>& v) : x(v.x), y(v.y), z(v.z), w(v.w) { }
+};
+
+
+namespace matrix {
+
+CONSTEXPR float PURE transpose(float x) { return x; }
+// ----------------------------------------------------------------------------------------
+
+/* FIXME: this should go into TMatSquareFunctions<> but for some reason
+ * BASE<T>::col_type is not accessible from there (???)
+ */
+
+// transpose. this handles matrices of matrices
+template <typename MATRIX>
+CONSTEXPR MATRIX PURE transpose(const MATRIX& m) {
+    // for now we only handle square matrix transpose
+    static_assert(MATRIX::NUM_COLS == MATRIX::NUM_ROWS, "transpose only supports square matrices");
+    MATRIX result(MATRIX::NO_INIT);
+    for (size_t col = 0; col < MATRIX::NUM_COLS; ++col) {
+        for (size_t row = 0; row < MATRIX::NUM_ROWS; ++row) {
+            result[col][row] = transpose(m[row][col]);
+        }
+    }
+    return result;
+}
+
+
+template<typename MATRIX_R, typename MATRIX_A, typename MATRIX_B>
+CONSTEXPR MATRIX_R PURE multiply(const MATRIX_A& lhs, const MATRIX_B& rhs) {
+    // prerequisites (enforced by the static_asserts below):
+    //  lhs : D columns, R rows
+    //  rhs : C columns, D rows
+    //  res : C columns, R rows
+
+    static_assert(MATRIX_A::NUM_COLS == MATRIX_B::NUM_ROWS,
+            "matrices can't be multiplied. invalid dimensions.");
+    static_assert(MATRIX_R::NUM_COLS == MATRIX_B::NUM_COLS,
+            "invalid dimension of matrix multiply result.");
+    static_assert(MATRIX_R::NUM_ROWS == MATRIX_A::NUM_ROWS,
+            "invalid dimension of matrix multiply result.");
+
+    MATRIX_R res(MATRIX_R::NO_INIT);  // every column is assigned in the loop below
+    for (size_t col = 0; col < MATRIX_R::NUM_COLS; ++col) {
+        res[col] = lhs * rhs[col];  // relies on an operator* taking (MATRIX_A, element of rhs); result becomes column 'col'
+    }
+    return res;
+}
+
+// trace: sum over the main diagonal of a square matrix. each diagonal element goes through trace() itself (matrices of matrices)
+template <typename MATRIX>
+CONSTEXPR typename MATRIX::value_type PURE trace(const MATRIX& m) {
+    static_assert(MATRIX::NUM_COLS == MATRIX::NUM_ROWS, "trace only defined for square matrices");
+    typename MATRIX::value_type result(0);
+    for (size_t col = 0; col < MATRIX::NUM_COLS; ++col) {
+        result += trace(m[col][col]);  // NOTE(review): needs a trace() overload for the element type; none for scalars is visible here - confirm
+    }
+    return result;
+}
+
+// diag. returns the main diagonal of a square matrix as a col_type; elements are copied as-is (no recursion into them)
+template <typename MATRIX>
+CONSTEXPR typename MATRIX::col_type PURE diag(const MATRIX& m) {
+    static_assert(MATRIX::NUM_COLS == MATRIX::NUM_ROWS, "diag only defined for square matrices");
+    typename MATRIX::col_type result(MATRIX::col_type::NO_INIT);  // every component is assigned in the loop below
+    for (size_t col = 0; col < MATRIX::NUM_COLS; ++col) {
+        result[col] = m[col][col];
+    }
+    return result;
+}
+
+}
+
+/*
+ * TMatSquareFunctions implements functions on a matrix of type BASE<T>.
+ *
+ * BASE only needs to implement:
+ *  - operator[]
+ *  - col_type
+ *  - row_type
+ *  - COL_SIZE
+ *  - ROW_SIZE
+ *
+ * By simply inheriting from TMatSquareFunctions<BASE, T> BASE will automatically
+ * get all the functionality here.
+ */
+
+
+template<template<typename U> class BASE, typename T>
+class TMatSquareFunctions {
+public:
+
+    /*
+     * NOTE: the functions below ARE NOT member methods. They are friend functions
+     * with their definition inlined with their declaration. This makes these
+     * template functions available to the compiler when (and only when) this class
+     * is instantiated, at which point they're only templated on the 2nd parameter
+     * (the first one, BASE<T> being known).
+     */
+    friend inline constexpr BASE<T> PURE transpose(const BASE<T>& m) {
+        return matrix::transpose(m);  // forwards to the generic implementation in namespace matrix
+    }
+    friend inline constexpr T PURE trace(const BASE<T>& m) {
+        return matrix::trace(m);  // forwards to the generic implementation in namespace matrix
+    }
+};
+
+template <template<typename T> class BASE, typename T>
+class TMatProductOperators {
+public:
+    // multiply by a scalar (in place, column by column)
+    BASE<T>& operator *= (T v) {
+        BASE<T>& lhs(static_cast< BASE<T>& >(*this));  // view *this as the derived matrix type
+        for (size_t col = 0; col < BASE<T>::NUM_COLS; ++col) {
+            lhs[col] *= v;
+        }
+        return lhs;
+    }
+
+    //  matrix *= matrix (note: returns a const reference, unlike the scalar operators)
+    template<typename U>
+    const BASE<T>& operator *= (const BASE<U>& rhs) {
+        BASE<T>& lhs(static_cast< BASE<T>& >(*this));
+        lhs = matrix::multiply<BASE<T> >(lhs, rhs);  // the product is built in a temporary, then assigned back
+        return lhs;
+    }
+
+    // divide by a scalar (in place, column by column)
+    BASE<T>& operator /= (T v) {
+        BASE<T>& lhs(static_cast< BASE<T>& >(*this));
+        for (size_t col = 0; col < BASE<T>::NUM_COLS; ++col) {
+            lhs[col] /= v;
+        }
+        return lhs;
+    }
+
+    // matrix * matrix, result is a matrix of the same type as the lhs matrix
+    template<typename U>
+    friend CONSTEXPR BASE<T> PURE operator *(const BASE<T>& lhs, const BASE<U>& rhs) {
+        return matrix::multiply<BASE<T> >(lhs, rhs);
+    }
+};
+
+template<template<typename U> class BASE, typename T>
+class TMatHelpers {
+public:
+    constexpr inline size_t getColumnSize() const   { return BASE<T>::COL_SIZE; }
+    constexpr inline size_t getRowSize() const      { return BASE<T>::ROW_SIZE; }
+    constexpr inline size_t getColumnCount() const  { return BASE<T>::NUM_COLS; }
+    constexpr inline size_t getRowCount() const     { return BASE<T>::NUM_ROWS; }
+    constexpr inline size_t size()  const           { return BASE<T>::ROW_SIZE; }  // for TVec*<>
+
+    // array access: pointer to element [0][0], i.e. the first element of column 0
+    constexpr T const* asArray() const {
+        return &static_cast<BASE<T> const &>(*this)[0][0];
+    }
+
+    // element access: arguments are (row, col) while the underlying storage is indexed [col][row]
+    inline constexpr T const& operator()(size_t row, size_t col) const {
+        return static_cast<BASE<T> const &>(*this)[col][row];
+    }
+
+    inline T& operator()(size_t row, size_t col) {
+        return static_cast<BASE<T>&>(*this)[col][row];
+    }
+
+    template <typename VEC>
+    static CONSTEXPR BASE<T> translate(const VEC& t) {
+        BASE<T> r;  // default-constructed matrix (identity in the case of TMat44)
+        r[BASE<T>::NUM_COLS-1] = t;  // the translation vector replaces the last column
+        return r;
+    }
+
+    template <typename VEC>
+    static constexpr BASE<T> scale(const VEC& s) {
+        return BASE<T>(s);  // delegates to BASE<T>'s constructor taking s
+    }
+
+    friend inline CONSTEXPR BASE<T> PURE abs(BASE<T> m) {
+        for (size_t col = 0; col < BASE<T>::NUM_COLS; ++col) {
+            m[col] = abs(m[col]);  // column-wise abs; m is a by-value copy, so the caller's matrix is untouched
+        }
+        return m;
+    }
+};
+
+// functions for 3x3 and 4x4 matrices
+template<template<typename U> class BASE, typename T>
+class TMatTransform {
+public:
+    inline constexpr TMatTransform() {  // the body is a compile-time size check only
+        static_assert(BASE<T>::NUM_ROWS == 3 || BASE<T>::NUM_ROWS == 4, "3x3 or 4x4 matrices only");
+    }
+
+    template <typename A, typename VEC>
+    static CONSTEXPR BASE<T> rotate(A radian, const VEC& about) {
+        BASE<T> r;  // starts from BASE<T>'s default value; only entries of the upper-left 3x3 block are overwritten
+        T c = std::cos(radian);
+        T s = std::sin(radian);
+        if (about.x == 1 && about.y == 0 && about.z == 0) {  // fast path: axis is exactly (1, 0, 0)
+            r[1][1] = c;   r[2][2] = c;
+            r[1][2] = s;   r[2][1] = -s;
+        } else if (about.x == 0 && about.y == 1 && about.z == 0) {  // fast path: axis is exactly (0, 1, 0)
+            r[0][0] = c;   r[2][2] = c;
+            r[2][0] = s;   r[0][2] = -s;
+        } else if (about.x == 0 && about.y == 0 && about.z == 1) {  // fast path: axis is exactly (0, 0, 1)
+            r[0][0] = c;   r[1][1] = c;
+            r[0][1] = s;   r[1][0] = -s;
+        } else {
+            VEC nabout = normalize(about);  // general axis: normalized before use
+            typename VEC::value_type x = nabout.x;
+            typename VEC::value_type y = nabout.y;
+            typename VEC::value_type z = nabout.z;
+            T nc = 1 - c;
+            T xy = x * y;
+            T yz = y * z;
+            T zx = z * x;
+            T xs = x * s;
+            T ys = y * s;
+            T zs = z * s;
+            r[0][0] = x*x*nc +  c;    r[1][0] =  xy*nc - zs;    r[2][0] =  zx*nc + ys;
+            r[0][1] =  xy*nc + zs;    r[1][1] = y*y*nc +  c;    r[2][1] =  yz*nc - xs;
+            r[0][2] =  zx*nc - ys;    r[1][2] =  yz*nc + xs;    r[2][2] = z*z*nc +  c;
+
+            // Clamp the 3x3 block to [-1, 1]. Only this general branch clamps; the fast paths above do not.
+            for (size_t col = 0; col < 3; ++col) {
+                for (size_t row = 0; row < 3; ++row) {
+                    r[col][row] = std::min(std::max(r[col][row], T(-1)), T(1));
+                }
+            }
+        }
+        return r;
+    }
+
+    /**
+     * Create a matrix from euler angles using YPR around YXZ respectively
+     * @param yaw about Y axis
+     * @param pitch about X axis
+     * @param roll about Z axis
+     */
+    template <
+        typename Y, typename P, typename R,
+        typename = typename std::enable_if<std::is_arithmetic<Y>::value >::type,
+        typename = typename std::enable_if<std::is_arithmetic<P>::value >::type,
+        typename = typename std::enable_if<std::is_arithmetic<R>::value >::type
+    >
+    static CONSTEXPR BASE<T> eulerYXZ(Y yaw, P pitch, R roll) {
+        return eulerZYX(roll, pitch, yaw);  // note: roll and yaw are swapped relative to eulerZYX's (yaw, pitch, roll) parameters
+    }
+
+    /**
+     * Create a matrix from euler angles using YPR around ZYX respectively
+     * @param yaw about Z axis
+     * @param pitch about Y axis
+     * @param roll about X axis
+     *
+     * The euler angles are applied in ZYX order, i.e. a vector is first rotated
+     * about X (roll) then Y (pitch) and then Z (yaw).
+     */
+    template <
+    typename Y, typename P, typename R,
+    typename = typename std::enable_if<std::is_arithmetic<Y>::value >::type,
+    typename = typename std::enable_if<std::is_arithmetic<P>::value >::type,
+    typename = typename std::enable_if<std::is_arithmetic<R>::value >::type
+    >
+    static CONSTEXPR BASE<T> eulerZYX(Y yaw, P pitch, R roll) {
+        BASE<T> r;  // starts from BASE<T>'s default value; only the upper-left 3x3 block is overwritten
+        T cy = std::cos(yaw);
+        T sy = std::sin(yaw);
+        T cp = std::cos(pitch);
+        T sp = std::sin(pitch);
+        T cr = std::cos(roll);
+        T sr = std::sin(roll);
+        T cc = cr * cy;
+        T cs = cr * sy;
+        T sc = sr * cy;
+        T ss = sr * sy;
+        r[0][0] = cp * cy;
+        r[0][1] = cp * sy;
+        r[0][2] = -sp;
+        r[1][0] = sp * sc - cs;
+        r[1][1] = sp * ss + cc;
+        r[1][2] = cp * sr;
+        r[2][0] = sp * cc + ss;
+        r[2][1] = sp * cs - sc;
+        r[2][2] = cp * cr;
+
+        // Clamp each entry of the 3x3 block to [-1, 1].
+        for (size_t col = 0; col < 3; ++col) {
+            for (size_t row = 0; row < 3; ++row) {
+                r[col][row] = std::min(std::max(r[col][row], T(-1)), T(1));
+            }
+        }
+        return r;
+    }
+};
+
+
+template <template<typename T> class BASE, typename T>
+class TMatDebug {
+public:
+    friend std::ostream& operator<<(std::ostream& stream, const BASE<T>& m) {  // prints one matrix row per line, framed by / | \ on the left and \ | / on the right
+        for (size_t row = 0; row < BASE<T>::NUM_ROWS; ++row) {
+            if (row != 0) {
+                stream << std::endl;  // line break between rows; nothing is emitted after the last row
+            }
+            if (row == 0) {
+                stream << "/ ";
+            } else if (row == BASE<T>::NUM_ROWS-1) {
+                stream << "\\ ";
+            } else {
+                stream << "| ";
+            }
+            for (size_t col = 0; col < BASE<T>::NUM_COLS; ++col) {
+                stream << std::setw(10) << std::to_string(m[col][row]);  // storage is [col][row]; each value is padded to width 10
+            }
+            if (row == 0) {
+                stream << " \\";
+            } else if (row == BASE<T>::NUM_ROWS-1) {
+                stream << " /";
+            } else {
+                stream << " |";
+            }
+        }
+        return stream;
+    }
+};
+
+template <typename T>
+class TMat44 :  public TVecUnaryOperators<TMat44, T>,
+                public TVecComparisonOperators<TMat44, T>,
+                public TVecAddOperators<TMat44, T>,
+                public TMatProductOperators<TMat44, T>,
+                public TMatSquareFunctions<TMat44, T>,
+                public TMatTransform<TMat44, T>,
+                public TMatHelpers<TMat44, T>,
+                public TMatDebug<TMat44, T> {
+public:
+    enum no_init { NO_INIT };
+    typedef T value_type;
+    typedef T& reference;
+    typedef T const& const_reference;
+    typedef size_t size_type;
+    typedef TVec4<T> col_type;
+    typedef TVec4<T> row_type;
+
+    static constexpr size_t COL_SIZE = col_type::SIZE;  // size of a column (i.e.: number of rows)
+    static constexpr size_t ROW_SIZE = row_type::SIZE;  // size of a row (i.e.: number of columns)
+    static constexpr size_t NUM_ROWS = COL_SIZE;
+    static constexpr size_t NUM_COLS = ROW_SIZE;
+
+private:
+    /*
+     *  <--  N columns  -->
+     *
+     *  a[0][0] a[1][0] a[2][0] ... a[N][0]    ^
+     *  a[0][1] a[1][1] a[2][1] ... a[N][1]    |
+     *  a[0][2] a[1][2] a[2][2] ... a[N][2]  M rows
+     *  ...                                    |
+     *  a[0][M] a[1][M] a[2][M] ... a[N][M]    v
+     *
+     *  COL_SIZE = M
+     *  ROW_SIZE = N
+     *  m[0] = [ a[0][0] a[0][1] a[0][2] ... a[0][M] ]
+     */
+
+    col_type m_value[NUM_COLS];
+
+public:
+    // array access
+    inline constexpr col_type const& operator[](size_t column) const {
+#if __cplusplus >= 201402L
+        // only possible from C++14 on: a C++11 constexpr function body cannot contain this extra statement
+        assert(column < NUM_COLS);
+#endif
+        return m_value[column];
+    }
+
+    inline col_type& operator[](size_t column) {  // mutable access to one column
+        assert(column < NUM_COLS);  // bounds check; active only when assert() is enabled
+        return m_value[column];
+    }
+
+    // -----------------------------------------------------------------------
+    // we want the compiler generated versions for these...
+    TMat44(const TMat44&) = default;
+    ~TMat44() = default;
+    TMat44& operator = (const TMat44&) = default;
+
+    /*
+     *  constructors
+     */
+
+    // leaves object uninitialized. use with caution. each column is built with col_type's own NO_INIT constructor.
+    explicit constexpr TMat44(no_init)
+            : m_value{ col_type(col_type::NO_INIT),
+                       col_type(col_type::NO_INIT),
+                       col_type(col_type::NO_INIT),
+                       col_type(col_type::NO_INIT) } {}
+
+    /** initialize to identity.
+     *
+     *      \f$
+     *      \left(
+     *      \begin{array}{cccc}
+     *      1 & 0 & 0 & 0 \\
+     *      0 & 1 & 0 & 0 \\
+     *      0 & 0 & 1 & 0 \\
+     *      0 & 0 & 0 & 1 \\
+     *      \end{array}
+     *      \right)
+     *      \f$
+     */
+    CONSTEXPR TMat44();
+
+    /** initialize to Identity*scalar.
+     *
+     *      \f$
+     *      \left(
+     *      \begin{array}{cccc}
+     *      v & 0 & 0 & 0 \\
+     *      0 & v & 0 & 0 \\
+     *      0 & 0 & v & 0 \\
+     *      0 & 0 & 0 & v \\
+     *      \end{array}
+     *      \right)
+     *      \f$
+     */
+    template<typename U>
+    explicit CONSTEXPR TMat44(U v);
+
+    /** sets the diagonal to a vector.
+     *
+     *      \f$
+     *      \left(
+     *      \begin{array}{cccc}
+     *      v[0] & 0 & 0 & 0 \\
+     *      0 & v[1] & 0 & 0 \\
+     *      0 & 0 & v[2] & 0 \\
+     *      0 & 0 & 0 & v[3] \\
+     *      \end{array}
+     *      \right)
+     *      \f$
+     */
+    template <typename U>
+    explicit CONSTEXPR TMat44(const TVec4<U>& v);
+
+    // construct from another matrix of the same size
+    template <typename U>
+    explicit CONSTEXPR TMat44(const TMat44<U>& rhs);
+
+    /** construct from 4 column vectors.
+     *
+     *      \f$
+     *      \left(
+     *      \begin{array}{cccc}
+     *      v0 & v1 & v2 & v3 \\
+     *      \end{array}
+     *      \right)
+     *      \f$
+     */
+    template <typename A, typename B, typename C, typename D>
+    CONSTEXPR TMat44(const TVec4<A>& v0, const TVec4<B>& v1, const TVec4<C>& v2, const TVec4<D>& v3);
+
+    /** construct from 16 elements in column-major form.
+     *
+     *      \f$
+     *      \left(
+     *      \begin{array}{cccc}
+     *      m[0][0] & m[1][0] & m[2][0] & m[3][0] \\
+     *      m[0][1] & m[1][1] & m[2][1] & m[3][1] \\
+     *      m[0][2] & m[1][2] & m[2][2] & m[3][2] \\
+     *      m[0][3] & m[1][3] & m[2][3] & m[3][3] \\
+     *      \end{array}
+     *      \right)
+     *      \f$
+     */
+    template <
+        typename A, typename B, typename C, typename D,
+        typename E, typename F, typename G, typename H,
+        typename I, typename J, typename K, typename L,
+        typename M, typename N, typename O, typename P>
+    CONSTEXPR TMat44(
+            A m00, B m01, C m02, D m03,
+            E m10, F m11, G m12, H m13,
+            I m20, J m21, K m22, L m23,
+            M m30, N m31, O m32, P m33);
+
+
+    /**
+     * construct from a C array in column major form.
+     */
+    template <typename U>
+    explicit CONSTEXPR TMat44(U const* rawArray);
+
+    /*
+     *  helpers
+     */
+
+    static CONSTEXPR TMat44 ortho(T left, T right, T bottom, T top, T near, T far);
+
+    static CONSTEXPR TMat44 frustum(T left, T right, T bottom, T top, T near, T far);
+
+    enum class Fov {
+        HORIZONTAL,
+        VERTICAL
+    };
+    static CONSTEXPR TMat44 perspective(T fov, T aspect, T near, T far, Fov direction = Fov::VERTICAL);
+
+    template <typename A, typename B, typename C>
+    static CONSTEXPR TMat44 lookAt(const TVec3<A>& eye, const TVec3<B>& center, const TVec3<C>& up);
+
+    template <typename A>
+    static CONSTEXPR TVec3<A> project(const TMat44& projectionMatrix, TVec3<A> vertice) {
+        TVec4<A> r = projectionMatrix * TVec4<A>{ vertice, 1 };  // extend the point with w = 1 before transforming
+        return r.xyz / r.w;  // divide by the transformed w; no guard against w == 0
+    }
+
+    template <typename A>
+    static CONSTEXPR TVec4<A> project(const TMat44& projectionMatrix, TVec4<A> vertice) {
+        vertice = projectionMatrix * vertice;  // vertice is a by-value copy, so it is reused as scratch
+        return { vertice.xyz / vertice.w, 1 };  // divide xyz by w and return with w reset to 1; no guard against w == 0
+    }
+};
+
+// ----------------------------------------------------------------------------------------
+// Constructors
+// ----------------------------------------------------------------------------------------
+
+// Since the matrix code could become pretty big quickly, we don't inline most
+// operations.
+
+template <typename T>
+CONSTEXPR TMat44<T>::TMat44() {  // identity: ones on the diagonal, zeros elsewhere
+    m_value[0] = col_type(1, 0, 0, 0);
+    m_value[1] = col_type(0, 1, 0, 0);
+    m_value[2] = col_type(0, 0, 1, 0);
+    m_value[3] = col_type(0, 0, 0, 1);
+}
+
+template <typename T>
+template <typename U>
+CONSTEXPR TMat44<T>::TMat44(U v) {  // v on the diagonal, zeros elsewhere (identity scaled by v)
+    m_value[0] = col_type(v, 0, 0, 0);
+    m_value[1] = col_type(0, v, 0, 0);
+    m_value[2] = col_type(0, 0, v, 0);
+    m_value[3] = col_type(0, 0, 0, v);
+}
+
+template<typename T>
+template<typename U>
+CONSTEXPR TMat44<T>::TMat44(const TVec4<U>& v) {  // diagonal matrix: (v.x, v.y, v.z, v.w) on the diagonal, zeros elsewhere
+    m_value[0] = col_type(v.x, 0, 0, 0);
+    m_value[1] = col_type(0, v.y, 0, 0);
+    m_value[2] = col_type(0, 0, v.z, 0);
+    m_value[3] = col_type(0, 0, 0, v.w);
+}
+
+// construct from 16 scalars; each group of four arguments (mN0..mN3) forms column N
+template<typename T>
+template <
+    typename A, typename B, typename C, typename D,
+    typename E, typename F, typename G, typename H,
+    typename I, typename J, typename K, typename L,
+    typename M, typename N, typename O, typename P>
+CONSTEXPR TMat44<T>::TMat44(
+        A m00, B m01, C m02, D m03,
+        E m10, F m11, G m12, H m13,
+        I m20, J m21, K m22, L m23,
+        M m30, N m31, O m32, P m33) {
+    m_value[0] = col_type(m00, m01, m02, m03);
+    m_value[1] = col_type(m10, m11, m12, m13);
+    m_value[2] = col_type(m20, m21, m22, m23);
+    m_value[3] = col_type(m30, m31, m32, m33);
+}
+
+template <typename T>
+template <typename U>
+CONSTEXPR TMat44<T>::TMat44(const TMat44<U>& rhs) {  // converting constructor from a matrix with another element type
+    for (size_t col = 0; col < NUM_COLS; ++col) {
+        m_value[col] = col_type(rhs[col]);  // explicit per-column conversion from TVec4<U> to TVec4<T>
+    }
+}
+
+// Construct from 4 column vectors; each one is converted to col_type and stored as column 0..3 in argument order.
+template <typename T>
+template <typename A, typename B, typename C, typename D>
+CONSTEXPR TMat44<T>::TMat44(
+        const TVec4<A>& v0, const TVec4<B>& v1,
+        const TVec4<C>& v2, const TVec4<D>& v3) {
+    m_value[0] = col_type(v0);
+    m_value[1] = col_type(v1);
+    m_value[2] = col_type(v2);
+    m_value[3] = col_type(v3);
+}
+
+// Construct from raw array, in column-major form. Reads NUM_COLS * NUM_ROWS consecutive elements starting at rawArray.
+template <typename T>
+template <typename U>
+CONSTEXPR TMat44<T>::TMat44(U const* rawArray) {
+    for (size_t col = 0; col < NUM_COLS; ++col) {
+        for (size_t row = 0; row < NUM_ROWS; ++row) {
+            m_value[col][row] = *rawArray++;  // the row index varies fastest, so a whole column is consumed before the next
+        }
+    }
+}
+// ----------------------------------------------------------------------------------------
+// Helpers
+// ----------------------------------------------------------------------------------------
+
+template <typename T>
+CONSTEXPR TMat44<T> TMat44<T>::ortho(T left, T right, T bottom, T top, T near, T far) {
+    TMat44<T> m;  // identity; only the entries that differ from it are set below
+    m[0][0] =  2 / (right - left);
+    m[1][1] =  2 / (top   - bottom);
+    m[2][2] = -2 / (far   - near);
+    m[3][0] = -(right + left)   / (right - left);  // column 3 holds the translation terms
+    m[3][1] = -(top   + bottom) / (top   - bottom);
+    m[3][2] = -(far   + near)   / (far   - near);
+    return m;
+}
+
+template <typename T>
+CONSTEXPR TMat44<T> TMat44<T>::frustum(T left, T right, T bottom, T top, T near, T far) {
+    TMat44<T> m;  // identity; only the entries that differ from it are set below
+    m[0][0] =  (2 * near) / (right - left);
+    m[1][1] =  (2 * near) / (top   - bottom);
+    m[2][0] =  (right + left)   / (right - left);
+    m[2][1] =  (top   + bottom) / (top   - bottom);
+    m[2][2] = -(far   + near)   / (far   - near);
+    m[2][3] = -1;  // makes the output w equal to -z of the input
+    m[3][2] = -(2 * far * near) / (far   - near);
+    m[3][3] =  0;  // overrides the 1 left there by the identity default
+    return m;
+}
+
+template <typename T>
+CONSTEXPR TMat44<T> TMat44<T>::perspective(T fov, T aspect, T near, T far, TMat44::Fov direction) {
+    T h;  // half-height of the near plane
+    T w;  // half-width of the near plane
+
+    if (direction == TMat44::Fov::VERTICAL) {
+        h = std::tan(fov * M_PI / 360.0f) * near;  // fov is in degrees: fov * pi / 360 is half the angle in radians
+        w = h * aspect;
+    } else {
+        w = std::tan(fov * M_PI / 360.0f) * near;  // same formula, with fov taken as the horizontal angle
+        h = w / aspect;
+    }
+    return frustum(-w, w, -h, h, near, far);  // symmetric frustum around the view axis
+}
+
+/*
+ * Returns a matrix representing the pose of a virtual camera looking towards -Z in its
+ * local Y-up coordinate system. "eye" is where the camera is located, "center" is the point it is
+ * looking at and "up" defines where the Y axis of the camera's local coordinate system is.
+ */
+template <typename T>
+template <typename A, typename B, typename C>
+CONSTEXPR TMat44<T> TMat44<T>::lookAt(const TVec3<A>& eye, const TVec3<B>& center, const TVec3<C>& up) {
+    TVec3<T> z_axis(normalize(center - eye));  // unit view direction; stored negated as column 2 below
+    TVec3<T> norm_up(normalize(up));
+    if (std::abs(dot(z_axis, norm_up)) > 0.999) {
+        // Fix up vector if we're degenerate (looking straight up, basically)
+        norm_up = { norm_up.z, norm_up.x, norm_up.y };  // rotate the components to get a vector not parallel to z_axis
+    }
+    TVec3<T> x_axis(normalize(cross(z_axis, norm_up)));
+    TVec3<T> y_axis(cross(x_axis, z_axis));
+    return TMat44<T>(
+            TVec4<T>(x_axis, 0),
+            TVec4<T>(y_axis, 0),
+            TVec4<T>(-z_axis, 0),
+            TVec4<T>(eye, 1));
+}
+
+// ----------------------------------------------------------------------------------------
+// Arithmetic operators outside of class
+// ----------------------------------------------------------------------------------------
+
+/* We use non-friend functions here to prevent the compiler from using
+ * implicit conversions, for instance of a scalar to a vector. The result would
+ * not be what the caller expects.
+ *
+ * Also note that the order of the arguments in the inner loop is important since
+ * it determines the output type (only relevant when T != U).
+ */
+
+// matrix * column-vector, result is a TMat44<T>::col_type, i.e. it takes the matrix's element type T, not the vector's U
+template <typename T, typename U>
+CONSTEXPR typename TMat44<T>::col_type PURE operator *(const TMat44<T>& lhs, const TVec4<U>& rhs) {
+    // Result is initialized to zero.
+    typename TMat44<T>::col_type result;
+    for (size_t col = 0; col < TMat44<T>::NUM_COLS; ++col) {
+        result += lhs[col] * rhs[col];  // accumulate column 'col' scaled by component 'col' of the vector
+    }
+    return result;
+}
+
+// mat44 * vec3, computed as mat44 * {vec3, 1}; the result is the full 4-component col_type, it is not truncated to a vec3
+template <typename T, typename U>
+CONSTEXPR typename TMat44<T>::col_type PURE operator *(const TMat44<T>& lhs, const TVec3<U>& rhs) {
+    return lhs * TVec4<U>{ rhs, 1 };
+}
+
+
+// row-vector * matrix, result is a vector of the same type as the input vector
+template <typename T, typename U>
+CONSTEXPR typename TMat44<U>::row_type PURE operator *(const TVec4<U>& lhs, const TMat44<T>& rhs) {
+    typename TMat44<U>::row_type result(TMat44<U>::row_type::NO_INIT);  // every component is assigned in the loop below
+    for (size_t col = 0; col < TMat44<T>::NUM_COLS; ++col) {
+        result[col] = dot(lhs, rhs[col]);
+    }
+    return result;
+}
+
+// matrix * scalar, result is a matrix of the same type as the input matrix; only enabled for arithmetic U
+template <typename T, typename U>
+constexpr typename std::enable_if<std::is_arithmetic<U>::value, TMat44<T>>::type PURE
+operator *(TMat44<T> lhs, U rhs) {
+    return lhs *= rhs;  // lhs is a by-value copy, so the caller's matrix is not modified
+}
+
+// scalar * matrix, result is a matrix of the same type as the input matrix; only enabled for arithmetic U
+template <typename T, typename U>
+constexpr typename std::enable_if<std::is_arithmetic<U>::value, TMat44<T>>::type PURE
+operator *(U lhs, const TMat44<T>& rhs) {
+    return rhs * lhs;  // forwards to matrix * scalar with the operands swapped
+}
+
+
+template<typename T>
+typename TMat44<T>::col_type PURE diag(const TMat44<T>& m) {  // unlike its neighbours this wrapper is not marked CONSTEXPR
+    return matrix::diag(m);  // forwards to the generic implementation in namespace matrix
+}
+
+
+}  // namespace details
+
+typedef details::TVec4<float> float4;
+typedef details::TMat44<float> mat4;
+
+int main(int, char**) {  // exercises the matrix helpers above on a runtime value so the work cannot be constant-folded
+	mat4 mClientColorMatrix(1.f * random());  // diagonal matrix holding a random scalar
+	float4 lastRow(transpose(mClientColorMatrix)[3]);  // column 3 of the transpose, i.e. row 3 of the original
+	if (any(greaterThan(abs(lastRow - float4{0.f, 0.f, 0.f, 1.f}), float4{1e-4f}))) {
+		printf("The color transform's last row must be (0, 0, 0, 1)\n");
+	} else {
+		printf("Hm.\n");
+	}
+}
\ No newline at end of file
diff --git a/compiler-rt/test/hwasan/TestCases/use-after-scope-conversion.cpp b/compiler-rt/test/hwasan/TestCases/use-after-scope-conversion.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/use-after-scope-conversion.cpp
@@ -0,0 +1,53 @@
+// This is the ASAN test of the same name ported to HWAsan.
+
+// RUN: %clangxx_hwasan -mllvm -hwasan-use-after-scope -O0 %s -o %t
+
+// RUN: not %run %t 'A' 2>&1 | FileCheck %s
+// RUN: not %run %t 'B' 2>&1 | FileCheck %s
+
+// Missing lifetime markers in test_a
+// https://bugs.llvm.org/show_bug.cgi?id=34353
+// XFAIL: *
+
+struct B {
+  B() : p('B') {}
+  char p;
+};
+
+struct C {
+  const char *p; // non-owning; points at a char stored in another object
+  explicit C(const char *c) : p(c) {}
+  C(const B &b) : p(&b.p) {} // implicit on purpose: copy-initializing a C from a B is ill-formed if this is explicit
+};
+
+struct A {
+  char p;
+  A() : p('C') {}
+  const operator C() const { return C(&p); } // the returned C points at this object's own p member
+};
+
+volatile char r;
+void test_a() {
+  C s = A(); // converts a temporary A through A::operator C(); s.p points into that temporary
+  r = *s.p;  // read through s.p after the statement that created the temporary has finished
+}
+
+void test_b() {
+  C s = B(); // copy-initialization from a temporary B: this needs an implicit B -> C conversion to compile
+  r = *s.p;  // read through s.p after the statement that created the temporary has finished
+}
+
+int main(int argc, char **argv) {
+  switch (argv[1][0]) { // the first character of the first argument selects the scenario; argv[1] is assumed to exist
+  case 'A':
+    test_a();
+    return 0;
+  case 'B':
+    test_b();
+    return 0;
+  }
+  return 1; // any other selector
+}
+
+// CHECK: ERROR: HWAddressSanitizer: tag-mismatch
+// CHECK: Cause: stack tag-mismatch
diff --git a/compiler-rt/test/hwasan/TestCases/use-after-scope-dtor-order.cpp b/compiler-rt/test/hwasan/TestCases/use-after-scope-dtor-order.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/use-after-scope-dtor-order.cpp
@@ -0,0 +1,30 @@
+// This is the ASAN test of the same name ported to HWAsan.
+
+// RUN: %clangxx_hwasan -mllvm -hwasan-use-after-scope -O1 %s -o %t && \
+// RUN:     not %run %t 2>&1 | FileCheck %s
+
+// REQUIRES: aarch64-target-arch
+
+#include <stdio.h>
+
+struct IntHolder {
+  explicit IntHolder(int *val = 0) : val_(val) {}
+  __attribute__((noinline)) ~IntHolder() {
+    printf("Value: %d\n", *val_); // BOOM
+    // CHECK: ERROR: HWAddressSanitizer: tag-mismatch
+    // CHECK:  #0 0x{{.*}} in IntHolder::~IntHolder{{.*}}.cpp:[[@LINE-2]]
+  }
+  void set(int *val) { val_ = val; }
+  int *get() { return val_; }
+
+  int *val_; // non-owning; dereferenced by the destructor
+};
+
+int main(int argc, char *argv[]) {
+  // It is incorrect to use "x" in IntHolder destructor, because "x" is
+  // "destroyed" earlier as it's declared later.
+  IntHolder holder;
+  int x = argc;
+  holder.set(&x);
+  return 0;
+}
diff --git a/compiler-rt/test/hwasan/TestCases/use-after-scope-goto.cpp b/compiler-rt/test/hwasan/TestCases/use-after-scope-goto.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/use-after-scope-goto.cpp
@@ -0,0 +1,77 @@
+// This is the ASAN test of the same name ported to HWAsan.
+
+// RUN: %clangxx_hwasan -mllvm -hwasan-use-after-scope -O0 %s -o %t && %run %t
+
+// Function jumps over variable initialization making lifetime analysis
+// ambiguous. HWAsan should ignore such variables and the program must not fail.
+
+// REQUIRES: aarch64-target-arch
+
+#include <stdlib.h>
+
+int *ptr;
+
+void f1(int cond) {
+  if (cond)
+    goto label; // jumps past the declaration of tmp
+  int tmp;
+
+label:
+  ptr = &tmp;
+  *ptr = 5; // write to tmp on a path that skipped its declaration
+}
+
+void f2(int cond) {
+  switch (cond) {
+  case 1: {
+    ++cond;
+    int tmp;
+    ptr = &tmp;
+    exit(0); // terminates the whole process, not just this function
+  case 2: // entering here jumps past the declaration of tmp above
+    ptr = &tmp;
+    *ptr = 5;
+    exit(0);
+  }
+  }
+}
+
+void f3(int cond) {
+  {
+    int tmp;
+    goto l2; // leaves the block that declares tmp
+  l1:
+    ptr = &tmp;
+    *ptr = 5;
+
+    exit(0);
+  }
+l2:
+  goto l1; // jumps back into the block, past the declaration of tmp
+}
+
+void use(int *x) {
+  static int c = 10; // call budget: the 10th call exits the process
+  if (--c == 0)
+    exit(0);
+  (*x)++;
+}
+
+void f4() {
+  {
+    int x;
+  l2:
+    use(&x);
+    goto l1; // leaves the block that declares x
+  }
+l1:
+  goto l2; // re-enters the block past the declaration of x; the loop only ends when use() calls exit
+}
+
+int main() {
+  f1(1);
+  f2(1); // with cond == 1 this path calls exit(0), so the calls below are not reached
+  f3(1);
+  f4();
+  return 0;
+}
diff --git a/compiler-rt/test/hwasan/TestCases/use-after-scope-if.cpp b/compiler-rt/test/hwasan/TestCases/use-after-scope-if.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/use-after-scope-if.cpp
@@ -0,0 +1,20 @@
+// This is the ASAN test of the same name ported to HWAsan.
+
+// RUN: %clangxx_hwasan -mllvm -hwasan-use-after-scope -O1 %s -o %t && \
+// RUN:     not %run %t 2>&1 | FileCheck %s
+
+// REQUIRES: aarch64-target-arch
+
+int *p;
+bool b = true;
+
+int main() {
+  if (b) {
+    int x[5];
+    p = x + 1; // the global p keeps pointing into x after this block ends
+  }
+  return *p; // BOOM
+  // CHECK: ERROR: HWAddressSanitizer: tag-mismatch
+  // CHECK:  #0 0x{{.*}} in main {{.*}}.cpp:[[@LINE-2]]
+  // CHECK: Cause: stack tag-mismatch
+}
diff --git a/compiler-rt/test/hwasan/TestCases/use-after-scope-inlined.cpp b/compiler-rt/test/hwasan/TestCases/use-after-scope-inlined.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/use-after-scope-inlined.cpp
@@ -0,0 +1,25 @@
+// This is the ASAN test of the same name ported to HWAsan.
+
+// Test with "-O2" only to make sure inlining (leading to use-after-scope)
+// happens. "always_inline" is not enough, as Clang doesn't emit
+// llvm.lifetime intrinsics at -O0.
+//
+// RUN: %clangxx_hwasan -mllvm -hwasan-use-after-scope -O2 %s -o %t && \
+// RUN:     not %run %t 2>&1 | FileCheck %s
+
+// REQUIRES: aarch64-target-arch
+
+int *arr;
+__attribute__((always_inline)) void inlined(int arg) {
+  int x[5];
+  for (int i = 0; i < arg; i++)
+    x[i] = i;
+  arr = x; // publishes the address of the local array through the global arr
+}
+
+int main(int argc, char *argv[]) {
+  inlined(argc); // after this call arr still points at inlined()'s local array
+  return arr[argc - 1]; // BOOM
+  // CHECK: ERROR: HWAddressSanitizer: tag-mismatch
+  // CHECK: Cause: stack tag-mismatch
+}
diff --git a/compiler-rt/test/hwasan/TestCases/use-after-scope-loop-bug.cpp b/compiler-rt/test/hwasan/TestCases/use-after-scope-loop-bug.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/use-after-scope-loop-bug.cpp
@@ -0,0 +1,20 @@
+// This is the ASAN test of the same name ported to HWAsan.
+
+// RUN: %clangxx_hwasan -mllvm -hwasan-use-after-scope -O1 %s -o %t && \
+// RUN:     not %run %t 2>&1 | FileCheck %s
+
+// REQUIRES: aarch64-target-arch
+
+volatile int *p;
+
+int main() {
+  // Variable goes in and out of scope.
+  for (int i = 0; i < 3; ++i) {
+    int x[3] = {i, i, i};
+    p = x + i; // after the last iteration p still points into x
+  }
+  return *p; // BOOM
+  // CHECK: ERROR: HWAddressSanitizer: tag-mismatch
+  // CHECK:  #0 0x{{.*}} in main {{.*}}use-after-scope-loop-bug.cpp:[[@LINE-2]]
+  // CHECK: Cause: stack tag-mismatch
+}
diff --git a/compiler-rt/test/hwasan/TestCases/use-after-scope-loop-removed.cpp b/compiler-rt/test/hwasan/TestCases/use-after-scope-loop-removed.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/use-after-scope-loop-removed.cpp
@@ -0,0 +1,21 @@
+// This is the ASAN test of the same name ported to HWAsan.
+
+// RUN: %clangxx_hwasan -mllvm -hwasan-use-after-scope -O1 %s -o %t && \
+// RUN:     not %run %t 2>&1 | FileCheck %s
+
+// REQUIRES: aarch64-target-arch
+
+#include <stdlib.h>
+
+int *p;
+
+int main() {
+  for (int i = 0; i < 3; i++) {
+    int x;
+    p = &x; // after the last iteration p still points at x
+  }
+  return *p; // BOOM
+  // CHECK: ERROR: HWAddressSanitizer: tag-mismatch
+  // CHECK:  #0 0x{{.*}} in main {{.*}}use-after-scope-loop-removed.cpp:[[@LINE-2]]
+  // CHECK: Cause: stack tag-mismatch
+}
diff --git a/compiler-rt/test/hwasan/TestCases/use-after-scope-loop.cpp b/compiler-rt/test/hwasan/TestCases/use-after-scope-loop.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/use-after-scope-loop.cpp
@@ -0,0 +1,19 @@
+// This is the ASAN test of the same name ported to HWAsan.
+
+// RUN: %clangxx_hwasan -mllvm -hwasan-use-after-scope -O1 %s -o %t && \
+// RUN:     not %run %t 2>&1 | FileCheck %s
+
+// REQUIRES: aarch64-target-arch
+
+int *p[3];
+
+int main() {
+  for (int i = 0; i < 3; i++) {
+    int x;
+    p[i] = &x; // every slot of p ends up holding the address of a loop-local x
+  }
+  return **p; // BOOM
+  // CHECK: ERROR: HWAddressSanitizer: tag-mismatch
+  // CHECK: #0 0x{{.*}} in main {{.*}}.cpp:[[@LINE-2]]
+  // CHECK: Cause: stack tag-mismatch
+}
diff --git a/compiler-rt/test/hwasan/TestCases/use-after-scope-nobug.cpp b/compiler-rt/test/hwasan/TestCases/use-after-scope-nobug.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/use-after-scope-nobug.cpp
@@ -0,0 +1,20 @@
+// This is the ASAN test of the same name ported to HWAsan.
+
+// RUN: %clangxx_hwasan -mllvm -hwasan-use-after-scope -O1 %s -o %t && %run %t
+
+// REQUIRES: aarch64-target-arch
+
+#include <stdio.h>
+#include <stdlib.h>
+
+int *p[3];
+
+int main() {
+  // Variable goes in and out of scope.
+  for (int i = 0; i < 3; i++) {
+    int x;
+    p[i] = &x; // Only stored, never dereferenced: no report is expected.
+  }
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/compiler-rt/test/hwasan/TestCases/use-after-scope-temp.cpp b/compiler-rt/test/hwasan/TestCases/use-after-scope-temp.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/use-after-scope-temp.cpp
@@ -0,0 +1,25 @@
+// This is the ASAN test of the same name ported to HWAsan.
+
+// RUN: %clangxx_hwasan -mllvm -hwasan-use-after-scope -std=c++11 -O1 %s -o %t && \
+// RUN:     not %run %t 2>&1 | FileCheck %s
+
+// REQUIRES: aarch64-target-arch
+
+struct IntHolder {
+  int val; // Read in main() through the global `saved` pointer.
+};
+
+const IntHolder *saved;
+
+__attribute__((noinline)) void save(const IntHolder &holder) {
+  saved = &holder; // Keeps the argument's address in the global.
+}
+
+int main(int argc, char *argv[]) {
+  save({argc}); // The IntHolder temporary ends with this statement.
+  int x = saved->val; // BOOM
+  // CHECK: ERROR: HWAddressSanitizer: tag-mismatch
+  // CHECK:  #0 0x{{.*}} in main {{.*}}use-after-scope-temp.cpp:[[@LINE-2]]
+  // CHECK: Cause: stack tag-mismatch
+  return x;
+}
diff --git a/compiler-rt/test/hwasan/TestCases/use-after-scope-temp2.cpp b/compiler-rt/test/hwasan/TestCases/use-after-scope-temp2.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/use-after-scope-temp2.cpp
@@ -0,0 +1,24 @@
+// This is the ASAN test of the same name ported to HWAsan.
+
+// RUN: %clangxx_hwasan -mllvm -hwasan-use-after-scope -std=c++11 -O1 %s -o %t && \
+// RUN:     not %run %t 2>&1 | FileCheck %s
+
+// REQUIRES: aarch64-target-arch
+
+struct IntHolder {
+  __attribute__((noinline)) const IntHolder &Self() const {
+    return *this; // Hands back a reference to the object it was called on.
+  }
+  int val = 3;
+};
+
+const IntHolder *saved;
+
+int main(int argc, char *argv[]) {
+  saved = &IntHolder().Self(); // Address of a temporary that ends here.
+  int x = saved->val; // BOOM
+  // CHECK: ERROR: HWAddressSanitizer: tag-mismatch
+  // CHECK:  #0 0x{{.*}} in main {{.*}}use-after-scope-temp2.cpp:[[@LINE-2]]
+  // CHECK: Cause: stack tag-mismatch
+  return x;
+}
diff --git a/compiler-rt/test/hwasan/TestCases/use-after-scope-types.cpp b/compiler-rt/test/hwasan/TestCases/use-after-scope-types.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/use-after-scope-types.cpp
@@ -0,0 +1,81 @@
+// This is the ASAN test of the same name ported to HWAsan.
+
+// RUN: %clangxx_hwasan -mllvm -hwasan-use-after-scope -std=c++11 -O0 %s -o %t
+// RUN: not %run %t 0 2>&1 | FileCheck %s
+// RUN: not %run %t 1 2>&1 | FileCheck %s
+// RUN: not %run %t 2 2>&1 | FileCheck %s
+// RUN: not %run %t 3 2>&1 | FileCheck %s
+// RUN: not %run %t 4 2>&1 | FileCheck %s
+// RUN: not %run %t 5 2>&1 | FileCheck %s
+// Case 6 (std::vector<std::string>) is skipped: it is broken because of
+// limited lifetime tracking. TODO(fmayer): Fix and enable.
+// RUN: not %run %t 7 2>&1 | FileCheck %s
+// RUN: not %run %t 8 2>&1 | FileCheck %s
+// RUN: not %run %t 9 2>&1 | FileCheck %s
+// RUN: not %run %t 10 2>&1 | FileCheck %s
+
+// REQUIRES: aarch64-target-arch
+
+#include <stdlib.h>
+#include <string>
+#include <vector>
+
+template <class T>
+struct Ptr {
+  void Store(T *ptr) { t = ptr; }
+
+  void Access() { *t = {}; } // Writes through the stored pointer.
+
+  T *t;
+};
+
+template <class T, size_t N>
+struct Ptr<T[N]> {
+  using Type = T[N];
+  void Store(Type *ptr) { t = *ptr; } // Array decays to its first element.
+
+  void Access() { *t = {}; } // Writes only the first element.
+
+  T *t;
+};
+
+template <class T>
+__attribute__((noinline)) void test() {
+  Ptr<T> ptr;
+  {
+    T x;
+    ptr.Store(&x); // ptr outlives x, whose scope closes on the next line.
+  }
+
+  ptr.Access(); // Writes to x after its scope has ended.
+  // CHECK: ERROR: HWAddressSanitizer: tag-mismatch
+  // CHECK:  #{{[0-9]+}} 0x{{.*}} in {{(void )?test.*\((void)?\) .*}}use-after-scope-types.cpp
+  // CHECK: Cause: stack tag-mismatch
+}
+
+int main(int argc, char **argv) {
+  using Tests = void (*)();
+  Tests tests[] = {
+      &test<bool>,
+      &test<char>,
+      &test<int>,
+      &test<double>,
+      &test<float>,
+      &test<void *>,
+      &test<std::vector<std::string>>, // Index 6.
+      &test<int[3]>,
+      &test<int[1000]>,
+      &test<char[3]>,
+      &test<char[1000]>,
+  };
+
+  int n = atoi(argv[1]); // Index of the test to run, taken from argv[1].
+  if (n == sizeof(tests) / sizeof(tests[0])) {
+    for (auto te : tests) // n equal to the table size runs every test.
+      te();
+  } else {
+    tests[n]();
+  }
+
+  return 0;
+}
diff --git a/compiler-rt/test/hwasan/TestCases/use-after-scope.cpp b/compiler-rt/test/hwasan/TestCases/use-after-scope.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/use-after-scope.cpp
@@ -0,0 +1,20 @@
+// This is the ASAN test of the same name ported to HWAsan.
+
+// RUN: %clangxx_hwasan -mllvm -hwasan-use-after-scope -O1 %s -o %t && \
+// RUN:     not %run %t 2>&1 | FileCheck %s
+
+// REQUIRES: aarch64-target-arch
+
+volatile int *p = 0;
+
+int main() {
+  {
+    int x = 0;
+    p = &x; // The global p outlives x.
+  }
+  *p = 5; // BOOM: store to x after its scope ended.
+  // CHECK: ERROR: HWAddressSanitizer: tag-mismatch
+  // CHECK:  #0 0x{{.*}} in main {{.*}}use-after-scope.cpp:[[@LINE-2]]
+  // CHECK: Cause: stack tag-mismatch
+  return 0;
+}
diff --git a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h
--- a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h
@@ -14,7 +14,11 @@
 #ifndef LLVM_TRANSFORMS_INSTRUMENTATION_ADDRESSSANITIZERCOMMON_H
 #define LLVM_TRANSFORMS_INSTRUMENTATION_ADDRESSSANITIZERCOMMON_H
 
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 
 namespace llvm {
@@ -44,6 +48,38 @@
   Value *getPtr() { return PtrUse->get(); }
 };
 
+template <typename F>
+void tagLifetimeEnd(DominatorTree *DT, PostDominatorTree *PDT,
+                    IntrinsicInst *Start, IntrinsicInst *End,
+                    SmallVectorImpl<Instruction *> &RetVec, F UntagCallback) {
+  // Calls UntagCallback on End, or else on every reachable exit in RetVec, so
+  // that the object is certainly untagged before the function exits.
+  if (PDT != nullptr && PDT->dominates(End, Start)) {
+    UntagCallback(End); // End post-dominates Start: one untag is enough.
+  } else {
+    SmallVector<Instruction *, 8> ReachableRetVec;
+    unsigned NumCoveredExits = 0; // Reachable exits dominated by End.
+    for (auto &RI : RetVec) {
+      if (!isPotentiallyReachable(Start, RI, nullptr, DT))
+        continue; // Exits that Start cannot reach are ignored.
+      ReachableRetVec.push_back(RI);
+      if (DT != nullptr && DT->dominates(End, RI))
+        ++NumCoveredExits;
+    }
+    // If there's a mix of covered and non-covered exits, just put the untag
+    // on exits, so we avoid the redundancy of untagging twice.
+    if (NumCoveredExits == ReachableRetVec.size()) {
+      UntagCallback(End);
+    } else {
+      for (auto &RI : ReachableRetVec)
+        UntagCallback(RI); // Invoked once per reachable exit instruction.
+      // We may have inserted untag outside of the lifetime interval.
+      // Remove the lifetime end call for this alloca.
+      End->eraseFromParent();
+    }
+  }
+}
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -42,17 +42,18 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <iterator>
@@ -648,32 +649,8 @@
           cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue();
       Size = alignTo(Size, kTagGranuleSize);
       tagAlloca(AI, Start->getNextNode(), Start->getArgOperand(1), Size);
-      // We need to ensure that if we tag some object, we certainly untag it
-      // before the function exits.
-      if (PDT != nullptr && PDT->dominates(End, Start)) {
-        untagAlloca(AI, End, Size);
-      } else {
-        SmallVector<Instruction *, 8> ReachableRetVec;
-        unsigned NumCoveredExits = 0;
-        for (auto &RI : RetVec) {
-          if (!isPotentiallyReachable(Start, RI, nullptr, DT))
-            continue;
-          ReachableRetVec.push_back(RI);
-          if (DT != nullptr && DT->dominates(End, RI))
-            ++NumCoveredExits;
-        }
-        // If there's a mix of covered and non-covered exits, just put the untag
-        // on exits, so we avoid the redundancy of untagging twice.
-        if (NumCoveredExits == ReachableRetVec.size()) {
-          untagAlloca(AI, End, Size);
-        } else {
-          for (auto &RI : ReachableRetVec)
-            untagAlloca(AI, RI, Size);
-          // We may have inserted untag outside of the lifetime interval.
-          // Remove the lifetime end call for this alloca.
-          End->eraseFromParent();
-        }
-      }
+      tagLifetimeEnd(DT, PDT, Start, End, RetVec,
+                     [&](Instruction *Node) { untagAlloca(AI, Node, Size); });
     } else {
       uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8;
       Value *Ptr = IRB.CreatePointerCast(TagPCall, IRB.getInt8PtrTy());
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -17,7 +17,9 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/StackSafetyAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
@@ -26,6 +28,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
@@ -41,6 +44,7 @@
 #include "llvm/IR/Value.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -115,6 +119,11 @@
                      cl::Hidden, cl::desc("Use Stack Safety analysis results"),
                      cl::Optional);
 
+static cl::opt<bool>
+    ClUseAfterScope("hwasan-use-after-scope",
+                    cl::desc("detect use after scope within function"),
+                    cl::Hidden, cl::init(false));
+
 static cl::opt<bool> ClUARRetagToZero(
     "hwasan-uar-retag-to-zero",
     cl::desc("Clear alloca tags before returning from the function to allow "
@@ -225,9 +234,22 @@
 // No one should use the option directly.
 #pragma GCC poison ClUseStackSafety
 }
+
+bool shouldDetectUseAfterScope(const Triple &TargetTriple) {
+  return ClUseAfterScope && shouldInstrumentStack(TargetTriple);
+#pragma GCC poison ClUseAfterScope
+}
+
 /// An instrumentation pass implementing detection of addressability bugs
 /// using tagged pointers.
 class HWAddressSanitizer {
+private:
+  struct AllocaInfo {
+    AllocaInst *AI; // NOTE(review): not assigned in the visible hunks.
+    SmallVector<IntrinsicInst *, 2> LifetimeStart; // lifetime.start calls.
+    SmallVector<IntrinsicInst *, 2> LifetimeEnd;   // lifetime.end calls.
+  };
+
 public:
   explicit HWAddressSanitizer(Module &M, bool CompileKernel = false,
                               bool Recover = false,
@@ -243,7 +265,7 @@
 
   void setSSI(const StackSafetyGlobalInfo *S) { SSI = S; }
 
-  bool sanitizeFunction(Function &F);
+  bool sanitizeFunction(Function &F, DominatorTree *DT, PostDominatorTree *PDT);
   void initializeModule();
   void createHwasanCtorComdat();
 
@@ -266,13 +288,15 @@
       Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting);
 
   bool isInterestingAlloca(const AllocaInst &AI);
-  bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size);
+  void tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size);
   Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag);
   Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong);
   bool instrumentStack(
-      SmallVectorImpl<AllocaInst *> &Allocas,
+      MapVector<AllocaInst *, AllocaInfo> &AllocasToInstrument,
+      SmallVector<Instruction *, 4> &UnrecognizedLifetimes,
       DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap,
-      SmallVectorImpl<Instruction *> &RetVec, Value *StackTag);
+      SmallVectorImpl<Instruction *> &RetVec, Value *StackTag,
+      DominatorTree *DT, PostDominatorTree *PDT);
   Value *readRegister(IRBuilder<> &IRB, StringRef Name);
   bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec);
   Value *getNextTagWithCall(IRBuilder<> &IRB);
@@ -321,6 +345,7 @@
     void init(Triple &TargetTriple, bool InstrumentWithCalls);
     unsigned getObjectAlignment() const { return 1U << Scale; }
   };
+
   ShadowMapping Mapping;
 
   Type *VoidTy = Type::getVoidTy(M.getContext());
@@ -337,6 +362,7 @@
   bool InstrumentLandingPads;
   bool InstrumentWithCalls;
   bool InstrumentStack;
+  bool DetectUseAfterScope;
   bool UsePageAliases;
 
   bool HasMatchAllTag = false;
@@ -390,7 +416,16 @@
       HWASan->setSSI(
           &getAnalysis<StackSafetyGlobalInfoWrapperPass>().getResult());
     }
-    return HWASan->sanitizeFunction(F);
+    DominatorTree *DT = nullptr;
+    PostDominatorTree *PDT = nullptr;
+    if (shouldDetectUseAfterScope(TargetTriple)) {
+      if (auto *P = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+        DT = &P->getDomTree();
+
+      if (auto *P = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>())
+        PDT = &P->getPostDomTree();
+    }
+    return HWASan->sanitizeFunction(F, DT, PDT);
   }
 
   bool doFinalization(Module &M) override {
@@ -443,12 +478,25 @@
 PreservedAnalyses HWAddressSanitizerPass::run(Module &M,
                                               ModuleAnalysisManager &MAM) {
   const StackSafetyGlobalInfo *SSI = nullptr;
+  DominatorTree *DT = nullptr;
+  PostDominatorTree *PDT = nullptr;
+
   if (shouldUseStackSafetyAnalysis(TargetTriple, DisableOptimization))
     SSI = &MAM.getResult<StackSafetyGlobalAnalysis>(M);
+
   HWAddressSanitizer HWASan(M, CompileKernel, Recover, SSI);
   bool Modified = false;
-  for (Function &F : M)
-    Modified |= HWASan.sanitizeFunction(F);
+  for (Function &F : M) {
+    if (shouldDetectUseAfterScope(TargetTriple)) {
+      auto &FAM =
+          MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+      // Use only cached results here. If an analysis is not cached,
+      // sanitizeFunction builds it just for functions with interesting allocas.
+      DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+      PDT = FAM.getCachedResult<PostDominatorTreeAnalysis>(F);
+    }
+    Modified |= HWASan.sanitizeFunction(F, DT, PDT);
+  }
   if (Modified)
     return PreservedAnalyses::none();
   return PreservedAnalyses::all();
@@ -570,6 +618,7 @@
   UsePageAliases = shouldUsePageAliases(TargetTriple);
   InstrumentWithCalls = shouldInstrumentWithCalls(TargetTriple);
   InstrumentStack = shouldInstrumentStack(TargetTriple);
+  DetectUseAfterScope = shouldDetectUseAfterScope(TargetTriple);
   PointerTagShift = IsX86_64 ? 57 : 56;
   TagMaskByte = IsX86_64 ? 0x3F : 0xFF;
 
@@ -972,7 +1021,7 @@
   return SizeInBytes * ArraySize;
 }
 
-bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag,
+void HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag,
                                    size_t Size) {
   size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
   if (!UseShortGranules)
@@ -1003,7 +1052,6 @@
                                    AlignedSize - 1));
     }
   }
-  return true;
 }
 
 unsigned HWAddressSanitizer::retagMask(unsigned AllocaNo) {
@@ -1236,16 +1284,22 @@
 }
 
 bool HWAddressSanitizer::instrumentStack(
-    SmallVectorImpl<AllocaInst *> &Allocas,
+    MapVector<AllocaInst *, AllocaInfo> &AllocasToInstrument,
+    SmallVector<Instruction *, 4> &UnrecognizedLifetimes,
     DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap,
-    SmallVectorImpl<Instruction *> &RetVec, Value *StackTag) {
+    SmallVectorImpl<Instruction *> &RetVec, Value *StackTag, DominatorTree *DT,
+    PostDominatorTree *PDT) {
   // Ideally, we want to calculate tagged stack base pointer, and rewrite all
   // alloca addresses using that. Unfortunately, offsets are not known yet
   // (unless we use ASan-style mega-alloca). Instead we keep the base tag in a
   // temp, shift-OR it into each alloca address and xor with the retag mask.
   // This generates one extra instruction per alloca use.
-  for (unsigned N = 0; N < Allocas.size(); ++N) {
-    auto *AI = Allocas[N];
+  unsigned int N = 0;
+
+  for (auto It = AllocasToInstrument.begin(); It < AllocasToInstrument.end();
+       ++It, ++N) {
+    auto *AI = It->first;
+    AllocaInfo &Info = It->second;
     IRBuilder<> IRB(AI->getNextNode());
 
     // Replace uses of the alloca with tagged address.
@@ -1272,17 +1326,37 @@
     }
 
     size_t Size = getAllocaSizeInBytes(*AI);
-    tagAlloca(IRB, AI, Tag, Size);
-
-    for (auto RI : RetVec) {
-      IRB.SetInsertPoint(RI);
-
-      // Re-tag alloca memory with the special UAR tag.
-      Value *Tag = getUARTag(IRB, StackTag);
-      tagAlloca(IRB, AI, Tag, alignTo(Size, Mapping.getObjectAlignment()));
+    size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
+    bool StandardLifetime = UnrecognizedLifetimes.empty() &&
+                            Info.LifetimeStart.size() == 1 &&
+                            Info.LifetimeEnd.size() == 1;
+    if (DetectUseAfterScope && StandardLifetime) {
+      IntrinsicInst *Start = Info.LifetimeStart[0];
+      IntrinsicInst *End = Info.LifetimeEnd[0];
+      IRB.SetInsertPoint(Start->getNextNode());
+      tagAlloca(IRB, AI, Tag, Size); // Tag right after lifetime.start.
+      tagLifetimeEnd(DT, PDT, Start, End, RetVec, [&](Instruction *Node) {
+        IRB.SetInsertPoint(Node); // Node is End or a reachable function exit.
+        Value *UARTag = getUARTag(IRB, StackTag);
+        tagAlloca(IRB, AI, UARTag, AlignedSize);
+      });
+    } else {
+      tagAlloca(IRB, AI, Tag, Size);
+      for (auto *RI : RetVec) {
+        IRB.SetInsertPoint(RI); // Re-tag with the UAR tag at every exit.
+        Value *UARTag = getUARTag(IRB, StackTag);
+        tagAlloca(IRB, AI, UARTag, AlignedSize);
+      }
+      if (!StandardLifetime) {
+        for (auto &II : Info.LifetimeStart)
+          II->eraseFromParent();
+        for (auto &II : Info.LifetimeEnd)
+          II->eraseFromParent();
+      }
     }
   }
-
+  for (auto &I : UnrecognizedLifetimes)
+    I->eraseFromParent();
   return true;
 }
 
@@ -1304,7 +1378,8 @@
          !(SSI && SSI->isSafe(AI));
 }
 
-bool HWAddressSanitizer::sanitizeFunction(Function &F) {
+bool HWAddressSanitizer::sanitizeFunction(Function &F, DominatorTree *DT,
+                                          PostDominatorTree *PDT) {
   if (&F == HwasanCtorFunction)
     return false;
 
@@ -1315,18 +1390,36 @@
 
   SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
   SmallVector<MemIntrinsic *, 16> IntrinToInstrument;
-  SmallVector<AllocaInst *, 8> AllocasToInstrument;
+  MapVector<AllocaInst *, AllocaInfo> AllocasToInstrument;
   SmallVector<Instruction *, 8> RetVec;
   SmallVector<Instruction *, 8> LandingPadVec;
+  SmallVector<Instruction *, 4> UnrecognizedLifetimes;
   DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> AllocaDbgMap;
   for (auto &BB : F) {
     for (auto &Inst : BB) {
-      if (InstrumentStack)
+      if (InstrumentStack) {
         if (AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
           if (isInterestingAlloca(*AI))
-            AllocasToInstrument.push_back(AI);
+            AllocasToInstrument.insert({AI, {}});
           continue;
         }
+        auto *II = dyn_cast<IntrinsicInst>(&Inst);
+        if (II && (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+                   II->getIntrinsicID() == Intrinsic::lifetime_end)) {
+          AllocaInst *AI = findAllocaForValue(II->getArgOperand(1));
+          if (!AI) {
+            UnrecognizedLifetimes.push_back(&Inst);
+            continue;
+          }
+          if (!isInterestingAlloca(*AI))
+            continue;
+          if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+            AllocasToInstrument[AI].LifetimeStart.push_back(II);
+          else
+            AllocasToInstrument[AI].LifetimeEnd.push_back(II);
+          continue;
+        }
+      }
 
       if (isa<ReturnInst>(Inst) || isa<ResumeInst>(Inst) ||
           isa<CleanupReturnInst>(Inst))
@@ -1379,15 +1472,32 @@
                    Mapping.WithFrameRecord && !AllocasToInstrument.empty());
 
   if (!AllocasToInstrument.empty()) {
+    std::unique_ptr<DominatorTree> DeleteDT;
+    DominatorTree *LocalDT = DT;
+    if (LocalDT == nullptr && DetectUseAfterScope) {
+      DeleteDT = std::make_unique<DominatorTree>(F);
+      LocalDT = DeleteDT.get();
+    }
+
+    std::unique_ptr<PostDominatorTree> DeletePDT;
+    PostDominatorTree *LocalPDT = PDT;
+    if (LocalPDT == nullptr && DetectUseAfterScope) {
+      DeletePDT = std::make_unique<PostDominatorTree>(F);
+      LocalPDT = DeletePDT.get();
+    }
+
     Value *StackTag =
         ClGenerateTagsWithCalls ? nullptr : getStackBaseTag(EntryIRB);
-    instrumentStack(AllocasToInstrument, AllocaDbgMap, RetVec, StackTag);
+    instrumentStack(AllocasToInstrument, UnrecognizedLifetimes, AllocaDbgMap,
+                    RetVec, StackTag, LocalDT, LocalPDT);
   }
   // Pad and align each of the allocas that we instrumented to stop small
   // uninteresting allocas from hiding in instrumented alloca's padding and so
   // that we have enough space to store real tags for short granules.
   DenseMap<AllocaInst *, AllocaInst *> AllocaToPaddedAllocaMap;
-  for (AllocaInst *AI : AllocasToInstrument) {
+  for (auto It = AllocasToInstrument.begin(); It != AllocasToInstrument.end();
+       ++It) {
+    AllocaInst *AI = It->first;
     uint64_t Size = getAllocaSizeInBytes(*AI);
     uint64_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
     AI->setAlignment(