diff --git a/clang/unittests/Tooling/Syntax/TreeTest.cpp b/clang/unittests/Tooling/Syntax/TreeTest.cpp --- a/clang/unittests/Tooling/Syntax/TreeTest.cpp +++ b/clang/unittests/Tooling/Syntax/TreeTest.cpp @@ -121,6 +121,16 @@ return Root; } + void expectTreeDumpEqual(StringRef code, StringRef tree) { + SCOPED_TRACE(code); + + auto *Root = buildTree(code); + std::string Expected = tree.trim().str(); + std::string Actual = + std::string(llvm::StringRef(Root->dump(*Arena)).trim()); + EXPECT_EQ(Expected, Actual) << "the resulting dump is:\n" << Actual; + } + // Adds a file to the test VFS. void addFile(llvm::StringRef Path, llvm::StringRef Contents) { if (!FS->addFile(Path, time_t(), @@ -164,14 +174,13 @@ std::unique_ptr Arena; }; -TEST_F(SyntaxTreeTest, Basic) { - std::pair Cases[] = { - { - R"cpp( +TEST_F(SyntaxTreeTest, Simple) { + expectTreeDumpEqual( + R"cpp( int main() {} void foo() {} )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-int @@ -193,16 +202,18 @@ `-CompoundStatement |-{ `-} -)txt"}, - // if. - { - R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, If) { + expectTreeDumpEqual( + R"cpp( int main() { if (true) {} if (true) {} else if (false) {} } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-int @@ -242,14 +253,17 @@ | |-{ | `-} `-} - )txt"}, - // for. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, For) { + expectTreeDumpEqual( + R"cpp( void test() { for (;;) {} } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -270,10 +284,18 @@ | |-{ | `-} `-} - )txt"}, - // declaration statement. - {"void test() { int a = 10; }", - R"txt( + )txt"); +} + +TEST_F(SyntaxTreeTest, RangeBasedFor) { + expectTreeDumpEqual( + R"cpp( +void test() { + int a[3]; + for (int x : a) ; +} + )cpp", + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -289,13 +311,32 @@ | | |-int | | `-SimpleDeclarator | | |-a - | | |-= - | | `-UnknownExpression - | | `-10 + | | `-ArraySubscript + | | |-[ + | | |-UnknownExpression + | | | `-3 + | | `-] | `-; + |-RangeBasedForStatement + | |-for + | |-( + | |-SimpleDeclaration + | | |-int + | | |-SimpleDeclarator + | | | `-x + | | `-: + | |-UnknownExpression + | | `-a + | |-) + | `-EmptyStatement + | `-; `-} -)txt"}, - {"void test() { ; }", R"txt( + )txt"); +} + +TEST_F(SyntaxTreeTest, DeclarationStatement) { + expectTreeDumpEqual("void test() { int a = 10; }", + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -306,12 +347,22 @@ | `-) `-CompoundStatement |-{ - |-EmptyStatement + |-DeclarationStatement + | |-SimpleDeclaration + | | |-int + | | `-SimpleDeclarator + | | |-a + | | |-= + | | `-UnknownExpression + | | `-10 | `-; `-} -)txt"}, - // switch, case and default. - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, Switch) { + expectTreeDumpEqual( + R"cpp( void test() { switch (true) { case 0: @@ -319,7 +370,7 @@ } } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -350,14 +401,17 @@ | | `-; | `-} `-} -)txt"}, - // while. - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, While) { + expectTreeDumpEqual( + R"cpp( void test() { while (true) { continue; break; } } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -384,77 +438,15 @@ | | `-; | `-} `-} -)txt"}, - // return. 
- {R"cpp( -int test() { return 1; } - )cpp", - R"txt( -*: TranslationUnit -`-SimpleDeclaration - |-int - |-SimpleDeclarator - | |-test - | `-ParametersAndQualifiers - | |-( - | `-) - `-CompoundStatement - |-{ - |-ReturnStatement - | |-return - | |-UnknownExpression - | | `-1 - | `-; - `-} -)txt"}, - // Range-based for. - {R"cpp( -void test() { - int a[3]; - for (int x : a) ; +)txt"); } - )cpp", - R"txt( -*: TranslationUnit -`-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-test - | `-ParametersAndQualifiers - | |-( - | `-) - `-CompoundStatement - |-{ - |-DeclarationStatement - | |-SimpleDeclaration - | | |-int - | | `-SimpleDeclarator - | | |-a - | | `-ArraySubscript - | | |-[ - | | |-UnknownExpression - | | | `-3 - | | `-] - | `-; - |-RangeBasedForStatement - | |-for - | |-( - | |-SimpleDeclaration - | | |-int - | | |-SimpleDeclarator - | | | `-x - | | `-: - | |-UnknownExpression - | | `-a - | |-) - | `-EmptyStatement - | `-; - `-} - )txt"}, - // Unhandled statements should end up as 'unknown statement'. - // This example uses a 'label statement', which does not yet have a syntax - // counterpart. - {"void main() { foo: return 100; }", R"txt( + +TEST_F(SyntaxTreeTest, UnhandledStatement) { + // Unhandled statements should end up as 'unknown statement'. + // This example uses a 'label statement', which does not yet have a syntax + // counterpart. + expectTreeDumpEqual("void main() { foo: return 100; }", + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -474,16 +466,20 @@ | | `-100 | `-; `-} -)txt"}, - // expressions should be wrapped in 'ExpressionStatement' when they appear - // in a statement position. - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, Expressions) { + // expressions should be wrapped in 'ExpressionStatement' when they appear + // in a statement position. + expectTreeDumpEqual( + R"cpp( void test() { test(); if (true) test(); else test(); } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -520,12 +516,15 @@ | | `-) | `-; `-} -)txt"}, - // Multiple declarators group into a single SimpleDeclaration. - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, MultipleDeclaratorsGrouping) { + expectTreeDumpEqual( + R"cpp( int *a, b; )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-int @@ -536,11 +535,12 @@ |-SimpleDeclarator | `-b `-; - )txt"}, - {R"cpp( + )txt"); + expectTreeDumpEqual( + R"cpp( typedef int *a, b; )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-typedef @@ -552,15 +552,18 @@ |-SimpleDeclarator | `-b `-; - )txt"}, - // Multiple declarators inside a statement. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, MultipleDeclaratorsInsideStatement) { + expectTreeDumpEqual( + R"cpp( void foo() { int *a, b; typedef int *ta, tb; } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -593,15 +596,19 @@ | | `-tb | `-; `-} - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, Namespaces) { + expectTreeDumpEqual( + R"cpp( namespace a { namespace b {} } namespace a::b {} namespace {} namespace foo = a; )cpp", - R"txt( + R"txt( *: TranslationUnit |-NamespaceDefinition | |-namespace @@ -630,9 +637,62 @@ |-= |-a `-; -)txt"}, - // Free-standing classes, must live inside a SimpleDeclaration. 
- {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, UsingDirective) { + expectTreeDumpEqual( + R"cpp( +namespace ns {} +using namespace ::ns; + )cpp", + R"txt( +*: TranslationUnit +|-NamespaceDefinition +| |-namespace +| |-ns +| |-{ +| `-} +`-UsingNamespaceDirective + |-using + |-namespace + |-:: + |-ns + `-; + )txt"); +} + +TEST_F(SyntaxTreeTest, UsingDeclaration) { + expectTreeDumpEqual( + R"cpp( +namespace ns { int a; } +using ns::a; + )cpp", + R"txt( +*: TranslationUnit +|-NamespaceDefinition +| |-namespace +| |-ns +| |-{ +| |-SimpleDeclaration +| | |-int +| | |-SimpleDeclarator +| | | `-a +| | `-; +| `-} +`-UsingDeclaration + |-using + |-ns + |-:: + |-a + `-; + )txt"); +} + +TEST_F(SyntaxTreeTest, FreeStandingClasses) { + // Free-standing classes, must live inside a SimpleDeclaration. + expectTreeDumpEqual( + R"cpp( sturct X; struct X {}; @@ -641,7 +701,7 @@ struct {} *a1; )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-sturct @@ -677,13 +737,17 @@ | |-* | `-a1 `-; -)txt"}, - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, Templates) { + expectTreeDumpEqual( + R"cpp( template struct cls {}; template int var = 10; template int fun() {} )cpp", - R"txt( + R"txt( *: TranslationUnit |-TemplateDeclaration | |-template @@ -730,15 +794,19 @@ `-CompoundStatement |-{ `-} -)txt"}, - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, NestedTemplates) { + expectTreeDumpEqual( + R"cpp( template struct X { template U foo(); }; )cpp", - R"txt( + R"txt( *: TranslationUnit `-TemplateDeclaration |-template @@ -768,85 +836,16 @@ | `-; |-} `-; -)txt"}, - {R"cpp( -template struct X {}; -template struct X {}; -template <> struct X {}; +)txt"); +} -template struct X; -extern template struct X; -)cpp", - R"txt( -*: TranslationUnit -|-TemplateDeclaration -| |-template -| |-< -| |-UnknownDeclaration -| | |-class -| | `-T -| |-> -| `-SimpleDeclaration -| |-struct -| |-X -| |-{ -| |-} -| `-; -|-TemplateDeclaration -| |-template -| |-< -| |-UnknownDeclaration -| | |-class -| | `-T -| |-> -| `-SimpleDeclaration -| |-struct -| |-X -| |-< -| |-T -| |-* -| |-> -| |-{ -| |-} -| `-; -|-TemplateDeclaration -| |-template -| |-< -| |-> -| `-SimpleDeclaration -| |-struct -| |-X -| |-< -| |-int -| |-> -| |-{ -| |-} -| `-; -|-ExplicitTemplateInstantiation -| |-template -| `-SimpleDeclaration -| |-struct -| |-X -| |-< -| |-double -| |-> -| `-; -`-ExplicitTemplateInstantiation - |-extern - |-template - `-SimpleDeclaration - |-struct - |-X - |-< - |-float - |-> - `-; -)txt"}, - {R"cpp( +TEST_F(SyntaxTreeTest, Templates2) { + expectTreeDumpEqual( + R"cpp( template struct X { struct Y; }; template struct X::Y {}; )cpp", - R"txt( + R"txt( *: TranslationUnit |-TemplateDeclaration | |-template @@ -883,55 +882,18 @@ |-{ |-} `-; - )txt"}, - {R"cpp( -namespace ns {} -using namespace ::ns; - )cpp", - R"txt( -*: TranslationUnit -|-NamespaceDefinition -| |-namespace -| |-ns -| |-{ -| `-} -`-UsingNamespaceDirective - |-using - |-namespace - |-:: - |-ns - `-; - )txt"}, - {R"cpp( -namespace ns { int a; } -using ns::a; - )cpp", - R"txt( -*: TranslationUnit -|-NamespaceDefinition -| |-namespace -| |-ns -| |-{ -| |-SimpleDeclaration -| | |-int -| | |-SimpleDeclarator -| | | `-a -| | `-; -| `-} -`-UsingDeclaration - |-using - |-ns - |-:: - |-a - `-; - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, TemplatesUsingUsing) { + expectTreeDumpEqual( + R"cpp( template struct X { using T::foo; using typename T::bar; }; )cpp", - R"txt( + R"txt( *: TranslationUnit `-TemplateDeclaration |-template @@ -959,11 +921,92 @@ | `-; |-} `-; - 
)txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ExplicitTemplateInstantations) { + expectTreeDumpEqual( + R"cpp( +template struct X {}; +template struct X {}; +template <> struct X {}; + +template struct X; +extern template struct X; +)cpp", + R"txt( +*: TranslationUnit +|-TemplateDeclaration +| |-template +| |-< +| |-UnknownDeclaration +| | |-class +| | `-T +| |-> +| `-SimpleDeclaration +| |-struct +| |-X +| |-{ +| |-} +| `-; +|-TemplateDeclaration +| |-template +| |-< +| |-UnknownDeclaration +| | |-class +| | `-T +| |-> +| `-SimpleDeclaration +| |-struct +| |-X +| |-< +| |-T +| |-* +| |-> +| |-{ +| |-} +| `-; +|-TemplateDeclaration +| |-template +| |-< +| |-> +| `-SimpleDeclaration +| |-struct +| |-X +| |-< +| |-int +| |-> +| |-{ +| |-} +| `-; +|-ExplicitTemplateInstantiation +| |-template +| `-SimpleDeclaration +| |-struct +| |-X +| |-< +| |-double +| |-> +| `-; +`-ExplicitTemplateInstantiation + |-extern + |-template + `-SimpleDeclaration + |-struct + |-X + |-< + |-float + |-> + `-; +)txt"); +} + +TEST_F(SyntaxTreeTest, UsingType) { + expectTreeDumpEqual( + R"cpp( using type = int; )cpp", - R"txt( + R"txt( *: TranslationUnit `-TypeAliasDeclaration |-using @@ -971,20 +1014,28 @@ |-= |-int `-; - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, EmptyDeclaration) { + expectTreeDumpEqual( + R"cpp( ; )cpp", - R"txt( + R"txt( *: TranslationUnit `-EmptyDeclaration `-; - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, StaticAssert) { + expectTreeDumpEqual( + R"cpp( static_assert(true, "message"); static_assert(true); )cpp", - R"txt( + R"txt( *: TranslationUnit |-StaticAssertDeclaration | |-static_assert @@ -1003,12 +1054,16 @@ | `-true |-) `-; - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ExternC) { + expectTreeDumpEqual( + R"cpp( extern "C" int a; extern "C" { int b; int c; } )cpp", - R"txt( + R"txt( *: TranslationUnit |-LinkageSpecificationDeclaration | |-extern @@ -1033,15 +1088,19 @@ | | `-c | `-; `-} - )txt"}, - // Some nodes are non-modifiable, they are marked with 'I:'. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, NonModifiableNodes) { + // Some nodes are non-modifiable, they are marked with 'I:'. + expectTreeDumpEqual( + R"cpp( #define HALF_IF if (1+ #define HALF_IF_2 1) {} void test() { HALF_IF HALF_IF_2 else {} })cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -1068,9 +1127,10 @@ | |-{ | `-} `-} - )txt"}, - // All nodes can be mutated. - {R"cpp( + )txt"); + // All nodes can be mutated. + expectTreeDumpEqual( + R"cpp( #define OPEN { #define CLOSE } @@ -1084,7 +1144,7 @@ } } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -1110,15 +1170,18 @@ | | `-; | `-} `-} - )txt"}, - // Array subscripts in declarators. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ArraySubscriptsInDeclarators) { + expectTreeDumpEqual( + R"cpp( int a[10]; int b[1][2][3]; int c[] = {1,2,3}; void f(int xs[static 10]); )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-int @@ -1185,9 +1248,12 @@ | | `-] | `-) `-; - )txt"}, - // Parameter lists in declarators. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ParameterListsInDeclarators) { + expectTreeDumpEqual( + R"cpp( int a() const; int b() volatile; int c() &; @@ -1202,7 +1268,7 @@ int&& f ); )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-int @@ -1301,14 +1367,17 @@ | | `-f | `-) `-; - )txt"}, - // Trailing const qualifier. 
- {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, TrailingConst) { + expectTreeDumpEqual( + R"cpp( struct X { int foo() const; } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-struct @@ -1324,12 +1393,15 @@ | | `-const | `-; `-} - )txt"}, - // Trailing return type in parameter lists. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, TrailingReturn) { + expectTreeDumpEqual( + R"cpp( auto foo() -> int; )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-auto @@ -1342,14 +1414,17 @@ | |--> | `-int `-; - )txt"}, - // Exception specification in parameter lists. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ExceptionSpecification) { + expectTreeDumpEqual( + R"cpp( int a() noexcept; int b() noexcept(true); int c() throw(); )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-int @@ -1384,15 +1459,18 @@ | |-( | `-) `-; - )txt"}, - // Declarators in parentheses. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, DeclaratorsInParentheses) { + expectTreeDumpEqual( + R"cpp( int (a); int *(b); int (*c)(int); int *(d)(int); )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-int @@ -1439,15 +1517,18 @@ | | `-int | `-) `-; - )txt"}, - // CV qualifiers. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ConstVolatileQualifiers) { + expectTreeDumpEqual( + R"cpp( const int west = -1; int const east = 1; const int const universal = 0; const int const *const *volatile b; )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-const @@ -1489,12 +1570,15 @@ | |-volatile | `-b `-; - )txt"}, - // Ranges of declarators with trailing return types. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, RangesOfDeclaratorsWithTrailingReturnTypes) { + expectTreeDumpEqual( + R"cpp( auto foo() -> auto(*)(int) -> double*; )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-auto @@ -1522,14 +1606,17 @@ | `-SimpleDeclarator | `-* `-; - )txt"}, - // Member pointers. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, MemberPointers) { + expectTreeDumpEqual( + R"cpp( struct X {}; int X::* a; const int X::* b; )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-struct @@ -1556,12 +1643,15 @@ | | `-* | `-b `-; - )txt"}, - // All-in-one tests. 
- {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ComplexDeclarator) { + expectTreeDumpEqual( + R"cpp( void x(char a, short (*b)(int)); )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -1589,11 +1679,15 @@ | | `-) | `-) `-; - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ComplexDeclarator2) { + expectTreeDumpEqual( + R"cpp( void x(char a, short (*b)(int), long (**c)(long long)); )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -1637,18 +1731,7 @@ | | `-) | `-) `-; - )txt"}, - }; - - for (const auto &T : Cases) { - SCOPED_TRACE(T.first); - - auto *Root = buildTree(T.first); - std::string Expected = llvm::StringRef(T.second).trim().str(); - std::string Actual = - std::string(llvm::StringRef(Root->dump(*Arena)).trim()); - EXPECT_EQ(Expected, Actual) << "the resulting dump is:\n" << Actual; - } + )txt"); } TEST_F(SyntaxTreeTest, Mutations) { diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -1108,6 +1108,85 @@ return CallGraphRoot; } + bool hasSimilarFunction(unsigned id) { return SimilarFunctions.count(id); } + + void addToSimilarFunctions(unsigned id, GlobalValue::GUID GUID) { + if (id == 0) + return; // invalid id. + if (DuplicateFunctions.count(GUID)) + return; + ValueInfo VI = getValueInfo(GUID); + if (!VI) + return; + // assert(VI.getSummaryList().size() == 1); + if (VI.getSummaryList().size() != 1) + return; + GlobalValueSummary *S = VI.getSummaryList()[0].get(); + if (!isa(S)) + return; + assert(isa(S) && "Not a function summary!"); + if (FunctionSimilarityHashes.count(GUID)) { + // Erase the GUID having multiple visits in the ModuleSummaryIndex. + FunctionSimilarityHashes.erase(GUID); + DuplicateFunctions.insert(GUID); + return; + } + + FunctionSimilarityHashes[GUID] = id; + } + + void populateReverseSimilarityHashMap() { + for (auto &p : FunctionSimilarityHashes) + SimilarFunctions[p.second].push_back(p.first); + } + + void removeSingleEntriesFromSimHashMaps() { + // Iterate over the hash to remove entries with no duplicates. 
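+    // A bucket that maps to a single GUID has no merge partner, so drop that
+    // GUID from FunctionSimilarityHashes and the bucket from SimilarFunctions.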
+ for (auto I = SimilarFunctions.begin(), E = SimilarFunctions.end(); + I != E;) { + auto Next = std::next(I); + assert(I->second.size() && "Empty Entry!"); + if (I->second.size() == 1) { + FunctionSimilarityHashes.erase(I->second[0]); + SimilarFunctions.erase(I); + } + I = Next; + } + } + + std::map> &getSimilarFunctions() { + return SimilarFunctions; + } + + const std::map> & + getSimilarFunctions() const { + return SimilarFunctions; + } + + unsigned getSimilarityHash(GlobalValue::GUID ID) const { + return FunctionSimilarityHashes.find(ID)->second; + } + + std::map &getSimilarFunctionsHash() { + return FunctionSimilarityHashes; + } + + const std::map &getSimilarFunctionsHash() const { + return FunctionSimilarityHashes; + } + + void addToHostSimilarFunction(GlobalValue::GUID ID) { + HostSimilarFunction.insert(ID); + } + + std::set &getHostSimilarFunction() { + return HostSimilarFunction; + } + + const std::set &getHostSimilarFunction() const { + return HostSimilarFunction; + } + bool withGlobalValueDeadStripping() const { return WithGlobalValueDeadStripping; } diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -292,6 +292,8 @@ void initializeMemorySSAWrapperPassPass(PassRegistry&); void initializeMemorySanitizerLegacyPassPass(PassRegistry&); void initializeMergeFunctionsLegacyPassPass(PassRegistry&); +void initializeMergeFunctionsPass(PassRegistry&); +void initializeMergeSimilarFunctionsPass(PassRegistry&); void initializeMergeICmpsLegacyPassPass(PassRegistry &); void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&); void initializeMetaRenamerPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -197,6 +197,7 @@ (void) llvm::createPostOrderFunctionAttrsLegacyPass(); (void) llvm::createReversePostOrderFunctionAttrsPass(); (void) llvm::createMergeFunctionsPass(); + (void) llvm::createMergeSimilarFunctionsPass(); (void) llvm::createMergeICmpsLegacyPass(); (void) llvm::createExpandMemCmpPass(); std::string buf; diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h --- a/llvm/include/llvm/Transforms/IPO.h +++ b/llvm/include/llvm/Transforms/IPO.h @@ -215,6 +215,13 @@ /// function(s). ModulePass *createHotColdSplittingPass(); +//===----------------------------------------------------------------------===// +/// createMergeSimilarFunctionsPass - This pass discovers similar functions and +/// merges them. +/// +ModulePass * +createMergeSimilarFunctionsPass(const ModuleSummaryIndex *S = nullptr); + //===----------------------------------------------------------------------===// /// createPartialInliningPass - This pass inlines parts of functions. /// diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h --- a/llvm/include/llvm/Transforms/Utils/Cloning.h +++ b/llvm/include/llvm/Transforms/Utils/Cloning.h @@ -126,6 +126,20 @@ Function *CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo = nullptr); +/// Used to control @fn CloneFunctionInto. +enum class CloneType { + InvalidCloneType, + /// Cloning will result in module level changes. + ModuleLevelChanges, + /// !ModuleLevelChanges, When no module level changes will be made to the + /// cloned function. 
+ NoModuleLevelChanges, + /// Cloning will be used for extracting functions by passes like function + /// merging, it does not require module level changes but debug info needs + /// special treatment like: DISubprogram is not cloned. + ExtractingFunctions, +}; + /// Clone OldFunc into NewFunc, transforming the old arguments into references /// to VMap values. Note that if NewFunc already has basic blocks, the ones /// cloned into it will be added to the end of the function. This function @@ -136,7 +150,7 @@ /// mappings. /// void CloneFunctionInto(Function *NewFunc, const Function *OldFunc, - ValueToValueMapTy &VMap, bool ModuleLevelChanges, + ValueToValueMapTy &VMap, CloneType CT, SmallVectorImpl &Returns, const char *NameSuffix = "", ClonedCodeInfo *CodeInfo = nullptr, diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -76,6 +76,89 @@ cl::value_desc("filename"), cl::desc("File to emit dot graph of new summary into.")); +cl::opt UseGlobalAliases( + "mergesimilarfunc-global-aliases", cl::Hidden, cl::init(false), + cl::desc("Enable writing alias by enabling global aliases")); + +cl::opt MergeMinInsts( + "mergesimilarfunc-min-insts", cl::Hidden, cl::init(4), + cl::desc("Min instructions required to even consider single block fns")); + +// Minimize the name pollution caused by the enum values. +namespace Opt { +cl::opt MergeLevel( + "mergesimilarfunc-level", cl::Hidden, cl::ZeroOrMore, + cl::desc("Level of function merging:"), cl::init(size), + cl::values(clEnumVal(none, "function merging disabled"), + clEnumVal(size, "only try to merge functions that are optimized " + "for size"), + clEnumVal(all, "attempt to merge all similar functions"))); +} + +namespace llvm { + +static const char *MERGED_SUFFIX = "__merged"; + +/// Returns the type id for a type to be hashed. We turn pointer types into +/// integers here because the actual compare logic below considers pointers and +/// integers of the same size as equal. +static Type::TypeID getTypeIDForHash(Type *Ty) { + if (Ty->isPointerTy()) + return Type::IntegerTyID; + return Ty->getTypeID(); +} + +bool isAliasCapable(const Function* G) { + return + UseGlobalAliases && G->hasGlobalUnnamedAddr() + && (G->hasExternalLinkage() || G->hasLocalLinkage() || G->hasWeakLinkage()); +} + +bool isComparisonCandidate(const Function *F) { + if (Opt::MergeLevel == Opt::size) { + // Only consider functions that are to be optimized for size. + // By default, that is all functions at -Os/-Oz and nothing at -O2. + bool Os = F->getAttributes(). + hasAttribute(AttributeList::FunctionIndex, Attribute::OptimizeForSize); + bool Oz = F->getAttributes(). 
+ hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); + if (!Os && !Oz) + return false; + } + + // Ignore declarations and tiny functions - no point in merging those + if (F->isDeclaration()) return false; + if (F->getName().endswith(MERGED_SUFFIX)) return false; + if (F->hasAvailableExternallyLinkage()) return false; + if (F->hasFnAttribute(Attribute::AlwaysInline)) return false; + if (F->size() == 1 && F->begin()->size() < MergeMinInsts) + return isAliasCapable(F); + + return true; +} + +unsigned profileFunction(const Function *F) { + FunctionType *FTy = F->getFunctionType(); + if (!isComparisonCandidate(F)) + return 0; + if (F->hasGC() || FTy->isVarArg() || !F->hasExactDefinition()) + return 0; + FoldingSetNodeID ID; + ID.AddInteger(F->size()); + ID.AddInteger(F->getCallingConv()); + // Add pure attribute, has side-effects attribute. + ID.AddBoolean(F->hasFnAttribute(Attribute::NoUnwind)); + ID.AddBoolean(F->hasFnAttribute(Attribute::NoReturn)); + //ID.AddBoolean(F->hasGC()); + //ID.AddBoolean(F->isInterposable()); + //ID.AddBoolean(FTy->isVarArg()); + ID.AddInteger(getTypeIDForHash(FTy->getReturnType())); + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) + ID.AddInteger(getTypeIDForHash(FTy->getParamType(i))); + return ID.ComputeHash(); +} +} + // Walk through the operands of a given User via worklist iteration and populate // the set of GlobalValue references encountered. Invoked either on an // Instruction or a GlobalVariable (which walks its initializer). @@ -467,8 +550,7 @@ F.hasFnAttribute(Attribute::NoRecurse), F.returnDoesNotAlias(), // FIXME: refactor this to use the same code that inliner is using. // Don't try to import functions with noinline attribute. - F.getAttributes().hasFnAttribute(Attribute::NoInline), - F.hasFnAttribute(Attribute::AlwaysInline)}; + F.getAttributes().hasFnAttribute(Attribute::NoInline)}; auto FuncSummary = std::make_unique( Flags, NumInsts, FunFlags, /*EntryCount=*/0, std::move(Refs), CallGraphEdges.takeVector(), TypeTests.takeVector(), diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -5785,11 +5785,9 @@ } const uint64_t Version = Record[0]; const bool IsOldProfileFormat = Version == 1; - if (Version < 1 || Version > ModuleSummaryIndex::BitcodeSummaryVersion) + if (Version < 1 || Version > 7) return error("Invalid summary version " + Twine(Version) + - ". Version should be in the range [1-" + - Twine(ModuleSummaryIndex::BitcodeSummaryVersion) + - "]."); + ". 
Version should be in the range [1-7]."); Record.clear(); // Keep around the last seen summary to be used when we see an optional @@ -5904,6 +5902,11 @@ std::move(PendingTypeCheckedLoadVCalls), std::move(PendingTypeTestAssumeConstVCalls), std::move(PendingTypeCheckedLoadConstVCalls)); + PendingTypeTests.clear(); + PendingTypeTestAssumeVCalls.clear(); + PendingTypeCheckedLoadVCalls.clear(); + PendingTypeTestAssumeConstVCalls.clear(); + PendingTypeCheckedLoadConstVCalls.clear(); auto VIAndOriginalGUID = getValueInfoFromValueId(ValueID); FS->setModulePath(getThisModule()->first()); FS->setOriginalName(VIAndOriginalGUID.second); @@ -6046,6 +6049,11 @@ std::move(PendingTypeCheckedLoadVCalls), std::move(PendingTypeTestAssumeConstVCalls), std::move(PendingTypeCheckedLoadConstVCalls)); + PendingTypeTests.clear(); + PendingTypeTestAssumeVCalls.clear(); + PendingTypeCheckedLoadVCalls.clear(); + PendingTypeTestAssumeConstVCalls.clear(); + PendingTypeCheckedLoadConstVCalls.clear(); LastSeenSummary = FS.get(); LastSeenGUID = VI.getGUID(); FS->setModulePath(ModuleIdMap[ModuleId]); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -3737,6 +3737,11 @@ NameVals.clear(); } +// Current version for the summary. +// This is bumped whenever we introduce changes in the way some record are +// interpreted, like flags for instance. +static const uint64_t INDEX_VERSION = 7; + /// Emit the per-module summary section alongside the rest of /// the module's bitcode. void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() { diff --git a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp --- a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -318,7 +318,7 @@ "modules."); SmallVector Returns; // Ignore returns cloned. - CloneFunctionInto(NewF, &OrigF, VMap, /*ModuleLevelChanges=*/true, Returns, + CloneFunctionInto(NewF, &OrigF, VMap, CloneType::ModuleLevelChanges, Returns, "", nullptr, nullptr, Materializer); OrigF.deleteBody(); } diff --git a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp --- a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp +++ b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp @@ -316,7 +316,7 @@ } } SmallVector Returns; - CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns); + CloneFunctionInto(NewF, F, VMap, CloneType::NoModuleLevelChanges, Returns); // Build new MDNode. SmallVector KernelMDArgs; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1141,8 +1141,8 @@ .addReg(FrameReg); } else { if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) { - Register ScaledReg = - RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0); + // Reuse ResultReg in intermediate step. 
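+        // Avoids scavenging an extra VGPR for the intermediate value.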
+ Register ScaledReg = ResultReg; BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -216,11 +216,16 @@ // VGPR registers foreach Index = 0-255 in { + // Set a cost value for vgprs other than the argument registers (v0-v31). + // The ratio of index/allocation_granularity is taken as the cost value. + // Considered the allocation granularity as 4 here. + let CostPerUse=!if(!gt(Index, 31), !srl(Index, 2), 0) in { def VGPR#Index : SIReg <"v"#Index, Index>, DwarfRegNum<[!add(Index, 2560)]> { let HWEncoding{8} = 1; } + } } // AccVGPR registers diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -667,7 +667,7 @@ auto savedLinkage = NewF->getLinkage(); NewF->setLinkage(llvm::GlobalValue::ExternalLinkage); - CloneFunctionInto(NewF, &OrigF, VMap, /*ModuleLevelChanges=*/true, Returns); + CloneFunctionInto(NewF, &OrigF, VMap, CloneType::ModuleLevelChanges, Returns); NewF->setLinkage(savedLinkage); NewF->setVisibility(savedVisibility); diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt --- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -27,6 +27,7 @@ LowerTypeTests.cpp MergeFunctions.cpp OpenMPOpt.cpp + MergeSimilarFunctions.cpp PartialInlining.cpp PassManagerBuilder.cpp PruneEH.cpp diff --git a/llvm/lib/Transforms/IPO/IPO.cpp b/llvm/lib/Transforms/IPO/IPO.cpp --- a/llvm/lib/Transforms/IPO/IPO.cpp +++ b/llvm/lib/Transforms/IPO/IPO.cpp @@ -45,6 +45,8 @@ initializeSingleLoopExtractorPass(Registry); initializeLowerTypeTestsPass(Registry); initializeMergeFunctionsLegacyPassPass(Registry); + initializeMergeFunctionsPass(Registry); + initializeMergeSimilarFunctionsPass(Registry); initializePartialInlinerLegacyPassPass(Registry); initializeAttributorLegacyPassPass(Registry); initializeAttributorCGSCCLegacyPassPass(Registry); diff --git a/llvm/lib/Transforms/IPO/MergeSimilarFunctions.cpp b/llvm/lib/Transforms/IPO/MergeSimilarFunctions.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/IPO/MergeSimilarFunctions.cpp @@ -0,0 +1,2197 @@ +//===- MergeSimilarFunctions.cpp - Merge similar functions ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass merges both equivalent and similar functions to reduce code size. +// +// For a more detailed explanation of the approach, see: +// Edler von Koch et al. "Exploiting Function Similarity for Code Size +// Reduction", LCTES 2014. 
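+//
+// Functions are bucketed by a structural hash; within a bucket, exact matches
+// are merged directly, while near matches are merged through a shared body
+// that switches on the differing instructions.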
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mergesimilarfunc" +#include "llvm/Transforms/IPO.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include +using namespace llvm; + +STATISTIC(NumFunctionsMerged, "Number of functions merged"); +STATISTIC(NumThunksWritten, "Number of thunks generated"); +STATISTIC(NumAliasesWritten, "Number of aliases generated"); +STATISTIC(NumDoubleWeak, "Number of new functions created"); +STATISTIC(NumMultiMerged, "Number of multi-merged functions"); + +STATISTIC(NumSimilarFunctionsMerged, "Number of similar functions merged"); + +static cl::opt MergeMinInsts( + "mergesimilarfunc-min-insts", cl::Hidden, cl::init(4), + cl::desc("Min instructions required to even consider single block fns")); + +static cl::opt MergeDifferingMinInsts( + "mergesimilarfunc-diff-min-insts", cl::Hidden, cl::init(15), + cl::desc("Min instructions required to try merging differing functions")); + +static cl::opt MergeMaxDiffering( + "mergesimilarfunc-max-diff", cl::Hidden, cl::init(8), + cl::desc("Maximum number of differing instructions allowed")); + +static cl::opt MergeMinSimilarity( + "mergesimilarfunc-min-similarity", cl::Hidden, cl::init(70), + cl::desc("Minimum percentage of similar instructions required")); + +static cl::opt OptPrintMerges("mergesimilarfunc-print-merges", cl::Hidden, + cl::init(false)); + +static cl::opt UseGlobalAliases( + "mergesimilarfunc-global-aliases", cl::Hidden, cl::init(false), + cl::desc("Enable writing alias by enabling global aliases")); + +void PrintMerges(const char *Desc, Function *Old, Function *New) { + if (OptPrintMerges) { + dbgs() << "=== [" << Desc << "] replacing " << Old->getName() << " with " + << New->getName() << "\n"; + } +} + +// Minimize the name pollution caused by the enum values. +namespace Opt { +enum MergeLevelEnum { none, size, all }; +static cl::opt MergeLevel( + "mergesimilarfunc-level", cl::Hidden, cl::ZeroOrMore, + cl::desc("Level of function merging:"), cl::init(size), + cl::values(clEnumVal(none, "function merging disabled"), + clEnumVal(size, "only try to merge functions that are optimized " + "for size"), + clEnumVal(all, "attempt to merge all similar functions"))); +} + +static const char *MERGED_SUFFIX = "__merged"; + +/// Returns the type id for a type to be hashed. We turn pointer types into +/// integers here because the actual compare logic below considers pointers and +/// integers of the same size as equal. 
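+/// For example, an i8* parameter and an intptr_t parameter contribute the
+/// same value to the hash, so such functions can land in the same bucket.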
+static Type::TypeID getTypeIDForHash(Type *Ty) { + if (Ty->isPointerTy()) + return Type::IntegerTyID; + return Ty->getTypeID(); +} + +/// Creates a hash-code for the function which is the same for any two +/// functions that will compare equal, without looking at the instructions +/// inside the function. +static unsigned profileFunction(const Function *F) { + FunctionType *FTy = F->getFunctionType(); + + FoldingSetNodeID ID; + ID.AddInteger(F->size()); + ID.AddInteger(F->getCallingConv()); + ID.AddBoolean(F->hasGC()); + ID.AddBoolean(F->isInterposable()); + ID.AddBoolean(FTy->isVarArg()); + ID.AddInteger(getTypeIDForHash(FTy->getReturnType())); + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) + ID.AddInteger(getTypeIDForHash(FTy->getParamType(i))); + return ID.ComputeHash(); +} + + +/// Replace Inst1 by a switch statement that executes Inst1 or one of Inst2s +/// depending on the value of SwitchVal. If a value in Inst2s is NULL, it +/// defaults to executing Inst1. Returns set of terminator instructions of newly +/// created switch blocks in Ret. +/// +/// For instance, the transformation may look as follows: +/// ...Head... +/// Inst1 with all of Insts2s without parents +/// ...Tail... +/// into +/// ...Head... +/// Switch +/// / | \ . +/// (default) (1) (2) +/// Inst1 Inst2s[0] Inst2s[1] +/// Ret[0] Ret[1] Ret[2] +/// \ | / +/// ...Tail... +/// +static void SplitBlockAndInsertSwitch( + Value *SwitchVal, Instruction *Inst1, + SmallVectorImpl &Inst2s, + SmallVectorImpl &Ret) { + // Split block + BasicBlock *Head = Inst1->getParent(); + BasicBlock *Tail = Head->splitBasicBlock(Inst1); + + // Create default block + LLVMContext &C = Head->getContext(); + BasicBlock *DefaultBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); + + // Insert switch instruction at end of Head + Instruction *HeadOldTerm = Head->getTerminator(); + SwitchInst *Switch = SwitchInst::Create(SwitchVal, DefaultBlock, + Inst2s.size()); + ReplaceInstWithInst(HeadOldTerm, Switch); + + // Move instructions into the blocks + if (Inst1->isTerminator()) { + Inst1->removeFromParent(); + DefaultBlock->getInstList().push_back(Inst1); + Ret.push_back(cast(Inst1)); + } else { + Instruction *DefaultTerm = BranchInst::Create(Tail, DefaultBlock); + Inst1->moveBefore(DefaultTerm); + Ret.push_back(DefaultTerm); + } + + for (unsigned InstPos = 0, InstNum = Inst2s.size(); InstPos < InstNum; + ++InstPos) { + Instruction *Inst2 = Inst2s[InstPos]; + if (!Inst2) { + Ret.push_back(NULL); + continue; + } + + BasicBlock *CaseBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); + + // Update the debug information of the merged instruction by marking it as + // 'inlined' at this location. If only Inst1 or Inst2 has debug + // information, we try to do something sensible that won't break the + // verifier. 
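+    // If both have locations, keep Inst2's line and column but mark it as
+    // inlined at Inst1's location; if only Inst1 has one, reuse it; if only
+    // Inst2 has one, drop it.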
+ if (Inst1->getDebugLoc()) { + if (Inst2->getDebugLoc()) { + const DebugLoc &I2Loc = Inst2->getDebugLoc(); + Inst2->setDebugLoc( + DebugLoc::get(I2Loc.getLine(), I2Loc.getCol(), I2Loc.getScope(), + /*InlinedAt*/ Inst1->getDebugLoc().getAsMDNode())); + } else { + Inst2->setDebugLoc(Inst1->getDebugLoc()); + } + } else if (Inst2->getDebugLoc()) { + Inst2->setDebugLoc(DebugLoc()); + } + + if (Inst2->isTerminator()) { + assert(Inst1->isTerminator() && + "Inst1 and Inst2 must both be terminators or non-terminators!"); + CaseBlock->getInstList().push_back(Inst2); + Ret.push_back(cast(Inst2)); + } else { + Instruction *CaseTerm = BranchInst::Create(Tail, CaseBlock); + Inst2->insertBefore(CaseTerm); + Ret.push_back(CaseTerm); + } + + Switch->addCase(ConstantInt::get(cast(SwitchVal->getType()), + InstPos+1), + CaseBlock); + } + + // If Inst1 (and Inst2s) are Terminator Inst's, Tail will be empty and can be + // deleted now. We also need to update PHI nodes to add the additional + // incoming blocks from the SwitchInst. + if (Inst1->isTerminator()) { + for (succ_iterator I = succ_begin(DefaultBlock), E = succ_end(DefaultBlock); + I != E; ++I) { + BasicBlock *Successor = *I; + PHINode *Phi; + + for (BasicBlock::iterator II = Successor->begin(); + (Phi = dyn_cast(II)); ++II) + for (unsigned ValId = 0, ValEnd = Phi->getNumIncomingValues(); + ValId != ValEnd; ++ValId) + if (Phi->getIncomingBlock(ValId) == Tail) { + Phi->setIncomingBlock(ValId, DefaultBlock); + SmallVectorImpl::iterator + SwitchI = Ret.begin(), SwitchE = Ret.end(); + for (++SwitchI; SwitchI != SwitchE; ++SwitchI) { + if (!*SwitchI) + continue; + Phi->addIncoming(Phi->getIncomingValue(ValId), + (*SwitchI)->getParent()); + } + } + } + + Tail->eraseFromParent(); + } +} + +/// Insert function NewF into module, placing it immediately after the +/// existing function PredF. If PredF does not exist, insert at the end. +static void insertFunctionAfter(Function *NewF, Function *PredF) { + Module *M = PredF->getParent(); + Module::FunctionListType &FList = M->getFunctionList(); + + for (Module::FunctionListType::iterator I = FList.begin(), E = FList.end(); + I != E; ++I) { + if (PredF == &*I) { + FList.insertAfter(I, NewF); + return; + } + } + + // Couldn't find PredF, insert at end + FList.push_back(NewF); +} + +/// Create a cast instruction if needed to cast V to type DstType. We treat +/// pointer and integer types of the same bitwidth as equivalent, so this can be +/// used to cast them to each other where needed. The function returns the Value +/// itself if no cast is needed, or a new CastInst instance inserted before +/// InsertBefore. The integer type equivalent to pointers must be passed as +/// IntPtrType (get it from DataLayout). This is guaranteed to generate no-op +/// casts, otherwise it will assert. +static Value *createCastIfNeeded(Value *V, Type *DstType, + Value *InstrOrBB, Type *IntPtrType, const DataLayout *DL) { + if (V->getType() == DstType) + return V; + + BasicBlock *InsertAtEnd = dyn_cast(InstrOrBB); + Instruction *InsertBefore = dyn_cast(InstrOrBB); + BasicBlock *InsertBB = InsertAtEnd ? 
InsertAtEnd : InsertBefore->getParent(); + + CastInst *Result; + Type *OrigType = V->getType(); + + if (OrigType->isStructTy()) { + assert(DstType->isStructTy()); + assert(OrigType->getStructNumElements() == DstType->getStructNumElements()); + + IRBuilder<> Builder(InsertBB); + if (InsertBefore) + Builder.SetInsertPoint(InsertBefore); + Value *Result = UndefValue::get(DstType); + for (unsigned int I = 0, E = OrigType->getStructNumElements(); I < E; ++I) { + Value *ExtractedValue + = Builder.CreateExtractValue(V, ArrayRef(I)); + Value *Element = createCastIfNeeded(ExtractedValue, + DstType->getStructElementType(I), + InstrOrBB, IntPtrType, DL); + Result = + Builder.CreateInsertValue(Result, Element, ArrayRef(I)); + } + return Result; + } + assert(!DstType->isStructTy()); + + if (OrigType->isPointerTy() + && (DstType->isIntegerTy() || DstType->isPointerTy())) { + if (InsertBefore) + Result = CastInst::CreatePointerCast(V, DstType, "", InsertBefore); + else + Result = CastInst::CreatePointerCast(V, DstType, "", InsertAtEnd); + } else if (OrigType->isIntegerTy() && DstType->isPointerTy() + && OrigType == IntPtrType) { + // Int -> Ptr + if (InsertBefore) { + Result = CastInst::Create(CastInst::IntToPtr, V, DstType, "", + InsertBefore); + } else { + Result = CastInst::Create(CastInst::IntToPtr, V, DstType, "", + InsertAtEnd); + } + } else { + llvm_unreachable("Can only cast int -> ptr or ptr -> (ptr or int)"); + } + + assert(cast(Result)->isNoopCast(*DL) && + "Cast is not a no-op cast. Potential loss of precision"); + + return Result; +} + +namespace { + +/// ComparableFunction - A struct that pairs together functions with a +/// DataLayout so that we can keep them together as elements in the DenseSet. +class ComparableFunction { +public: + ComparableFunction() : Func(0), IsNew(false) { } + + ComparableFunction(const ComparableFunction &that) + : Func(that.Func), IsNew(that.IsNew) { + } + + ComparableFunction(Function *Func) : Func(Func), IsNew(true) { } + + ~ComparableFunction() { } + + ComparableFunction &operator=(const ComparableFunction &that) { + Func = that.Func; + IsNew = that.IsNew; + return *this; + } + + Function *getFunc() const { return Func; } + bool isNew() const { return IsNew; } + + // Drops AssertingVH reference to the function. Outside of debug mode, this + // does nothing. + void release() { + assert(Func && + "Attempted to release function twice, or release empty/tombstone!"); + Func = NULL; + } + + void markCompared() { + IsNew = false; + } +private: + AssertingVH Func; + bool IsNew; +}; + +} + +namespace { + +/// FunctionComparator - Compares two functions to determine whether or not +/// they will generate machine code with the same behaviour. DataLayout is +/// used if available. The comparator always fails conservatively (erring on the +/// side of claiming that two functions are different). +class FunctionComparator { +public: + FunctionComparator(const DataLayout *DL, Function *F1, Function *F2) + : isDifferent(false), isNotMergeable(false), + BasicBlockCount(0), InstructionCount(0), DifferingInstructionsCount(0), + F1(F1), F2(F2), SimilarityMetric(0), DL(DL), ID(CurID++) {} + + ~FunctionComparator() {} + + /// Test whether the two functions have equivalent behaviour. Returns true if + /// they are equal or can be merged, false if not. 
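+  /// Populates DifferingInstructions and SelfRefInstructions as a side effect.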
+ bool compare(); + + /// Indicate whether the two functions are an exact match after comparison + bool isExactMatch(); + + /// Indicate whether the two functions candidates for merging after comparison + bool isMergeCandidate(); + + /// Get a similarity metric between the two functions. Higher means more + /// similar. + unsigned getSimilarityMetric() { + if (!SimilarityMetric) + SimilarityMetric = (unsigned)(((float)InstructionCount - + DifferingInstructionsCount)/InstructionCount*10000); + return SimilarityMetric; + } + + Function *getF1() { return F1; } + Function *getF2() { return F2; } + ValueToValueMapTy &getF1toF2Map() { return id_map; } + ValueToValueMapTy &getF2toF1Map() { return seen_values; } + const DataLayout *getDataLayout() { return DL; } + + /// Assign or look up previously assigned numbers for the two values, and + /// return whether the numbers are equal. Numbers are assigned in the order + /// visited. If NoSelfRef is set, F1 and F2 are not assigned to each other + /// (treated as 'equal'). + bool enumerate(const Value *V1, const Value *V2, bool NoSelfRef=false); + + /// Compare two Types, treating all pointer types as equal. + bool isEquivalentType(Type *Ty1, Type *Ty2) const; + + /// Instructions that differ between the two functions (F1's -> F2's inst). + MapVector DifferingInstructions; + + /// Instructions that reference F1/F2 itself (recursive calls etc.) + /// These may need special treatment when merging differing functions. + MapVector SelfRefInstructions; + + /// Return the unique ID for the object. + unsigned getID() { return ID; } + + bool isDifferent; + bool isNotMergeable; + + // Comparison statistics + unsigned BasicBlockCount; + unsigned InstructionCount; + unsigned DifferingInstructionsCount; + +private: + /// Test whether two basic blocks have equivalent behaviour. Returns true if + /// they are equal or can be merged, false if not. PHINodes are not compared + /// in this function, but added to the PHIsFound list for delayed processing. + bool compare(const BasicBlock *BB1, const BasicBlock *BB2, + std::list > *PHIsFound); + + /// Compare pairs of PHI nodes. Returns true if all pairs are equal or can + /// be merged, false if not. + bool comparePHIs( + const std::list > &PHIs); + + /// Compare two Instructions for equivalence, similar to + /// Instruction::isSameOperationAs but with modifications to the type + /// comparison. + bool isEquivalentOperation(const Instruction *I1, + const Instruction *I2) const; + + /// Compare two GEPs for equivalent pointer arithmetic. + bool isEquivalentGEP(const GEPOperator *GEP1, const GEPOperator *GEP2); + bool isEquivalentGEP(const GetElementPtrInst *GEP1, + const GetElementPtrInst *GEP2) { + return isEquivalentGEP(cast(GEP1), cast(GEP2)); + } + + // The two functions undergoing comparison. + Function *F1, *F2; + + unsigned SimilarityMetric; + + const DataLayout *DL; + + ValueToValueMapTy id_map; + ValueToValueMapTy seen_values; + + // Maintain a unique ID for each object. + static unsigned CurID; + unsigned ID; +}; + +} + +unsigned FunctionComparator::CurID = 0; + +// Any two pointers in the same address space are equivalent, intptr_t and +// pointers are equivalent. Otherwise, standard type equivalence rules apply. 
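+// For example, i32* and float* in the same address space compare equal, as do
+// i8* and the target's intptr_t type.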
+bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const { + if (Ty1 == Ty2) + return true; + if (Ty1->getTypeID() != Ty2->getTypeID()) { + LLVMContext &Ctx = Ty1->getContext(); + if (isa(Ty1) && Ty2 == DL->getIntPtrType(Ctx)) return true; + if (isa(Ty2) && Ty1 == DL->getIntPtrType(Ctx)) return true; + return false; + } + + switch (Ty1->getTypeID()) { + default: + llvm_unreachable("Unknown type!"); + // Fall through in Release mode. + case Type::IntegerTyID: + case Type::VectorTyID: + // Ty1 == Ty2 would have returned true earlier. + return false; + + case Type::VoidTyID: + case Type::FloatTyID: + case Type::DoubleTyID: + case Type::X86_FP80TyID: + case Type::FP128TyID: + case Type::PPC_FP128TyID: + case Type::LabelTyID: + case Type::MetadataTyID: + return true; + + case Type::PointerTyID: { + PointerType *PTy1 = cast(Ty1); + PointerType *PTy2 = cast(Ty2); + return PTy1->getAddressSpace() == PTy2->getAddressSpace(); + } + + case Type::StructTyID: { + StructType *STy1 = cast(Ty1); + StructType *STy2 = cast(Ty2); + if (STy1->getNumElements() != STy2->getNumElements()) + return false; + + if (STy1->isPacked() != STy2->isPacked()) + return false; + + for (unsigned i = 0, e = STy1->getNumElements(); i != e; ++i) { + if (!isEquivalentType(STy1->getElementType(i), STy2->getElementType(i))) + return false; + } + return true; + } + + case Type::FunctionTyID: { + FunctionType *FTy1 = cast(Ty1); + FunctionType *FTy2 = cast(Ty2); + if (FTy1->getNumParams() != FTy2->getNumParams() || + FTy1->isVarArg() != FTy2->isVarArg()) + return false; + + if (!isEquivalentType(FTy1->getReturnType(), FTy2->getReturnType())) + return false; + + for (unsigned i = 0, e = FTy1->getNumParams(); i != e; ++i) { + if (!isEquivalentType(FTy1->getParamType(i), FTy2->getParamType(i))) + return false; + } + return true; + } + + case Type::ArrayTyID: { + ArrayType *ATy1 = cast(Ty1); + ArrayType *ATy2 = cast(Ty2); + return ATy1->getNumElements() == ATy2->getNumElements() && + isEquivalentType(ATy1->getElementType(), ATy2->getElementType()); + } + } +} + +// Determine whether the two operations are the same except that pointer-to-A +// and pointer-to-B are equivalent. This should be kept in sync with +// Instruction::isSameOperationAs. +bool FunctionComparator::isEquivalentOperation(const Instruction *I1, + const Instruction *I2) const { + // Differences from Instruction::isSameOperationAs: + // * replace type comparison with calls to isEquivalentType. + // * we test for I->hasSameSubclassOptionalData (nuw/nsw/tail) at the top + // * because of the above, we don't test for the tail bit on calls later on + if (I1->getOpcode() != I2->getOpcode() || + I1->getNumOperands() != I2->getNumOperands() || + !isEquivalentType(I1->getType(), I2->getType()) || + !I1->hasSameSubclassOptionalData(I2)) + return false; + + // We have two instructions of identical opcode and #operands. Check to see + // if all operands are the same type + for (unsigned i = 0, e = I1->getNumOperands(); i != e; ++i) + if (!isEquivalentType(I1->getOperand(i)->getType(), + I2->getOperand(i)->getType())) + return false; + + // Check special state that is a part of some instructions. 
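+  // Loads, stores, allocas, compares, calls, etc. must also agree on their
+  // extra flags (volatility, alignment, ordering, predicates, attributes).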
+ if (const LoadInst *LI = dyn_cast(I1)) { + const LoadInst *LI2 = cast(I2); + return LI->isVolatile() == LI2->isVolatile() && + LI->getAlignment() == LI2->getAlignment() && + LI->getOrdering() == LI2->getOrdering() && + LI->getSyncScopeID() == LI2->getSyncScopeID() && + LI->getMetadata(LLVMContext::MD_range) + == LI2->getMetadata(LLVMContext::MD_range); + } + if (const StoreInst *SI = dyn_cast(I1)) + return SI->isVolatile() == cast(I2)->isVolatile() && + SI->getAlignment() == cast(I2)->getAlignment() && + SI->getOrdering() == cast(I2)->getOrdering() && + SI->getSyncScopeID() == cast(I2)->getSyncScopeID(); + if (const AllocaInst *AI = dyn_cast(I1)) { + if (AI->getArraySize() != cast(I2)->getArraySize() || + AI->getAlignment() != cast(I2)->getAlignment()) + return false; + + // If size is known, I2 can be seen as equivalent to I1 if it allocates + // the same or less memory. + if (DL->getTypeAllocSize(AI->getAllocatedType()) + < DL->getTypeAllocSize(cast(I2)->getAllocatedType())) + return false; + + return true; + } + if (const CmpInst *CI = dyn_cast(I1)) + return CI->getPredicate() == cast(I2)->getPredicate(); + if (const CallInst *CI = dyn_cast(I1)) + return CI->getCallingConv() == cast(I2)->getCallingConv() && + CI->getAttributes() == cast(I2)->getAttributes(); + if (const InvokeInst *CI = dyn_cast(I1)) + return CI->getCallingConv() == cast(I2)->getCallingConv() && + CI->getAttributes() == cast(I2)->getAttributes(); + if (const InsertValueInst *IVI = dyn_cast(I1)) + return IVI->getIndices() == cast(I2)->getIndices(); + if (const ExtractValueInst *EVI = dyn_cast(I1)) + return EVI->getIndices() == cast(I2)->getIndices(); + if (const FenceInst *FI = dyn_cast(I1)) + return FI->getOrdering() == cast(I2)->getOrdering() && + FI->getSyncScopeID() == cast(I2)->getSyncScopeID(); + if (const AtomicCmpXchgInst *CXI = dyn_cast(I1)) { + const AtomicCmpXchgInst *CXI2 = cast(I2); + return CXI->isVolatile() == CXI2->isVolatile() && + CXI->isWeak() == CXI2->isWeak() && + CXI->getSuccessOrdering() == CXI2->getSuccessOrdering() && + CXI->getFailureOrdering() == CXI2->getFailureOrdering() && + CXI->getSyncScopeID() == CXI2->getSyncScopeID(); + } + if (const AtomicRMWInst *RMWI = dyn_cast(I1)) + return RMWI->getOperation() == cast(I2)->getOperation() && + RMWI->isVolatile() == cast(I2)->isVolatile() && + RMWI->getOrdering() == cast(I2)->getOrdering() && + RMWI->getSyncScopeID() == cast(I2)->getSyncScopeID(); + + return true; +} + +// Determine whether two GEP operations perform the same underlying arithmetic. +bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1, + const GEPOperator *GEP2) { + // When we have target data, we can reduce the GEP down to the value in bytes + // added to the address. 
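+  // With all-constant indices, two GEPs are equivalent iff they add the same
+  // byte offset, even when the individual index lists differ.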
+ if (GEP1->hasAllConstantIndices() && GEP2->hasAllConstantIndices()) { + SmallVector Indices1(GEP1->idx_begin(), GEP1->idx_end()); + SmallVector Indices2(GEP2->idx_begin(), GEP2->idx_end()); + uint64_t Offset1 = DL->getIndexedOffsetInType(GEP1->getSourceElementType(), + Indices1); + uint64_t Offset2 = DL->getIndexedOffsetInType(GEP2->getSourceElementType(), + Indices2); + return Offset1 == Offset2; + } + + if (GEP1->getPointerOperand()->getType() != + GEP2->getPointerOperand()->getType()) + return false; + + if (GEP1->getNumOperands() != GEP2->getNumOperands()) + return false; + + for (unsigned i = 0, e = GEP1->getNumOperands(); i != e; ++i) { + if (!enumerate(GEP1->getOperand(i), GEP2->getOperand(i))) + return false; + } + + return true; +} + +// Compare two values used by the two functions under pair-wise comparison. If +// this is the first time the values are seen, they're added to the mapping so +// that we will detect mismatches on next use. +bool FunctionComparator::enumerate(const Value *V1, const Value *V2, + bool NoSelfRef/*=false*/) { + // Check for function @f1 referring to itself and function @f2 referring to + // itself. For compatibility with llvm's MergeFunctions, disallow referring to + // each other, or both referring to either of them. + if (!NoSelfRef && V1 == F1 && V2 == F2) + return true; + + // FIXME: This is very conservative for now, but keeping this for thinlto. + if (isa(V1) || isa(V2)) + return false; + if (const Constant *C1 = dyn_cast(V1)) { + if (V1 == V2) return true; + const Constant *C2 = dyn_cast(V2); + if (!C2) return false; + // TODO: constant expressions with GEP or references to F1 or F2. + if (C1->isNullValue() && C2->isNullValue() && + isEquivalentType(C1->getType(), C2->getType())) + return true; + // Try bitcasting C2 to C1's type. If the bitcast is legal and returns C1 + // then they must have equal bit patterns. Aggregate types cannot be + // bitcast. + if (C1->getType()->isAggregateType() || C2->getType()->isAggregateType()) + return false; + return C1->getType()->canLosslesslyBitCastTo(C2->getType()) && + C1 == ConstantExpr::getBitCast(const_cast(C2), C1->getType()); + } + + if (isa(V1) || isa(V2)) + return V1 == V2; + + // Check that V1 maps to V2. If we find a value that V1 maps to then we simply + // check whether it's equal to V2. When there is no mapping then we need to + // ensure that V2 isn't already equivalent to something else. For this + // purpose, we track the V2 values in a set. + + ValueToValueMapTy::iterator I = id_map.find(V1); + if (I != id_map.end()) + return V2 == I->second; + // FIXME: Const casts!!! + if (!seen_values.insert(std::make_pair(V2, const_cast(V1))).second) + return false; + id_map[V1] = const_cast(V2); + return true; +} + +/// Test whether two basic blocks have equivalent behaviour. Returns true if the +/// blocks can be merged, false if they cannot. Differing instructions are +/// recorded in DifferingInstructions. 
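+/// Debug intrinsic calls are skipped, and PHI node pairs are collected in
+/// PHIsFound for comparison after all blocks have been visited.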
+bool FunctionComparator::compare(
+    const BasicBlock *BB1, const BasicBlock *BB2,
+    std::list<std::pair<const PHINode *, const PHINode *>> *PHIsFound) {
+  BasicBlock::const_iterator F1I, F1E, F2I, F2E;
+
+  for (F1I = BB1->begin(), F1E = BB1->end(),
+       F2I = BB2->begin(), F2E = BB2->end();
+       F1I != F1E && F2I != F2E; ++F1I, ++F2I) {
+    // Skip debug information.
+    const CallInst *DbgCall;
+    while (F1I != F1E && (DbgCall = dyn_cast<CallInst>(F1I)) &&
+           DbgCall->getCalledFunction() &&
+           DbgCall->getCalledFunction()->hasName() &&
+           DbgCall->getCalledFunction()->getName().startswith("llvm.dbg."))
+      ++F1I;
+
+    while (F2I != F2E && (DbgCall = dyn_cast<CallInst>(F2I)) &&
+           DbgCall->getCalledFunction() &&
+           DbgCall->getCalledFunction()->hasName() &&
+           DbgCall->getCalledFunction()->getName().startswith("llvm.dbg."))
+      ++F2I;
+
+    if (F1I == F1E || F2I == F2E)
+      break;
+
+    // Ok, we're dealing with real instructions. Check a few cases that will
+    // prevent merging first.
+    const Instruction *F1In = &*F1I;
+    const Instruction *F2In = &*F2I;
+
+    // Cannot merge insts that differ in whether they have uses.
+    if (F1In->use_empty() != F2In->use_empty()) {
+      // TODO: Could implement merging for this case (would need to introduce a
+      // dummy value in the PHI node etc.)
+      return false;
+    }
+
+    // Cannot merge insts whose types are non-equivalent.
+    if (!isEquivalentType(F1In->getType(), F2In->getType())) {
+      return false;
+    }
+
+    // TODO: Currently cannot merge InvokeInsts with differing result types
+    //       that have uses. We cannot push up a bitcast into their block after
+    //       them because they are terminators. Would need to insert an
+    //       additional BB.
+    if (isa<InvokeInst>(F1In) && !F1In->use_empty() &&
+        F1In->getType() != F2In->getType())
+      return false;
+
+    if (!enumerate(F1In, F2In))
+      goto differing_instructions;
+
+    if (const GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(F1In)) {
+      const GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(F2In);
+      if (!GEP2)
+        goto differing_instructions;
+
+      if (!enumerate(GEP1->getPointerOperand(), GEP2->getPointerOperand()))
+        goto differing_instructions;
+
+      if (!isEquivalentGEP(GEP1, GEP2))
+        goto differing_instructions;
+    } else if (const PHINode *Phi1 = dyn_cast<PHINode>(F1In)) {
+      const PHINode *Phi2 = dyn_cast<PHINode>(F2In);
+      // We can't currently merge a PHI and non-PHI instruction.
+      if (!Phi2)
+        return false;
+
+      // We can't currently merge PHI nodes with different numbers of incoming
+      // values.
+      if (F1In->getNumOperands() != F2In->getNumOperands())
+        return false;
+
+      // We need to treat PHI nodes specially. Their incoming values may be in a
+      // different order even if they are equivalent. We can't compare them
+      // until we've seen the incoming blocks and know which values are
+      // equivalent. Therefore postpone PHINode comparison until the end.
+ PHIsFound->push_back(std::make_pair(Phi1, Phi2)); + } else { + if (!isEquivalentOperation(F1In, F2In)) + goto differing_instructions; + + bool IsCall = isa(F1In); + assert(F1In->getNumOperands() == F2In->getNumOperands()); + for (unsigned i = 0, e = F1In->getNumOperands(); i != e; ++i) { + Value *OpF1 = F1In->getOperand(i); + Value *OpF2 = F2In->getOperand(i); + + // Allow self-reference if this is a call instruction and the last + // operand which is the called function + bool AllowSelfRef = IsCall && (i + 1) == e; + + if (!enumerate(OpF1, OpF2, !AllowSelfRef)) + goto differing_instructions; + + if (!isEquivalentType(OpF1->getType(), OpF2->getType())) + goto differing_instructions; + + if ((OpF1 == F1 && OpF2 == F2) || (OpF1 == F2 && OpF2 == F1)) + SelfRefInstructions[F1In] = F2In; + } + } + + continue; + +differing_instructions: + // Cannot merge functions with differing landing pad instructions yet. They + // would need special treatment which involves updating the corresponding + // invoke instructions. + if (isa(F1In)) + return false; + if (isa(F1In)) + return false; + + DifferingInstructions[F1In] = F2In; + } + + // We cannot currently merge basic blocks with different instruction counts + return F1I == F1E && F2I == F2E; +} + +bool FunctionComparator::comparePHIs( + const std::list > &PHIs) { + if (PHIs.empty()) + return true; + + for (std::list >::const_iterator + I = PHIs.begin(), E = PHIs.end(); I != E; ++I) { + const PHINode *Phi1 = I->first, *Phi2 = I->second; + + for (unsigned ValId = 0, ValNum = Phi1->getNumIncomingValues(); + ValId < ValNum; ++ValId) { + Value *Phi1Val = Phi1->getIncomingValue(ValId); + + // Get corresponding Phi2Val + Value *BBinPhi2Val = getF1toF2Map()[Phi1->getIncomingBlock(ValId)]; + + if (!BBinPhi2Val) + return false; // Currently can't handle differing predecessor blocks + + BasicBlock *BBinPhi2 = cast(BBinPhi2Val); + Value *Phi2Val = Phi2->getIncomingValueForBlock(BBinPhi2); + + // Enumerate the values. If the PHI node references the function itself (a + // very rare case), we mark it as different (NoSelfRef). This is only + // necessary for outline merging, not equiv merging. TODO: Make equal + // merging possible with such PHI nodes. + if (!enumerate(Phi1Val, Phi2Val,/*NoSelfRef=*/true)) { + DifferingInstructions[Phi1] = Phi2; + break; + } + } + } + + return true; +} + +// Test whether the two functions have equivalent behaviour. +bool FunctionComparator::compare() { + // We need to recheck everything, but check the things that weren't included + // in the hash first. + if (F1->getAttributes() != F2->getAttributes()) + goto not_mergeable; + + if (F1->hasGC() != F2->hasGC()) + goto not_mergeable; + + if (F1->hasGC() && F1->getGC() != F2->getGC()) + goto not_mergeable; + + if (!F1->getSection().equals(F2->getSection())) + goto not_mergeable; + + if (F1->isVarArg() != F2->isVarArg()) + goto not_mergeable; + + if (F1->isInterposable() != F2->isInterposable()) + goto not_mergeable; + + if (F1->size() != F2->size()) + goto not_mergeable; + + // TODO: if it's internal and only used in direct calls, we could handle + // this case too. + if (F1->getCallingConv() != F2->getCallingConv()) + goto not_mergeable; + + if (!isEquivalentType(F1->getFunctionType(), F2->getFunctionType())) + goto not_mergeable; + + assert(F1->arg_size() == F2->arg_size() && + "Identically typed functions have different numbers of args!"); + + // Visit the arguments so that they get enumerated in the order they're + // passed in. 
+ for (Function::const_arg_iterator f1i = F1->arg_begin(), + f2i = F2->arg_begin(), f1e = F1->arg_end(); f1i != f1e; ++f1i, ++f2i) { + if (!enumerate(&*f1i, &*f2i)) + llvm_unreachable("Arguments repeat!"); + } + + // We do a CFG-ordered walk since the actual ordering of the blocks in the + // linked list is immaterial. Our walk starts at the entry block for both + // functions, then takes each block from each terminator in order. As an + // artifact, this also means that unreachable blocks are ignored. + { + SmallVector F1BBs, F2BBs; + SmallSet VisitedBBs; // in terms of F1. + std::list > PHIsFound; + + F1BBs.push_back(&F1->getEntryBlock()); + F2BBs.push_back(&F2->getEntryBlock()); + + VisitedBBs.insert(F1BBs[0]); + while (!F1BBs.empty()) { + const BasicBlock *F1BB = F1BBs.pop_back_val(); + const BasicBlock *F2BB = F2BBs.pop_back_val(); + + // Check for control flow divergence + if (!enumerate(F1BB, F2BB)) + goto not_mergeable; + + const Instruction *F1TI = F1BB->getTerminator(); + const Instruction *F2TI = F2BB->getTerminator(); + + // TODO: Implement merging of blocks with different numbers of + // instructions. + if (F1TI->getNumSuccessors() != F2TI->getNumSuccessors() || + F1BB->size() != F2BB->size()) + goto not_mergeable; + + // The actual instruction-by-instruction comparison + if (!compare(F1BB, F2BB, &PHIsFound)) + goto not_mergeable; + + // FIXME: Count this in compare(F1BB,F2BB) so it doesn't include debug + // instructions. + InstructionCount += std::max(F1BB->size(), F2BB->size()); + + assert(F1TI->getNumSuccessors() == F2TI->getNumSuccessors()); + for (unsigned i = 0, e = F1TI->getNumSuccessors(); i != e; ++i) { + if (!VisitedBBs.insert(F1TI->getSuccessor(i)).second) + continue; + + F1BBs.push_back(F1TI->getSuccessor(i)); + F2BBs.push_back(F2TI->getSuccessor(i)); + } + } + + BasicBlockCount = VisitedBBs.size(); + + // After we've seen all values and BBs, compare the PHI nodes + if (!comparePHIs(PHIsFound)) + goto not_mergeable; + } + + if (DifferingInstructions.size()) { + // Currently we can't merge vararg functions with differing instructions. + // TODO: Explore whether this is feasible; the difficult bit is the + // additional argument we need to add. + if (F1->isVarArg()) + goto not_mergeable; + + isDifferent = true; + DifferingInstructionsCount += DifferingInstructions.size(); + + LLVM_DEBUG(float Metric = ((float)InstructionCount - DifferingInstructionsCount) + / InstructionCount*100; + dbgs() << "Similar fns: " << F1->getName() << " and " << F2->getName() + << " bbs=" << BasicBlockCount << " insts=" << InstructionCount + << " failed=" << DifferingInstructionsCount << " metric=" + << format("%0.2f", Metric) + << '\n'); + } + + return true; + +not_mergeable: + // Fail: cannot merge the two functions + isNotMergeable = true; + return false; +} + +bool FunctionComparator::isExactMatch() { + return (!isNotMergeable && !isDifferent); +} + +bool FunctionComparator::isMergeCandidate() { + if (isNotMergeable) + return false; + + if (!isDifferent) + return true; + + // Heuristic when to attempt merging + if (InstructionCount > MergeDifferingMinInsts && + DifferingInstructionsCount <= MergeMaxDiffering && + getSimilarityMetric() > MergeMinSimilarity) + return true; + + // Tolerate higher difference with higher similarity. 
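+  // For example, a pair of functions with more than 100 instructions is still
+  // accepted with up to 60 differing instructions, provided the similarity
+  // metric stays above 90%.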
+ if (InstructionCount > 100 && + DifferingInstructionsCount <= 60 && + getSimilarityMetric() > 90 ) + return true; + + return false; +} + +namespace { + +struct FunctionComparatorOrdering { + bool operator () (FunctionComparator *LHS, FunctionComparator *RHS) const { + unsigned MetricLHS = LHS->getSimilarityMetric(), + MetricRHS = RHS->getSimilarityMetric(); + + // If the metric is the same, then default to the unique ID. We need + // to use a unique value instead of the object address to ensure + // deterministic ordering. + if (MetricLHS == MetricRHS) + return LHS->getID() > RHS->getID(); + return MetricLHS > MetricRHS; + } +}; + +class MergeRegistry { +public: + typedef MapVector > FnCompareMap; + typedef std::set + FnComparatorSet; + typedef std::map SimilarFnMap; + + ~MergeRegistry(); + + void clear(); + + /// Defer a function for consideration in the next round. + void defer(Function *F); + + /// Return true if we have deferred functions that can be enqueued. + bool haveDeferred() { return !Deferred.empty(); } + + /// Move all the deferred functions into buckets to consider them for merging. + /// Returns number of functions that have been added. + unsigned enqueue(); + + /// Add a candidate for merging + void insertCandidate(FunctionComparator *Comp); + + /// Remove a Function from the FnSet and queue it up for a second sweep of + /// analysis if Reanalyze is set. If it is a candidate for merging, remove it + /// from consideration. + void remove(Function *F, bool Reanalyze=true); + + /// Return the similarity metric of the most similar function to F that is + /// not listed in the Ignore set. + unsigned getMaxSimilarity(Function *F, const DenseSet &Ignore); + + /// The collection of buckets that contain functions that may be similar to + /// each other (same hash value). + FnCompareMap FunctionsToCompare; + + std::list FunctionsToMerge; + SimilarFnMap SimilarFunctions; + +private: + typedef std::vector FnDeferredQueue; + + /// A work queue of functions that may have been modified and should be + /// analyzed again. + FnDeferredQueue Deferred; +}; + +} // end anonymous namespace + +MergeRegistry::~MergeRegistry() { + this->clear(); +} + +void MergeRegistry::clear() { + Deferred.clear(); + SimilarFunctions.clear(); + for (std::list::iterator + I = FunctionsToMerge.begin(), E = FunctionsToMerge.end(); + I != E; ++I) { + FunctionComparator *FC = *I; + delete FC; + } + FunctionsToMerge.clear(); + FunctionsToCompare.clear(); +} + +static bool isAliasCapable(Function* G) { + return + UseGlobalAliases && G->hasGlobalUnnamedAddr() + && (G->hasExternalLinkage() || G->hasLocalLinkage() || G->hasWeakLinkage()); +} + +static bool isComparisonCandidate(Function *F) { + if (Opt::MergeLevel == Opt::size) { + // Only consider functions that are to be optimized for size. + // By default, that is all functions at -Os/-Oz and nothing at -O2. + bool Os = F->getAttributes(). + hasAttribute(AttributeList::FunctionIndex, Attribute::OptimizeForSize); + bool Oz = F->getAttributes(). 
+ hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); + if (!Os && !Oz) + return false; + } + + // Ignore declarations and tiny functions - no point in merging those + if (F->isDeclaration()) return false; + if (F->getName().endswith(MERGED_SUFFIX)) return false; + if (F->hasAvailableExternallyLinkage()) return false; + if (F->hasFnAttribute(Attribute::AlwaysInline)) return false; + if (F->size() == 1 && F->begin()->size() < MergeMinInsts) + return isAliasCapable(F); + + return true; +} + +void MergeRegistry::defer(Function *F) { + if (isComparisonCandidate(F)) + Deferred.push_back(F); +} + +// Move functions from Deferred into buckets. remove() may have been called +// multiple times for the same function, so eliminate duplicates using the +// set. We reverse them because MergeSimilarFunctions::insert inserts at the +// front of each bucket. +unsigned MergeRegistry::enqueue() { + DenseSet InsertedFuncs; + + for (std::vector::reverse_iterator + DefI = Deferred.rbegin(), DefE = Deferred.rend(); + DefI != DefE; ++DefI) { + Value *V = *DefI; + Function *F = dyn_cast_or_null(V); + if (!F) continue; + if (InsertedFuncs.find(F) != InsertedFuncs.end()) continue; + if (!isComparisonCandidate(F)) continue; + + unsigned Hash = profileFunction(F); + FunctionsToCompare[Hash].push_front(F); + + InsertedFuncs.insert(F); + } + + Deferred.clear(); + + return InsertedFuncs.size(); +} + +void MergeRegistry::insertCandidate(FunctionComparator *Comp) { + FunctionsToMerge.push_back(Comp); + SimilarFunctions[Comp->getF1()].insert(Comp); +} + +static void removeFromBucket(Function *F, + std::list &Bucket) { + for (std::list::iterator + I = Bucket.begin(), E = Bucket.end(); I != E; ++I) { + if (I->getFunc() == F) { + Bucket.erase(I); + return; + } + } +} + +void MergeRegistry::remove(Function *F, bool Reanalyze/*=true*/) { + // There is no need to remove a function that is not already + // in a bucket. + if (!isComparisonCandidate(F)) + return; + + unsigned Hash = profileFunction(F); + std::list &Bucket = FunctionsToCompare[Hash]; + + removeFromBucket(F, Bucket); + + if (Reanalyze) + Deferred.push_back(F); + + // Check whether we have any existing FunctionComparator objects for this fn. + // If yes, discard them because F has changed. Retry merging for those + // functions by adding them to Deferred. 
+ std::list::iterator I = FunctionsToMerge.begin(); + while (I != FunctionsToMerge.end()) { + FunctionComparator *Comp = *I; + if (Comp->getF1() == F) { + Function *OtherF = Comp->getF2(); + Deferred.push_back(OtherF); + removeFromBucket(OtherF, Bucket); + if (!SimilarFunctions[F].erase(Comp)) + llvm_unreachable("Inconsistent SimilarFunctions set"); + I = FunctionsToMerge.erase(I); + delete Comp; + } else if (Comp->getF2() == F) { + Function *OtherF = Comp->getF1(); + Deferred.push_back(OtherF); + removeFromBucket(OtherF, Bucket); + if (!SimilarFunctions[OtherF].erase(Comp)) + llvm_unreachable("Inconsistent SimilarFunctions set"); + I = FunctionsToMerge.erase(I); + delete Comp; + } else { + ++I; + } + } +} + +unsigned MergeRegistry::getMaxSimilarity(Function *F, + const DenseSet &Ignore) { + FnComparatorSet &Similar = SimilarFunctions[F]; + + for (FnComparatorSet::iterator I = Similar.begin(), E = Similar.end(); + I != E; ++I) { + FunctionComparator *Comp = *I; + if (Ignore.count(Comp->getF2())) + continue; + + return Comp->getSimilarityMetric(); + } + + return 0; +} + +namespace { + +class MergeSimilarFunctions : public ModulePass { +public: + static char ID; + MergeSimilarFunctions(const ModuleSummaryIndex *Summary = nullptr) + : ModulePass(ID) { + initializeMergeSimilarFunctionsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M); + +private: + /// Find the functions that use this Value and remove them from FnSet and + /// queue the functions. + void removeUsers(Value *V); + + /// Replace all direct calls of Old with calls of New. Will bitcast New if + /// necessary to make types match. + void replaceDirectCallers(Function *Old, Function *New); + + /// Process functions in the specified bucket, by either doing equiv merging + /// marking them for diff merging. Returns false if the bucket needs to be + /// re-scanned after an equiv merge. Sets Changed if the module was changed by + /// equiv merge. + bool mergeBucket(std::list &Fns, bool &Changed); + + /// Exhaustively compare all functions in each bucket and do equiv merging + /// where possible. Functions that have already been compared will not be + /// compared again. Returns true if the module was modified. + bool doExhaustiveCompareMerge(); + + /// Merge all the functions marked for diff merging. Returns true if the + /// module was modified. + bool doDiffMerge(); + + /// Merge two equivalent functions. Upon completion, G may be deleted, or may + /// be converted into a thunk. In either case, it should never be visited + /// again. + void mergeTwoFunctions(Function *F, Function *G); + + /// Merge a set of functions with differences. + void outlineAndMergeFunctions(SmallVectorImpl &Fns); + + /// Replace G with a thunk or an alias to F. Deletes G. + void writeThunkOrAlias(Function *F, Function *G); + + /// Replace G with a simple tail call to bitcast(F). Also replace direct uses + /// of G with bitcast(F). Deletes G. + void writeThunk(Function *F, Function *G); + + /// Replace G with a tail call to F with an additional argument. + /// + void writeThunkWithChoice(Function *NewF, Function *OldF, int Choice); + + /// Replace G with an alias to F. Deletes G. + void writeAlias(Function *F, Function *G); + + /// DataLayout for more accurate GEP comparisons. May be NULL. + const DataLayout *DL; + + /// Merge registry. Stores all the information about functions being + /// considered for merging as well as current candidates for merging. 
+ MergeRegistry Registry; + +}; + +} // end anonymous namespace + +char MergeSimilarFunctions::ID = 0; +INITIALIZE_PASS(MergeSimilarFunctions, "mergesimilarfunc", + "Merge Similar Functions", false, false) + +ModulePass * +llvm::createMergeSimilarFunctionsPass(const ModuleSummaryIndex *S) { + return new MergeSimilarFunctions(S); +} + +bool MergeSimilarFunctions::runOnModule(Module &M) { + if (Opt::MergeLevel == Opt::none) + return false; + + bool Changed = false; + + DL = &M.getDataLayout(); + + for (auto &I : M) + Registry.defer(&I); + + do { + unsigned InsertCount = Registry.enqueue(); + + LLVM_DEBUG(dbgs() << "size of module: " << M.size() << '\n'); + LLVM_DEBUG(dbgs() << "size of worklist: " << InsertCount << '\n'); + (void)InsertCount; + + Changed |= doExhaustiveCompareMerge(); + } while (Registry.haveDeferred()); + + Changed |= doDiffMerge(); + + Registry.clear(); + return Changed; +} + +// Replace direct callers of Old with New. +void MergeSimilarFunctions::replaceDirectCallers(Function *Old, Function *New) { + Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType()); + for (Value::use_iterator UI = Old->use_begin(), UE = Old->use_end(); + UI != UE;) { + Use *U = &*UI; + ++UI; + CallSite CS(U->getUser()); + if (CS && CS.isCallee(U)) { + Registry.remove(CS.getInstruction()->getParent()->getParent()); + U->set(BitcastNew); + } + } +} + +// Replace G with an alias to F if possible, or else a thunk to F. Deletes G. +void MergeSimilarFunctions::writeThunkOrAlias(Function *F, Function *G) { + if (isAliasCapable(G)) { + writeAlias(F, G); + return; + } + + writeThunk(F, G); +} + +static void writeThunkBody(Function *Thunk, Function *F, + ConstantInt *Choice, const DataLayout *DL) { + BasicBlock *BB = &Thunk->getEntryBlock(); + IRBuilder<> Builder(BB); + + SmallVector Args; + unsigned i = 0; + FunctionType *FFTy = F->getFunctionType(); + Type *IntPtrTy = DL->getIntPtrType(FFTy->getContext()); + for (auto &AI : Thunk->args()) { + Value *Cast = createCastIfNeeded(&AI, FFTy->getParamType(i), BB, IntPtrTy, DL); + Args.push_back(Cast); + ++i; + } + if (Choice) + Args.push_back(Choice); + + CallInst *CI = Builder.CreateCall(F, Args); + CI->setTailCall(); + CI->setCallingConv(F->getCallingConv()); + CI->setAttributes(F->getAttributes()); + CI->setIsNoInline(); + if (Thunk->getReturnType()->isVoidTy()) { + Builder.CreateRetVoid(); + } else { + Type *RetTy = Thunk->getReturnType(); + if (CI->getType()->isIntegerTy() && RetTy->isPointerTy()) + Builder.CreateRet(Builder.CreateIntToPtr(CI, RetTy)); + else if (CI->getType()->isPointerTy() && RetTy->isIntegerTy()) + Builder.CreateRet(Builder.CreatePtrToInt(CI, RetTy)); + else { + Value *Cast = createCastIfNeeded(CI, RetTy, BB, IntPtrTy, DL); + Builder.CreateRet(Cast); + } + } +} + +// Replace G with a simple tail call to bitcast(F). Also replace direct uses +// of G with bitcast(F). Deletes G. +void MergeSimilarFunctions::writeThunk(Function *F, Function *G) { + if (!G->isInterposable()) { + // Redirect direct callers of G to F. + replaceDirectCallers(G, F); + } + + // If G was internal then we may have replaced all uses of G with F. If so, + // stop here and delete G. There's no need for a thunk. + if (G->hasLocalLinkage() && G->use_empty()) { + LLVM_DEBUG(dbgs() << "All uses of " << G->getName() << " replaced by " + << F->getName() << ". 
Removing it.\n"); + G->eraseFromParent(); + return; + } + + Function *NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "", + G->getParent()); + BasicBlock::Create(F->getContext(), "", NewG); + + writeThunkBody(NewG, F, nullptr, DL); + + NewG->copyAttributesFrom(G); + NewG->takeName(G); + removeUsers(G); + G->replaceAllUsesWith(NewG); + G->eraseFromParent(); + + LLVM_DEBUG(dbgs() << "writeThunk: " << NewG->getName() << " calling " + << F->getName() << '\n'); + ++NumThunksWritten; +} + +void MergeSimilarFunctions::writeThunkWithChoice(Function *NewF, Function *OldF, + int Choice) { + // Deleting the body of a function sets its linkage to External. Save the old + // one here and restore it at the end. + GlobalValue::LinkageTypes OldFLinkage = OldF->getLinkage(); + + // Delete OldF's body + OldF->deleteBody(); + BasicBlock::Create(OldF->getContext(), "", OldF); + + // Insert single BB with tail call + IntegerType *Int32Ty = Type::getInt32Ty(OldF->getContext()); + ConstantInt *ChoiceConst = ConstantInt::get(Int32Ty, Choice); + writeThunkBody(OldF, NewF, ChoiceConst, DL); + OldF->setLinkage(OldFLinkage); +} + +// Replace G with an alias to F and delete G. +void MergeSimilarFunctions::writeAlias(Function *F, Function *G) { + + // Replace all current uses of G in constants with F. This handles virtual + // table and other references. Do this first so that we don't modify thge + // global alias we're about to create. + SmallVector Uses; + for (auto I = G->use_begin(), E = G->use_end(); I != E; ++I) { + Use *U = I.operator->(); + Constant *CV = dyn_cast(U->getUser()); + if (!CV) continue; + Uses.push_back(U); + } + for (auto I = Uses.begin(), E= Uses.end(); I != E; ++I) { + Use *U = *I; + U->set(F); + } + + PointerType *PTy = G->getType(); + auto *GA = GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), + G->getLinkage(), "", F); + F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); + GA->takeName(G); + GA->setVisibility(G->getVisibility()); + removeUsers(G); + G->replaceAllUsesWith(GA); + G->eraseFromParent(); + + LLVM_DEBUG(dbgs() << "writeAlias: " << GA->getName() << '\n'); + ++NumAliasesWritten; +} + +// Merge two equivalent functions. Upon completion, Function G is deleted. +void MergeSimilarFunctions::mergeTwoFunctions(Function *F, Function *G) { + if (F->isInterposable()) { + assert(G->isInterposable()); + + if (UseGlobalAliases) { + // Make them both thunks to the same internal function. + Function *H = Function::Create(F->getFunctionType(), F->getLinkage(), "", + F->getParent()); + H->copyAttributesFrom(F); + H->takeName(F); + removeUsers(F); + F->replaceAllUsesWith(H); + + unsigned MaxAlignment = std::max(G->getAlignment(), H->getAlignment()); + + writeAlias(F, G); + writeAlias(F, H); + + F->setAlignment(MaxAlignment); + F->setLinkage(GlobalValue::PrivateLinkage); + } else { + // We can't merge them. Instead, pick one and update all direct callers + // to call it and hope that we improve the instruction cache hit rate. 
+ replaceDirectCallers(G, F); + } + + ++NumDoubleWeak; + } else { + writeThunkOrAlias(F, G); + } + + ++NumFunctionsMerged; +} + +static Value *getLastArg(Function *F) { + auto it = F->arg_begin(); + std::advance(it, F->arg_size()-1); + return it; +} + +static void insertCondAndRemapInstructions( + Instruction *F1InstInNewF, const std::vector &F2Insts, + Function *NewF, ValueToValueMapTy &F1toNewF, + const SmallVectorImpl &Comps, + Type *IntPtrTy, const DataLayout *DL) { + assert(F2Insts.size() == Comps.size() && + "Mis-match between F2Insts & Comps!"); + + SmallVector F2InstsInNewF; + for (unsigned FnI = 0, FnE = F2Insts.size(); FnI != FnE; ++FnI) { + const Instruction *F2Inst = F2Insts[FnI]; + if (!F2Inst) { + F2InstsInNewF.push_back(NULL); + continue; + } + + Instruction *F2InstInNewF = F2Inst->clone(); + + // Remap F2Inst: F2 values -> F1 values + RemapInstruction(F2InstInNewF, Comps[FnI]->getF2toF1Map(), + RF_NoModuleLevelChanges); + // Remap F2Inst: F1 values -> NewF values + RemapInstruction(F2InstInNewF, F1toNewF, RF_NoModuleLevelChanges); + + F2InstsInNewF.push_back(F2InstInNewF); + } + + SmallVector Terminators; + SplitBlockAndInsertSwitch(getLastArg(NewF), F1InstInNewF, + F2InstsInNewF, Terminators); + + assert(Terminators.size() == F2InstsInNewF.size() + 1 && + "Not enough terminators returned"); + + // F2InstsInNewF are now hooked up to the correct values in NewF. However, + // some of their operands may be pointers/integers so they could potentially + // have the wrong type in NewF (since we treat all pointers and integers of + // same size as equal). Insert casts if needed. + for (unsigned FnI = 0, FnE = F2InstsInNewF.size(); FnI != FnE; ++FnI) { + Instruction *F2InstInNewF = F2InstsInNewF[FnI]; + if (!F2InstInNewF) + continue; + const Instruction *F2Inst = F2Insts[FnI]; + + for (unsigned OpId=0; OpId < F2InstInNewF->getNumOperands(); ++OpId) { + Value *F2NewFOperand = F2InstInNewF->getOperand(OpId); + Value *F2OrigOperand = F2Inst->getOperand(OpId); + + if (F2NewFOperand->getType() != F2OrigOperand->getType()) { + Value *Cast = createCastIfNeeded(F2NewFOperand, + F2OrigOperand->getType(), + F2InstInNewF, + IntPtrTy, DL); + F2InstInNewF->setOperand(OpId, Cast); + } + } + } + + if (ReturnInst *F1Ret = dyn_cast(F1InstInNewF)) { + // If we're handling differing return instructions, we need to ensure that + // they all return the same type. Since we treat pointer types as equal, we + // may need to insert a bitcast. + for (Instruction *F2Inst : F2InstsInNewF) { + if (!F2Inst) + continue; + + // F2Inst must also be a return instruction due to control flow + // isomorphism. + ReturnInst *F2Ret = cast(F2Inst); + + if (F2Ret->getReturnValue()->getType() != + F1Ret->getReturnValue()->getType()) + F2Ret->setOperand(0, + createCastIfNeeded(F2Ret->getReturnValue(), + F1Ret->getReturnValue()->getType(), + F2Ret, IntPtrTy, DL)); + } + } else if (!F1InstInNewF->use_empty()) { + // If the instructions have uses, we need to insert a PHI node. 
+ // + // We treat all pointer types as equal, so we may need to insert + // a bitcast to ensure that all incoming values of the PHI node have the + // same type + Type *F1IType = F1InstInNewF->getType(); + BasicBlock *TailBB = Terminators[0]->getSuccessor(0); + PHINode *Phi = + PHINode::Create(F1IType, F2InstsInNewF.size(), "", &TailBB->front()); + F1InstInNewF->replaceAllUsesWith(Phi); + + Phi->addIncoming(F1InstInNewF, F1InstInNewF->getParent()); + for (unsigned FnI = 0, FnE = F2InstsInNewF.size(); FnI != FnE; ++FnI) { + Instruction *F2InstInNewF = F2InstsInNewF[FnI]; + if (!F2InstInNewF) + continue; + + if (F2InstInNewF->getType() != F1IType) { + assert(!F2InstInNewF->isTerminator() && + "Cannot cast result of terminator instruction"); + + F2InstInNewF = cast( + createCastIfNeeded(F2InstInNewF, + F1IType, + Terminators[FnI+1], + IntPtrTy, DL)); + } + + Phi->addIncoming(F2InstInNewF, F2InstInNewF->getParent()); + } + } +} + +static void mergePHINode(const SmallVectorImpl &Fns, + Function *NewF, + ValueToValueMapTy &VMap, /* F1->FNew */ + const PHINode *F1PhiInst, + std::vector F2PhiInsts) { + PHINode *F1PhiInNewF = dyn_cast(VMap[F1PhiInst]); + assert(F1PhiInNewF && "Cannot find F1Inst in NewF!"); + + // The incoming blocks in any of the F2PhiInsts may be in a different order. + // If this is the case, we have to reorder them. F2PhiInsts is intentionally a + // copy, so we can modify it + SmallVector GCInsts; // so we can delete them later. + for (unsigned FnI = 0, FnE = F2PhiInsts.size(); FnI != FnE; ++FnI) { + const PHINode *F2PhiInst = dyn_cast_or_null(F2PhiInsts[FnI]); + if (!F2PhiInst) + continue; + + for (unsigned I = 0, E = F1PhiInNewF->getNumIncomingValues(); I < E; ++I) { + if (!Fns[FnI]->enumerate(F1PhiInst->getIncomingBlock(I), + F2PhiInst->getIncomingBlock(I))) { + // Non-equivalent blocks in the same position - need to reorder PhiInst + PHINode *ReorderedF2PhiInst = PHINode::Create(F2PhiInst->getType(), E); + + for (unsigned II = 0; II < E; ++II) { + Value *BBVal = + Fns[FnI]->getF1toF2Map()[F1PhiInst->getIncomingBlock(II)]; + BasicBlock *BB = cast(BBVal); + Value *Val = F2PhiInst->getIncomingValueForBlock(BB); + ReorderedF2PhiInst->addIncoming(Val, BB); + } + + F2PhiInsts[FnI] = ReorderedF2PhiInst; + GCInsts.push_back(ReorderedF2PhiInst); + break; + } + } + } + + // Now merge the PHI nodes. + for (unsigned i = 0; i < F1PhiInNewF->getNumIncomingValues(); ++i) { + Value *F1InValNewF = F1PhiInNewF->getIncomingValue(i), + *F1InVal = F1PhiInst->getIncomingValue(i); + BasicBlock *F1NewFInBlock = F1PhiInNewF->getIncomingBlock(i); + // If this is a repeat occurrence of the same incoming BasicBlock, we + // will have already dealt with it in a previous iteration. + if (F1PhiInNewF->getBasicBlockIndex(F1PhiInNewF->getIncomingBlock(i)) != + (int)i) + continue; + + Value *NewIncoming = F1InValNewF; + + Instruction *InsertPt = F1NewFInBlock->getTerminator(); + + // Build up a chain of cmps and selects that pick the correct incoming + // value. 
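+      // Illustrative shape of the resulting chain for two merged-in functions
+      // (%__merge_arg is the selector argument appended to NewF):
+      //   %is.f2 = icmp eq i32 %__merge_arg, 1
+      //   %sel.1 = select i1 %is.f2, <F2 incoming value>, <F1 incoming value>
+      //   %is.f3 = icmp eq i32 %__merge_arg, 2
+      //   %sel.2 = select i1 %is.f3, <F3 incoming value>, %sel.1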
+ for (unsigned FnI = 0, FnE = F2PhiInsts.size(); FnI != FnE; ++FnI) { + if (!F2PhiInsts[FnI]) + continue; + const PHINode *F2PhiInst = cast(F2PhiInsts[FnI]); + Value *F2InVal = F2PhiInst->getIncomingValue(i); + + // If we know these are equivalent, there's no further work to do + if (Fns[FnI]->enumerate(F1InVal, F2InVal,/*NoSelfRef=*/true) && + Fns[FnI]->enumerate(F1PhiInst->getIncomingBlock(i), + F2PhiInst->getIncomingBlock(i))) + continue; + + assert(Fns[FnI]->enumerate(F1PhiInst->getIncomingBlock(i), + F2PhiInst->getIncomingBlock(i)) && + "Non-equivalent incoming BBs in PHI."); + + // We have different incoming values from the same block + // Translate F2's incoming value to NewF if needed + Value *F2InValNewF = F2InVal; + if (!isa(F2InVal)) { + Value *V = Fns[FnI]->getF2toF1Map()[F2InVal]; // F2->F1 + F2InValNewF = VMap[V]; // F1->NewF + assert(V && F2InValNewF && "Cannot map F2InVal to NewF"); + } + + // Cast F2InValNewF to the correct type if needed + LLVMContext &Ctx = F1InValNewF->getType()->getContext(); + const DataLayout *FTD = Fns[FnI]->getDataLayout(); + Type *IntPtrTy = FTD ? FTD->getIntPtrType(Ctx) : NULL; + F2InValNewF = createCastIfNeeded(F2InValNewF, F1InValNewF->getType(), + InsertPt, IntPtrTy, FTD); + + // Create compare & select + Value *ChoiceArg = getLastArg(NewF); + Value *SelectBit = new ICmpInst(InsertPt, + ICmpInst::ICMP_EQ, + getLastArg(NewF), + ConstantInt::get(ChoiceArg->getType(), + FnI+1)); + + // SelectBit true -> F2InValNewF, SelectBit false -> existing NewIncoming. + NewIncoming = SelectInst::Create(SelectBit, F2InValNewF, NewIncoming, "", + InsertPt); + } + + if (NewIncoming == F1InValNewF) + continue; // no change for this incoming value + + // Replace all occurrences of this incoming value/block by the new + // ones (phi nodes can have repeated arguments) + for (unsigned j=i; j < F1PhiInNewF->getNumIncomingValues(); ++j) { + if (F1PhiInNewF->getIncomingBlock(j) == F1NewFInBlock) { + F1PhiInNewF->setIncomingValue(j, NewIncoming); + } + } + } + + // Garbage-collect the reordered PHI nodes we temporarily created. + for (SmallVectorImpl::iterator I = GCInsts.begin(), + E = GCInsts.end(); I != E; ++I) + delete *I; +} + +static bool rewriteRecursiveCall( + const CallInst *F1I, const CallInst *F2I, CallInst *NewFI, + const Function *F1, const Function *F2, Function *NewF) { + if (!(F1I->getCalledFunction() == F1 && F2I->getCalledFunction() == F2) && + !(F1I->getCalledFunction() == F2 && F2I->getCalledFunction() == F1)) + return false; // not a recursive/mutually recursive call + + // Replace NewFI by recursive call to NewF with additional choice argument + SmallVector Args; + for (unsigned AI = 0, End = NewFI->getNumArgOperands(); AI < End; ++AI) { + Value *Arg = NewFI->getArgOperand(AI); + + // Check if F1 or F2 is one of the arguments (veeery unusual case, don't + // handle it for now). + if (Arg == F1 || Arg == F2) + return false; + + Args.push_back(Arg); + } + + if (F1I->getCalledFunction() == F1 && F2I->getCalledFunction() == F2) { + Args.push_back(getLastArg(NewF)); + } else { + // Need to invert the choice argument + Value *ChoiceArg = getLastArg(NewF); + Constant *One = ConstantInt::get(ChoiceArg->getType(), 1); + Args.push_back(BinaryOperator::Create(Instruction::Xor, ChoiceArg, One, "", + NewFI)); + } + + CallInst *CI = CallInst::Create(NewF, Args); + CI->setCallingConv(NewF->getCallingConv()); + + ReplaceInstWithInst(NewFI, CI); + + return true; +} + +/// Clone F1 into a new function with F1's name + MERGE_SUFFIX. 
Adds an +/// additional i32 argument to the function. +static Function *cloneAndAddArgument(Function *F1, ValueToValueMapTy &VMap) { + LLVMContext &Context = F1->getContext(); + + std::vector ArgTypes; + for (const auto &Arg : F1->args()) + ArgTypes.push_back(Arg.getType()); + ArgTypes.push_back(Type::getInt32Ty(Context)); + + FunctionType *FTy = FunctionType::get(F1->getFunctionType()->getReturnType(), + ArgTypes, + F1->getFunctionType()->isVarArg()); + Function *NewF = Function::Create(FTy, F1->getLinkage(), + F1->getName()+MERGED_SUFFIX); + + insertFunctionAfter(NewF, F1); + + if (F1->hasSection()) + NewF->setSection(F1->getSection()); + + if (F1->getFunctionType()->isVarArg()) + NewF->setCallingConv(CallingConv::C); + else + NewF->setCallingConv(CallingConv::Fast); + + Function::arg_iterator DestI = NewF->arg_begin(); + for (auto &Arg : F1->args()) { + Argument *DestIn = &*DestI; + DestIn->setName(Arg.getName()); // Copy the name over... + VMap[&Arg] = DestIn; // Add mapping to VMap + ++DestI; + } + + // Name the selector argument + (*DestI).setName("__merge_arg"); + + SmallVector Returns; + CloneFunctionInto(NewF, F1, VMap, CloneType::ExtractingFunctions, Returns); + // Set linkage to set visibility to default. + NewF->setLinkage(GlobalValue::InternalLinkage); + + return NewF; +} + +typedef MapVector > + CombinedDiffMap; + +void MergeSimilarFunctions::outlineAndMergeFunctions( + SmallVectorImpl &Fns) { + assert(!Fns.empty() && "Cannot merge empty set of functions"); + + // All comparator instances in Fns share the same F1 + Function *F1 = Fns.front()->getF1(); + + // Clone F1 into new function with an additional i32 argument + ValueToValueMapTy VMap; // maps F1 values -> NewF values + Function *NewF = cloneAndAddArgument(F1, VMap); + + // Combine all the DifferingInstructions maps in Fns into one single map of + // lists to aid the merging process. + // + // Map F1 instruction -> list of F2 instructions indexed by position in Fns. + CombinedDiffMap AllDifferingInstructions; + for (unsigned I = 0, E = Fns.size(); I != E; ++I) { + FunctionComparator *Comp = Fns[I]; + for (MapVector::iterator + DiffI = Comp->DifferingInstructions.begin(), + DiffE = Comp->DifferingInstructions.end(); + DiffI != DiffE; ++DiffI) { + AllDifferingInstructions[DiffI->first].resize(Fns.size()); + AllDifferingInstructions[DiffI->first][I] = DiffI->second; + } + } + + // Merge differing PHI nodes. We need to handle these first because they could + // be affected later on when we split basic blocks, thus making them + // impossible to merge. + for (CombinedDiffMap::const_iterator I = AllDifferingInstructions.begin(), + E = AllDifferingInstructions.end(); + I != E; ++I) { + const PHINode *F1PhiInst = dyn_cast(I->first); + if (!F1PhiInst) + continue; + + const std::vector &F2PhiInsts = I->second; + + mergePHINode(Fns, NewF, VMap, F1PhiInst, F2PhiInsts); + } + + // Merge recursive calls + // + // TODO: We currently only support this optimization for pairs of functions. + // If more than two functions are merged, we mark the recursive calls as + // DifferingInstructions which causes switch statements to be inserted and + // recursive calls going through thunks. It wouldn't be too hard to implement + // self-recursive calls for multi-merges. *Mutually* recursive calls with + // multi-merges are a little trickier - that's why we leave it for now. 
+ if (Fns.size() == 1) { + FunctionComparator *Comp = Fns.front(); + for (MapVector::const_iterator + I = Comp->SelfRefInstructions.begin(), + E = Comp->SelfRefInstructions.end(); + I != E; ++I) { + const Instruction *F1I = I->first; + if (Comp->DifferingInstructions.count(F1I)) + continue; // Differing in other ways too, so deal with it later. + + // Attempt recursive call rewriting + if (isa(F1I)) { + const CallInst *F1Call = cast(F1I); + const CallInst *F2Call = dyn_cast(I->second); + CallInst *NewFCall = dyn_cast(VMap[F1I]); + + if (F1Call && F2Call && NewFCall && + rewriteRecursiveCall(F1Call, F2Call, NewFCall, + Comp->getF1(), Comp->getF2(), NewF)) + continue; + } + + // Can't rewrite it. Mark as differing and insert conditional later + Comp->DifferingInstructions[F1I] = I->second; + } + } else { + for (unsigned I = 0, E = Fns.size(); I != E; ++I) { + FunctionComparator *Comp = Fns[I]; + for (MapVector::const_iterator + II = Comp->SelfRefInstructions.begin(), + EE = Comp->SelfRefInstructions.end(); + II != EE; ++II) { + const Instruction *F1I = II->first; + if (Comp->DifferingInstructions.count(F1I)) + continue; // Differing in other ways too, so deal with it later. + + AllDifferingInstructions[F1I].resize(Fns.size()); + AllDifferingInstructions[F1I][I] = II->second; + } + } + } + + // For each differing instruction, splice basic block, and insert conditional + LLVMContext &Context = NewF->getContext(); + Type *IntPtrType = DL->getIntPtrType(Context); + for (CombinedDiffMap::const_iterator I = AllDifferingInstructions.begin(), + E = AllDifferingInstructions.end(); + I != E; ++I) { + const Instruction *F1Inst = I->first; + const std::vector &F2Insts = I->second; + + assert(VMap.find(F1Inst) != VMap.end() && + "Cannot find differing inst!"); + Instruction *F1InstInNewF = cast(VMap[F1Inst]); + + if (isa(F1InstInNewF)) + continue; // we already handled these above + + insertCondAndRemapInstructions(F1InstInNewF, F2Insts, + NewF, VMap, Fns, IntPtrType, DL); + } + + // Replace functions with thunks + PrintMerges("FNSM", F1, NewF); + writeThunkWithChoice(NewF, F1, 0); + for (unsigned FnI = 0, FnE = Fns.size(); FnI != FnE; ++FnI) { + Function *F2 = Fns[FnI]->getF2(); + PrintMerges("FNSM", F2, NewF); + writeThunkWithChoice(NewF, F2, FnI + 1); + } + NumSimilarFunctionsMerged += Fns.size() + 1; +} + +// For each instruction used by the value, remove() the function that contains +// the instruction. This should happen right before a call to RAUW. 
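+// Constant users (e.g. vtable initializers referencing the function) are
+// walked transitively so that functions reached through them are re-queued as
+// well.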
+void MergeSimilarFunctions::removeUsers(Value *V) { + std::vector Worklist; + Worklist.push_back(V); + while (!Worklist.empty()) { + Value *V = Worklist.back(); + Worklist.pop_back(); + + for (User *U : V->users()) { + if (Instruction *I = dyn_cast(U)) { + Registry.remove(I->getParent()->getParent()); + } else if (isa(U)) { + // do nothing + } else if (Constant *C = dyn_cast(U)) { + for (User *UU : C->users()) + Worklist.push_back(UU); + } + } + } +} + +bool MergeSimilarFunctions::mergeBucket(std::list &Fns, + bool &Changed) { + for (std::list::iterator FnI = Fns.begin(), + FnE = Fns.end(); FnI != FnE; ++FnI) { + if (!FnI->isNew()) + continue; + + if (!FnI->getFunc()) + continue; + + SmallVector DiffMergeCandidates; + + std::list::iterator Fn2I = FnI; + for (++Fn2I; Fn2I != FnE; ++Fn2I) { + if (!Fn2I->getFunc()) + continue; + + assert(FnI->getFunc() != Fn2I->getFunc() && + "Duplicate function in list!"); + + FunctionComparator *Comp = new FunctionComparator(DL, FnI->getFunc(), + Fn2I->getFunc()); + + if (!Comp->compare() || !Comp->isMergeCandidate()) { + delete Comp; + continue; + } + + // Never thunk a strong function to a weak function. + assert(!FnI->getFunc()->isInterposable() || + Fn2I->getFunc()->isInterposable()); + + if (Comp->isExactMatch()) { + // Equiv merge the two functions. Throw away any diff merge + // candidate we might have found so far. + delete Comp; + + LLVM_DEBUG(dbgs() << "- Equiv merge " << FnI->getFunc()->getName() + << " == " << Fn2I->getFunc()->getName() << '\n'); + + PrintMerges("FNEQ", FnI->getFunc(), Fn2I->getFunc()); + + Function *DeleteF = Fn2I->getFunc(); + Registry.remove(DeleteF, /*reanalyze=*/false); + + mergeTwoFunctions(FnI->getFunc(), DeleteF); + + Changed = true; + + // mergeTwoFunctions may have removed functions from this bucket and + // invalidated the iterators. Rescan the whole bucket, continuing + // from the current function (previous ones will have been + // markCompared()) + for (SmallVector::iterator + I = DiffMergeCandidates.begin(), E = DiffMergeCandidates.end(); + I != E; ++I) + delete *I; + + return false; + } else { + DiffMergeCandidates.push_back(Comp); + } + } + + if (!DiffMergeCandidates.empty()) { + // Add to our list of candidates for diff merging + for (SmallVector::iterator + I = DiffMergeCandidates.begin(), E = DiffMergeCandidates.end(); + I != E; ++I) { + Registry.insertCandidate(*I); + } + } + + FnI->markCompared(); + } + + return true; +} + +bool MergeSimilarFunctions::doExhaustiveCompareMerge() { + bool Changed = false; + + // Process buckets with strong functions first. + for (MergeRegistry::FnCompareMap::iterator + BucketsI = Registry.FunctionsToCompare.begin(), + BucketsE = Registry.FunctionsToCompare.end(); + BucketsI != BucketsE; ++BucketsI) { + std::list &Fns = BucketsI->second; + if (Fns.size() < 2 || Fns.front().getFunc()->isInterposable()) + continue; + + LLVM_DEBUG(dbgs() << "Processing strong bucket " << BucketsI->first << " with " + << Fns.size() << " functions\n"); + // Repeatedly scan this bucket, until we find no more functions to equiv + // merge. + while (!mergeBucket(Fns, Changed) && Fns.size() > 1) { + LLVM_DEBUG(dbgs() << "Rescanning bucket.\n"); + } + } + + // Process buckets with weak functions. 
+ for (MergeRegistry::FnCompareMap::iterator + BucketsI = Registry.FunctionsToCompare.begin(), + BucketsE = Registry.FunctionsToCompare.end(); + BucketsI != BucketsE; ++BucketsI) { + std::list &Fns = BucketsI->second; + if (Fns.size() < 2 || !Fns.front().getFunc()->isInterposable()) + continue; + + LLVM_DEBUG(dbgs() << "Processing weak bucket " << BucketsI->first << " with " + << Fns.size() << " functions\n"); + // Repeatedly scan this bucket, until we find no more functions to equiv + // merge. + while (!mergeBucket(Fns, Changed) && Fns.size() > 1) { + LLVM_DEBUG(dbgs() << "Rescanning bucket.\n"); + } + } + + return Changed; +} + +static bool orderComparatorsByMetric(FunctionComparator *Cmp1, + FunctionComparator *Cmp2) { + return (Cmp1->getSimilarityMetric() > Cmp2->getSimilarityMetric()); +} + +bool MergeSimilarFunctions::doDiffMerge() { + if (Registry.FunctionsToMerge.empty()) + return false; + + bool Changed = false; + DenseSet MergedFns; // Functions that have already been merged + Registry.FunctionsToMerge.sort(orderComparatorsByMetric); + + for (std::list::iterator + I = Registry.FunctionsToMerge.begin(), + E = Registry.FunctionsToMerge.end(); + I != E; ++I) { + FunctionComparator *Comp = *I; + Function *F1 = Comp->getF1(); + // Ignore it if we've already merged this fn + if (MergedFns.count(F1) || MergedFns.count(Comp->getF2())) + continue; + + assert(Registry.SimilarFunctions.count(F1) && + "Comparator doesn't exist in SimilarFunctions map"); + + // Look at all functions F that F1 could be merged with. Merge with each F, + // unless there is another function F' that is more similar to F than F1. + MergeRegistry::FnComparatorSet &SimilarFns = Registry.SimilarFunctions[F1]; + SmallVector CurrentMerge; + + for (MergeRegistry::FnComparatorSet::iterator + CandidateI = SimilarFns.begin(), CandidateE = SimilarFns.end(); + CandidateI != CandidateE; ++CandidateI) { + FunctionComparator *Comp2 = *CandidateI; + assert(Comp2->getF1() == F1 && "Inconsistency in SimilarFunctions"); + Function *F2 = Comp2->getF2(); + + // Ignore it if we've already merged this fn + if (MergedFns.count(F2)) + continue; + + // Check whether there is a better merge candidate for F2 + if (Registry.getMaxSimilarity(F2, MergedFns) > + Comp2->getSimilarityMetric()) + continue; + + // Ok, we actually want to merge with F2 + CurrentMerge.push_back(Comp2); + MergedFns.insert(F2); + } + + if (CurrentMerge.empty()) + continue; + + MergedFns.insert(F1); + + NumMultiMerged += CurrentMerge.size(); + + LLVM_DEBUG(dbgs() << "- Multi merge of " << F1->getName() << " with " + << CurrentMerge.size() << " functions.\n"); + + Changed = true; + outlineAndMergeFunctions(CurrentMerge); + } + + return Changed; +} diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -153,6 +153,10 @@ EnableMatrix("enable-matrix", cl::init(false), cl::Hidden, cl::desc("Enable lowering of the matrix intrinsics")); +static cl::opt EnableMergeSimilarFunctions( + "enable-merge-sim-functions", cl::init(false), cl::Hidden, + cl::desc("Enable the Function merging pass (default = on)")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -609,6 +613,11 @@ MPM.add(createOpenMPOptLegacyPass()); MPM.add(createPostOrderFunctionAttrsLegacyPass()); + if (EnableMergeSimilarFunctions) { + auto *Summary = (ImportSummary ? 
ImportSummary : ExportSummary); + MPM.add(createMergeSimilarFunctionsPass(Summary)); + } + if (OptLevel > 2) MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args @@ -820,6 +829,9 @@ if (MergeFunctions) MPM.add(createMergeFunctionsPass()); + if (EnableMergeSimilarFunctions) + MPM.add(createMergeSimilarFunctionsPass()); + // LoopSink pass sinks instructions hoisted by LICM, which serves as a // canonicalization pass that enables other optimizations. As a result, // LoopSink pass needs to be a very late IR pass to avoid undoing LICM @@ -1047,6 +1059,8 @@ // currently it damages debug info. if (MergeFunctions) PM.add(createMergeFunctionsPass()); + if (EnableMergeSimilarFunctions) + PM.add(createMergeSimilarFunctionsPass()); } void PassManagerBuilder::populateThinLTOPassManager( diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4566,6 +4566,10 @@ if (I) return eraseInstFromFunction(*I); } + if (!Call.use_empty() && !Call.isMustTailCall()) + if (Value *ReturnedArg = Call.getReturnedArgOperand()) + return replaceInstUsesWith(Call, ReturnedArg); + if (isAllocLikeFn(&Call, &TLI)) return visitAllocSite(Call); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2760,6 +2760,12 @@ return nullptr; } +static bool isMustTailCall(Value *V) { + if (auto *CI = dyn_cast(V)) + return CI->isMustTailCall(); + return false; +} + Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) { if (RI.getNumOperands() == 0) // ret void return nullptr; @@ -2769,6 +2775,10 @@ if (!VTy->isIntegerTy() || isa(ResultOp)) return nullptr; + // Don't replace result of musttail calls. + if (isMustTailCall(ResultOp)) + return nullptr; + // There might be assume intrinsics dominating this return that completely // determine the value. If so, constant fold it. KnownBits Known = computeKnownBits(ResultOp, 0, &RI); @@ -3484,7 +3494,8 @@ // In general, it is possible for computeKnownBits to determine all bits in // a value even when the operands are not all constants. 
Type *Ty = I->getType(); - if (ExpensiveCombines && !I->use_empty() && Ty->isIntOrIntVectorTy()) { + if (ExpensiveCombines && !I->use_empty() && Ty->isIntOrIntVectorTy() && + !isMustTailCall(I)) { KnownBits Known = computeKnownBits(I, /*Depth*/0, I); if (Known.isConstant()) { Constant *C = ConstantInt::get(Ty, Known.getConstant()); diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -42,7 +43,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/InstIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -1487,6 +1487,9 @@ SmallPtrSet InvisibleToCaller; // Keep track of blocks with throwing instructions not modeled in MemorySSA. SmallPtrSet ThrowingBlocks; + // Post-order numbers for each basic block. Used to figure out if memory + // accesses are executed before another access. + DenseMap PostOrderNumbers; /// Keep track of instructions (partly) overlapping with killing MemoryDefs per /// basic block. @@ -1502,23 +1505,28 @@ DSEState State(F, AA, MSSA, DT, PDT, TLI); // Collect blocks with throwing instructions not modeled in MemorySSA and // alloc-like objects. - for (Instruction &I : instructions(F)) { - if (I.mayThrow() && !MSSA.getMemoryAccess(&I)) - State.ThrowingBlocks.insert(I.getParent()); - - auto *MD = dyn_cast_or_null(MSSA.getMemoryAccess(&I)); - if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit && - hasAnalyzableMemoryWrite(&I, TLI) && isRemovable(&I)) - State.MemDefs.push_back(MD); - - // Track alloca and alloca-like objects. Here we care about objects not - // visible to the caller during function execution. Alloca objects are - // invalid in the caller, for alloca-like objects we ensure that they are - // not captured throughout the function. - if (isa(&I) || - (isAllocLikeFn(&I, &TLI) && !PointerMayBeCaptured(&I, false, true))) - State.InvisibleToCaller.insert(&I); + unsigned PO = 0; + for (BasicBlock *BB : post_order(&F)) { + State.PostOrderNumbers[BB] = PO++; + for (Instruction &I : *BB) { + if (I.mayThrow() && !MSSA.getMemoryAccess(&I)) + State.ThrowingBlocks.insert(I.getParent()); + + auto *MD = dyn_cast_or_null(MSSA.getMemoryAccess(&I)); + if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit && + hasAnalyzableMemoryWrite(&I, TLI) && isRemovable(&I)) + State.MemDefs.push_back(MD); + + // Track alloca and alloca-like objects. Here we care about objects not + // visible to the caller during function execution. Alloca objects are + // invalid in the caller, for alloca-like objects we ensure that they + // are not captured throughout the function. + if (isa(&I) || + (isAllocLikeFn(&I, &TLI) && !PointerMayBeCaptured(&I, false, true))) + State.InvisibleToCaller.insert(&I); + } } + // Treat byval or inalloca arguments the same as Allocas, stores to them are // dead at the end of the function. for (Argument &AI : F.args()) @@ -1593,16 +1601,13 @@ // Find a MemoryDef writing to \p DefLoc and dominating \p Current, with no // read access in between or return None otherwise. 
The returned value may not // (completely) overwrite \p DefLoc. Currently we bail out when we encounter - // any of the following - // * An aliasing MemoryUse (read). - // * A MemoryPHI. + // an aliasing MemoryUse (read). Optional getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *Current, MemoryLocation DefLoc, bool DefVisibleToCaller, int &ScanLimit) const { - MemoryDef *DomDef; - MemoryAccess *StartDef = Current; + MemoryAccess *DomAccess; bool StepAgain; LLVM_DEBUG(dbgs() << " trying to get dominating access for " << *Current << "\n"); @@ -1613,37 +1618,44 @@ if (MSSA.isLiveOnEntryDef(Current)) return None; - MemoryUseOrDef *CurrentUD = dyn_cast(Current); - if (!CurrentUD) - return None; - + if (isa(Current)) { + DomAccess = Current; + break; + } + MemoryUseOrDef *CurrentUD = cast(Current); // Look for access that clobber DefLoc. - MemoryAccess *DomAccess = - MSSA.getSkipSelfWalker()->getClobberingMemoryAccess( - CurrentUD->getDefiningAccess(), DefLoc); - DomDef = dyn_cast(DomAccess); - if (!DomDef || MSSA.isLiveOnEntryDef(DomDef)) + DomAccess = MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(CurrentUD, + DefLoc); + if (MSSA.isLiveOnEntryDef(DomAccess)) return None; + if (isa(DomAccess)) + break; + // Check if we can skip DomDef for DSE. We also require the KillingDef // execute whenever DomDef executes and use post-dominance to ensure that. - if (canSkipDef(DomDef, DefVisibleToCaller) || + + MemoryDef *DomDef = dyn_cast(DomAccess); + if ((DomDef && canSkipDef(DomDef, DefVisibleToCaller)) || !PDT.dominates(KillingDef->getBlock(), DomDef->getBlock())) { StepAgain = true; - Current = DomDef; + Current = DomDef->getDefiningAccess(); } } while (StepAgain); - LLVM_DEBUG(dbgs() << " Checking for reads of " << *DomDef << " (" - << *DomDef->getMemoryInst() << ")\n"); + LLVM_DEBUG({ + dbgs() << " Checking for reads of " << *DomAccess; + if (isa(DomAccess)) + dbgs() << " (" << *cast(DomAccess)->getMemoryInst() << ")\n"; + }); SmallSetVector WorkList; auto PushMemUses = [&WorkList](MemoryAccess *Acc) { for (Use &U : Acc->uses()) WorkList.insert(cast(U.getUser())); }; - PushMemUses(DomDef); + PushMemUses(DomAccess); // Check if DomDef may be read. for (unsigned I = 0; I < WorkList.size(); I++) { @@ -1655,10 +1667,9 @@ return None; } - // Bail out on MemoryPhis for now. if (isa(UseAccess)) { - LLVM_DEBUG(dbgs() << " ... hit MemoryPhi\n"); - return None; + PushMemUses(UseAccess); + continue; } Instruction *UseInst = cast(UseAccess)->getMemoryInst(); @@ -1676,7 +1687,11 @@ return None; } - if (StartDef == UseAccess) + // For the KillingDef we only have to check if it reads the memory + // location. + // TODO: It would probably be better to check for self-reads before + // calling the function. + if (KillingDef == UseAccess) continue; // Check all uses for MemoryDefs, except for defs completely overwriting @@ -1695,8 +1710,8 @@ } } - // No aliasing MemoryUses of DomDef found, DomDef is potentially dead. - return {DomDef}; + // No aliasing MemoryUses of DomAccess found, DomAccess is potentially dead. 
+ return {DomAccess}; } // Delete dead memory defs @@ -1788,10 +1803,10 @@ DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI); // For each store: for (unsigned I = 0; I < State.MemDefs.size(); I++) { - MemoryDef *Current = State.MemDefs[I]; - if (State.SkipStores.count(Current)) + MemoryDef *KillingDef = State.MemDefs[I]; + if (State.SkipStores.count(KillingDef)) continue; - Instruction *SI = cast(Current)->getMemoryInst(); + Instruction *SI = KillingDef->getMemoryInst(); auto MaybeSILoc = State.getLocForWriteEx(SI); if (!MaybeSILoc) { LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for " @@ -1808,22 +1823,54 @@ !PointerMayBeCapturedBefore(DefObj, false, true, SI, &DT)))) DefVisibleToCaller = false; - LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " << *SI - << "\n"); + MemoryAccess *Current = KillingDef; + LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " + << *KillingDef << " (" << *SI << ")\n"); int ScanLimit = MemorySSAScanLimit; - MemoryDef *StartDef = Current; - // Walk MemorySSA upward to find MemoryDefs that might be killed by SI. - while (Optional Next = State.getDomMemoryDef( - StartDef, Current, SILoc, DefVisibleToCaller, ScanLimit)) { + // Worklist of MemoryAccesses that may be killed by KillingDef. + SetVector ToCheck; + ToCheck.insert(KillingDef->getDefiningAccess()); + + // Check if MemoryAccesses in the worklist are killed by KillingDef. + for (unsigned I = 0; I < ToCheck.size(); I++) { + Current = ToCheck[I]; + if (State.SkipStores.count(Current)) + continue; + + Optional Next = State.getDomMemoryDef( + KillingDef, Current, SILoc, DefVisibleToCaller, ScanLimit); + + if (!Next) { + LLVM_DEBUG(dbgs() << " finished walk\n"); + continue; + } + MemoryAccess *DomAccess = *Next; LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DomAccess << "\n"); + if (isa(DomAccess)) { + for (Value *V : cast(DomAccess)->incoming_values()) { + MemoryAccess *IncomingAccess = cast(V); + BasicBlock *IncomingBlock = IncomingAccess->getBlock(); + BasicBlock *PhiBlock = DomAccess->getBlock(); + + // We only consider incoming MemoryAccesses that come before the + // MemoryPhi. Otherwise we could discover candidates that do not + // strictly dominate our starting def. 
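+            // Note: blocks are numbered during a post-order walk, so a block
+            // that precedes the phi block in reverse post-order received a
+            // larger number; hence the '>' comparison below.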
+ if (State.PostOrderNumbers[IncomingBlock] > + State.PostOrderNumbers[PhiBlock]) + ToCheck.insert(IncomingAccess); + } + continue; + } MemoryDef *NextDef = dyn_cast(DomAccess); Instruction *NI = NextDef->getMemoryInst(); LLVM_DEBUG(dbgs() << " def " << *NI << "\n"); - if (!hasAnalyzableMemoryWrite(NI, TLI)) - break; + if (!hasAnalyzableMemoryWrite(NI, TLI)) { + LLVM_DEBUG(dbgs() << " skip, cannot analyze def\n"); + continue; + } if (!isRemovable(NI)) { LLVM_DEBUG(dbgs() << " skip, cannot remove def\n"); @@ -1834,14 +1881,14 @@ // Check for anything that looks like it will be a barrier to further // removal if (State.isDSEBarrier(SI, SILoc, SILocUnd, NI, NILoc)) { - LLVM_DEBUG(dbgs() << " stop, barrier\n"); - break; + LLVM_DEBUG(dbgs() << " skip, barrier\n"); + continue; } // Before we try to remove anything, check for any extra throwing // instructions that block us from DSEing if (State.mayThrowBetween(SI, NI, SILocUnd)) { - LLVM_DEBUG(dbgs() << " stop, may throw!\n"); + LLVM_DEBUG(dbgs() << " skip, may throw!\n"); break; } @@ -1857,14 +1904,14 @@ OverwriteResult OR = isOverwrite(SILoc, NILoc, DL, TLI, DepWriteOffset, InstWriteOffset, NI, IOL, AA, &F); + ToCheck.insert(NextDef->getDefiningAccess()); if (OR == OW_Complete) { LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI << "\n KILLER: " << *SI << '\n'); State.deleteDeadInstruction(NI); ++NumFastStores; MadeChange = true; - } else - Current = NextDef; + } } } diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -165,66 +165,85 @@ } }; - /// Wrapper class representing a matrix as a set of column vectors. - /// All column vectors must have the same vector type. - class ColumnMatrixTy { - SmallVector Columns; + /// Wrapper class representing a matrix as a set of vectors, either in row or + /// column major layout. All vectors must have the same vector type. 
+ class MatrixTy { + SmallVector Vectors; OpInfoTy OpInfo; + bool IsColumnMajor = true; + public: - ColumnMatrixTy() : Columns() {} - ColumnMatrixTy(ArrayRef Cols) - : Columns(Cols.begin(), Cols.end()) {} + MatrixTy() : Vectors() {} + MatrixTy(ArrayRef Vectors) + : Vectors(Vectors.begin(), Vectors.end()) {} + + Value *getVector(unsigned i) const { return Vectors[i]; } + Value *getColumn(unsigned i) const { + assert(isColumnMajor() && "only supported for column-major matrixes"); + return Vectors[i]; + } - Value *getColumn(unsigned i) const { return Columns[i]; } + void setColumn(unsigned i, Value *V) { Vectors[i] = V; } - void setColumn(unsigned i, Value *V) { Columns[i] = V; } + Type *getElementType() { return getVectorTy()->getElementType(); } - Type *getElementType() { - return cast(Columns[0]->getType())->getElementType(); + unsigned getNumColumns() const { + if (isColumnMajor()) + return Vectors.size(); + else { + assert(Vectors.size() > 0 && "Cannot call getNumRows without columns"); + return cast(Vectors[0]->getType())->getNumElements(); + } } - - unsigned getNumColumns() const { return Columns.size(); } unsigned getNumRows() const { - assert(Columns.size() > 0 && "Cannot call getNumRows without columns"); - return cast(Columns[0]->getType())->getNumElements(); + if (isColumnMajor()) { + assert(Vectors.size() > 0 && "Cannot call getNumRows without columns"); + return cast(Vectors[0]->getType())->getNumElements(); + } else + return Vectors.size(); } - const SmallVectorImpl &getColumnVectors() const { return Columns; } + const SmallVectorImpl &getColumnVectors() const { return Vectors; } - SmallVectorImpl &getColumnVectors() { return Columns; } + SmallVectorImpl &getColumnVectors() { return Vectors; } - void addColumn(Value *V) { Columns.push_back(V); } + void addColumn(Value *V) { Vectors.push_back(V); } VectorType *getColumnTy() { - return cast(Columns[0]->getType()); + assert(isColumnMajor() && "only supported for column-major matrixes"); + return getVectorTy(); + } + + VectorType *getVectorTy() { + return cast(Vectors[0]->getType()); } iterator_range::iterator> columns() { - return make_range(Columns.begin(), Columns.end()); + return make_range(Vectors.begin(), Vectors.end()); } /// Embed the columns of the matrix into a flat vector by concatenating /// them. Value *embedInVector(IRBuilder<> &Builder) const { - return Columns.size() == 1 ? Columns[0] - : concatenateVectors(Builder, Columns); + return Vectors.size() == 1 ? Vectors[0] + : concatenateVectors(Builder, Vectors); } - ColumnMatrixTy &addNumLoads(unsigned N) { + MatrixTy &addNumLoads(unsigned N) { OpInfo.NumLoads += N; return *this; } void setNumLoads(unsigned N) { OpInfo.NumLoads = N; } - ColumnMatrixTy &addNumStores(unsigned N) { + MatrixTy &addNumStores(unsigned N) { OpInfo.NumStores += N; return *this; } - ColumnMatrixTy &addNumComputeOps(unsigned N) { + MatrixTy &addNumComputeOps(unsigned N) { OpInfo.NumComputeOps += N; return *this; } @@ -234,6 +253,8 @@ unsigned getNumComputeOps() const { return OpInfo.NumComputeOps; } const OpInfoTy &getOpInfo() const { return OpInfo; } + + bool isColumnMajor() const { return IsColumnMajor; } }; struct ShapeInfo { @@ -274,7 +295,7 @@ SmallVector ToRemove; /// Map from instructions to their produced column matrix. - MapVector Inst2ColumnMatrix; + MapVector Inst2ColumnMatrix; public: LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI, @@ -300,8 +321,8 @@ /// If we lowered \p MatrixVal, just return the cache result column matrix. 
/// Otherwie split the flat vector \p MatrixVal containing a matrix with /// shape \p SI into column vectors. - ColumnMatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI, - IRBuilder<> &Builder) { + MatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI, + IRBuilder<> &Builder) { VectorType *VType = dyn_cast(MatrixVal->getType()); assert(VType && "MatrixVal must be a vector type"); assert(VType->getNumElements() == SI.NumRows * SI.NumColumns && @@ -313,7 +334,7 @@ // vector and split it later. auto Found = Inst2ColumnMatrix.find(MatrixVal); if (Found != Inst2ColumnMatrix.end()) { - ColumnMatrixTy &M = Found->second; + MatrixTy &M = Found->second; // Return the found matrix, if its shape matches the requested shape // information if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns()) @@ -640,11 +661,11 @@ /// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between /// columns. - ColumnMatrixTy loadMatrix(Type *Ty, Value *Ptr, Value *Stride, - ShapeInfo Shape, IRBuilder<> &Builder) { + MatrixTy loadMatrix(Type *Ty, Value *Ptr, Value *Stride, ShapeInfo Shape, + IRBuilder<> &Builder) { auto VType = cast(Ty); Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); - ColumnMatrixTy Result; + MatrixTy Result; // Distance between start of one column and the start of the next for (unsigned C = 0, E = Shape.NumColumns; C < E; ++C) { Value *GEP = @@ -659,9 +680,9 @@ /// Loads a sub-matrix with shape \p ResultShape from a \p R x \p C matrix, /// starting at \p MatrixPtr[I][J]. - ColumnMatrixTy loadMatrix(Value *MatrixPtr, ShapeInfo MatrixShape, unsigned I, - unsigned J, ShapeInfo ResultShape, Type *EltTy, - IRBuilder<> &Builder) { + MatrixTy loadMatrix(Value *MatrixPtr, ShapeInfo MatrixShape, unsigned I, + unsigned J, ShapeInfo ResultShape, Type *EltTy, + IRBuilder<> &Builder) { Value *Offset = Builder.CreateAdd( Builder.CreateMul(Builder.getInt32(J), @@ -703,7 +724,7 @@ /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p /// MatrixPtr[I][J]. - void storeMatrix(const ColumnMatrixTy &StoreVal, Value *MatrixPtr, + void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr, ShapeInfo MatrixShape, unsigned I, unsigned J, Type *EltTy, IRBuilder<> &Builder) { Value *Offset = Builder.CreateAdd( @@ -727,8 +748,8 @@ /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between /// columns. - ColumnMatrixTy storeMatrix(Type *Ty, ColumnMatrixTy StoreVal, Value *Ptr, - Value *Stride, IRBuilder<> &Builder) { + MatrixTy storeMatrix(Type *Ty, MatrixTy StoreVal, Value *Ptr, Value *Stride, + IRBuilder<> &Builder) { auto VType = cast(Ty); Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); for (auto C : enumerate(StoreVal.columns())) { @@ -737,8 +758,8 @@ VType->getElementType(), Builder); createColumnStore(C.value(), GEP, VType->getElementType(), Builder); } - return ColumnMatrixTy().addNumStores(getNumOps(StoreVal.getColumnTy()) * - StoreVal.getNumColumns()); + return MatrixTy().addNumStores(getNumOps(StoreVal.getColumnTy()) * + StoreVal.getNumColumns()); } /// Lower a store instruction with shape information. @@ -764,7 +785,7 @@ /// Extract a column vector of \p NumElts starting at index (\p I, \p J) from /// the matrix \p LM represented as a vector of column vectors. 
- Value *extractVector(const ColumnMatrixTy &LM, unsigned I, unsigned J, + Value *extractVector(const MatrixTy &LM, unsigned I, unsigned J, unsigned NumElts, IRBuilder<> &Builder) { Value *Col = LM.getColumn(J); Value *Undef = UndefValue::get(Col->getType()); @@ -836,7 +857,7 @@ /// cached value when they are lowered. For other users, \p Matrix is /// flattened and the uses are updated to use it. Also marks \p Inst for /// deletion. - void finalizeLowering(Instruction *Inst, ColumnMatrixTy Matrix, + void finalizeLowering(Instruction *Inst, MatrixTy Matrix, IRBuilder<> &Builder) { Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix)); @@ -854,9 +875,8 @@ /// Compute Res += A * B for tile-sized matrices with left-associating /// addition. - void emitChainedMatrixMultiply(ColumnMatrixTy &Result, - const ColumnMatrixTy &A, - const ColumnMatrixTy &B, bool AllowContraction, + void emitChainedMatrixMultiply(MatrixTy &Result, const MatrixTy &A, + const MatrixTy &B, bool AllowContraction, IRBuilder<> &Builder, bool isTiled) { const unsigned VF = std::max( TTI.getRegisterBitWidth(true) / @@ -902,17 +922,15 @@ ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3)); ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4)); - const ColumnMatrixTy &Lhs = - getMatrix(MatMul->getArgOperand(0), LShape, Builder); - const ColumnMatrixTy &Rhs = - getMatrix(MatMul->getArgOperand(1), RShape, Builder); + const MatrixTy &Lhs = getMatrix(MatMul->getArgOperand(0), LShape, Builder); + const MatrixTy &Rhs = getMatrix(MatMul->getArgOperand(1), RShape, Builder); const unsigned R = LShape.NumRows; const unsigned C = RShape.NumColumns; assert(LShape.NumColumns == RShape.NumRows); // Initialize the output - ColumnMatrixTy Result; + MatrixTy Result; for (unsigned J = 0; J < C; ++J) Result.addColumn(UndefValue::get(VectorType::get(EltType, R))); @@ -924,12 +942,12 @@ /// Lowers llvm.matrix.transpose. void LowerTranspose(CallInst *Inst) { - ColumnMatrixTy Result; + MatrixTy Result; IRBuilder<> Builder(Inst); Value *InputVal = Inst->getArgOperand(0); VectorType *VectorTy = cast(InputVal->getType()); ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2)); - ColumnMatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder); + MatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder); for (unsigned Row = 0; Row < ArgShape.NumRows; ++Row) { // Build a single column vector for this row. First initialize it. @@ -989,11 +1007,11 @@ IRBuilder<> Builder(Inst); ShapeInfo &Shape = I->second; - ColumnMatrixTy LoweredLhs = getMatrix(Lhs, Shape, Builder); - ColumnMatrixTy LoweredRhs = getMatrix(Rhs, Shape, Builder); + MatrixTy LoweredLhs = getMatrix(Lhs, Shape, Builder); + MatrixTy LoweredRhs = getMatrix(Rhs, Shape, Builder); // Add each column and store the result back into the opmapping - ColumnMatrixTy Result; + MatrixTy Result; auto BuildColumnOp = [&Builder, Inst](Value *LHS, Value *RHS) { switch (Inst->getOpcode()) { case Instruction::Add: @@ -1035,7 +1053,7 @@ /// Mapping from instructions to column matrixes. It is used to identify /// matrix instructions. - const MapVector &Inst2ColumnMatrix; + const MapVector &Inst2ColumnMatrix; /// Mapping from values to the leaves of all expressions that the value is /// part of. 
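With these hunks the wrapper can hold the matrix either as column vectors or as row vectors and derives its shape from the layout flag instead of assuming columns. A minimal standalone analogue of that layout switch, using a toy Matrix class over std::vector rather than the pass's MatrixTy and IR values:

// Illustrative sketch only: the stored vectors are either the columns
// (column-major) or the rows (row-major) of the matrix, and the shape
// queries flip accordingly, mirroring getNumColumns/getNumRows above.
#include <cassert>
#include <cstdio>
#include <vector>

class Matrix {
  std::vector<std::vector<float>> Vectors;
  bool IsColumnMajor = true;

public:
  Matrix(std::vector<std::vector<float>> Vs, bool ColumnMajor)
      : Vectors(std::move(Vs)), IsColumnMajor(ColumnMajor) {}

  bool isColumnMajor() const { return IsColumnMajor; }

  unsigned getNumColumns() const {
    if (isColumnMajor())
      return Vectors.size();
    assert(!Vectors.empty() && "cannot get the shape of an empty matrix");
    return Vectors[0].size();
  }

  unsigned getNumRows() const {
    if (isColumnMajor()) {
      assert(!Vectors.empty() && "cannot get the shape of an empty matrix");
      return Vectors[0].size();
    }
    return Vectors.size();
  }
};

int main() {
  std::vector<std::vector<float>> Vs = {{1, 2, 3}, {4, 5, 6}};
  Matrix ColMajor(Vs, /*ColumnMajor=*/true);  // interpreted as a 3 x 2 matrix
  Matrix RowMajor(Vs, /*ColumnMajor=*/false); // interpreted as a 2 x 3 matrix
  std::printf("%u x %u vs %u x %u\n", ColMajor.getNumRows(),
              ColMajor.getNumColumns(), RowMajor.getNumRows(),
              RowMajor.getNumColumns());
}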
@@ -1052,7 +1070,7 @@ SmallPtrSet ReusedExprs; ExprLinearizer(const DataLayout &DL, - const MapVector &Inst2ColumnMatrix, + const MapVector &Inst2ColumnMatrix, const DenseMap> &Shared, const SmallSetVector &ExprsInSubprogram, Value *Leaf) @@ -1296,12 +1314,12 @@ /// that multiple leaves can share sub-expressions. Shared subexpressions /// are explicitly marked as shared(). struct RemarkGenerator { - const MapVector &Inst2ColumnMatrix; + const MapVector &Inst2ColumnMatrix; OptimizationRemarkEmitter &ORE; Function &Func; const DataLayout &DL; - RemarkGenerator(const MapVector &Inst2ColumnMatrix, + RemarkGenerator(const MapVector &Inst2ColumnMatrix, OptimizationRemarkEmitter &ORE, Function &Func) : Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), Func(Func), DL(Func.getParent()->getDataLayout()) {} diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -80,10 +80,8 @@ // Clone OldFunc into NewFunc, transforming the old arguments into references to // VMap values. -// void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, - ValueToValueMapTy &VMap, - bool ModuleLevelChanges, + ValueToValueMapTy &VMap, CloneType CT, SmallVectorImpl &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo, ValueMapTypeRemapper *TypeMapper, @@ -101,12 +99,12 @@ NewFunc->copyAttributesFrom(OldFunc); NewFunc->setAttributes(NewAttrs); + RemapFlags RF = + (CT == CloneType::ModuleLevelChanges) ? RF_None : RF_NoModuleLevelChanges; // Fix up the personality function that got copied over. if (OldFunc->hasPersonalityFn()) - NewFunc->setPersonalityFn( - MapValue(OldFunc->getPersonalityFn(), VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer)); + NewFunc->setPersonalityFn(MapValue(OldFunc->getPersonalityFn(), VMap, RF, + TypeMapper, Materializer)); SmallVector NewArgAttrs(NewFunc->arg_size()); AttributeList OldAttrs = OldFunc->getAttributes(); @@ -123,11 +121,11 @@ AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(), OldAttrs.getRetAttributes(), NewArgAttrs)); - bool MustCloneSP = - OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent(); + bool MustCloneSP = CT != CloneType::ExtractingFunctions && + OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent(); DISubprogram *SP = OldFunc->getSubprogram(); if (SP) { - assert(!MustCloneSP || ModuleLevelChanges); + assert(!MustCloneSP || CT == CloneType::ModuleLevelChanges); // Add mappings for some DebugInfo nodes that we don't want duplicated // even if they're distinct. auto &MD = VMap.MD(); @@ -144,10 +142,7 @@ OldFunc->getAllMetadata(MDs); for (auto MD : MDs) { NewFunc->addMetadata( - MD.first, - *MapMetadata(MD.second, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer)); + MD.first, *MapMetadata(MD.second, VMap, RF, TypeMapper, Materializer)); } // When we remap instructions, we want to avoid duplicating inlined @@ -167,7 +162,7 @@ // Create a new basic block and copy instructions into it! BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo, - ModuleLevelChanges ? &DIFinder : nullptr); + CT == CloneType::ModuleLevelChanges ? &DIFinder : nullptr); // Add basic block mapping. VMap[&BB] = CBB; @@ -207,9 +202,7 @@ BB != BE; ++BB) // Loop over all instructions, fixing each one as we find it... 
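The CloneFunctionInto change below swaps the ModuleLevelChanges boolean for a CloneType argument and derives the remap flags once, up front, instead of re-evaluating the boolean at every call site. A hedged sketch of that pattern; only the CloneType enumerators are taken from the patch, while the RemapFlag values and remapFlagsFor are stand-ins for illustration:

// Illustrative sketch only: centralise the flag selection on the enum.
#include <cstdio>

enum RemapFlag { RF_None = 0, RF_NoModuleLevelChanges = 1 }; // stand-ins

enum class CloneType {
  InvalidCloneType,
  ModuleLevelChanges,
  ExtractingFunctions,
};

RemapFlag remapFlagsFor(CloneType CT) {
  return CT == CloneType::ModuleLevelChanges ? RF_None
                                             : RF_NoModuleLevelChanges;
}

int main() {
  for (CloneType CT : {CloneType::InvalidCloneType,
                       CloneType::ModuleLevelChanges,
                       CloneType::ExtractingFunctions})
    std::printf("flags = %d\n", remapFlagsFor(CT));
}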
for (Instruction &II : *BB) - RemapInstruction(&II, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer); + RemapInstruction(&II, VMap, RF, TypeMapper, Materializer); // Register all DICompileUnits of the old parent module in the new parent module auto* OldModule = OldFunc->getParent(); @@ -262,8 +255,9 @@ } SmallVector Returns; // Ignore returns cloned. - CloneFunctionInto(NewF, F, VMap, F->getSubprogram() != nullptr, Returns, "", - CodeInfo); + CloneType CT = F->getSubprogram() != nullptr ? CloneType::ModuleLevelChanges + : CloneType::InvalidCloneType; + CloneFunctionInto(NewF, F, VMap, CT, Returns, "", CodeInfo); return NewF; } diff --git a/llvm/lib/Transforms/Utils/CloneModule.cpp b/llvm/lib/Transforms/Utils/CloneModule.cpp --- a/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -161,7 +161,7 @@ } SmallVector Returns; // Ignore returns cloned. - CloneFunctionInto(F, &I, VMap, /*ModuleLevelChanges=*/true, Returns); + CloneFunctionInto(F, &I, VMap, CloneType::ModuleLevelChanges, Returns); if (I.hasPersonalityFn()) F->setPersonalityFn(MapValue(I.getPersonalityFn(), VMap)); diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -24,15 +24,15 @@ ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-DAG: v_lshr_b32_e64 v0, s32, 6 -; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI-NOT: v_mov ; CI: ds_write_b32 v0, v0 +; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI-NEXT: v_add_i32_e{{32|64}} v0, {{s\[[0-9]+:[0-9]+\]|vcc}}, 4, [[SCALED]] ; CI-NEXT: ds_write_b32 v0, v0 ; GFX9: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-NEXT: ds_write_b32 v0, v0 ; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9-DAG: ds_write_b32 v0, v0 ; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] ; GFX9-NEXT: ds_write_b32 v0, v0 define void @func_mov_fi_i32_offset() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir @@ -115,9 +115,9 @@ ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK: $sgpr28 = S_MOV_B32 8192 - ; CHECK: $vgpr2, dead $sgpr28_sgpr29 = V_ADD_I32_e64 killed $sgpr28, killed $vgpr3, 0, implicit $exec + ; CHECK: $vgpr2, dead $sgpr28_sgpr29 = V_ADD_I32_e64 killed $sgpr28, killed $vgpr2, 0, implicit $exec ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, 
implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 @@ -154,9 +154,9 @@ ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31 - ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK: $vcc_lo = S_MOV_B32 8192 - ; CHECK: $vgpr2, dead $vcc = V_ADD_I32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec + ; CHECK: $vgpr2, dead $vcc = V_ADD_I32_e64 killed $vcc_lo, killed $vgpr2, 0, implicit $exec ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -28,8 +28,8 @@ ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; CHECK: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec + ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, 
implicit $exec + ; CHECK: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr2, implicit $exec ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -0,0 +1,60 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GFX9 %s + +# Test case where spilling a VGPR to an emergency slot is needed during frame index elimination. + +--- +name: pei_scavenge_vgpr_spill +tracksRegLiveness: true + +stack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 8192 } + - { id: 1, type: default, offset: 0, size: 4, alignment: 8192 } + +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + frameOffsetReg: $sgpr33 + stackPtrOffsetReg: $sgpr32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 + + ; GFX8-LABEL: name: pei_scavenge_vgpr_spill + ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 + ; GFX8: $vgpr2 = V_WRITELANE_B32_vi $sgpr33, 0, undef $vgpr2 + ; GFX8: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc + ; GFX8: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; GFX8: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc + ; GFX8: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc + ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5) + ; GFX8: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; GFX8: $vcc_lo = S_MOV_B32 8192 + ; GFX8: $vgpr3, dead $vcc = V_ADD_I32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec + ; GFX8: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec + ; GFX8: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc + ; GFX8: $sgpr33 = V_READLANE_B32_vi $vgpr2, 0 + ; GFX8: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc + ; GFX8: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) + ; GFX8: S_ENDPGM 0, csr_amdgpu_allvgprs + ; GFX9-LABEL: name: pei_scavenge_vgpr_spill + ; GFX9: 
liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 + ; GFX9: $vgpr2 = V_WRITELANE_B32_vi $sgpr33, 0, undef $vgpr2 + ; GFX9: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc + ; GFX9: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; GFX9: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc + ; GFX9: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc + ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5) + ; GFX9: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; GFX9: $vgpr3 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec + ; GFX9: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec + ; GFX9: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc + ; GFX9: $sgpr33 = V_READLANE_B32_vi $vgpr2, 0 + ; GFX9: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc + ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) + ; GFX9: S_ENDPGM 0, csr_amdgpu_allvgprs + $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec + S_ENDPGM 0, csr_amdgpu_allvgprs +... 
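The new pei-scavenge-vgpr-spill.mir test covers the case where frame index elimination needs a scratch VGPR but none is free, so the scavenger spills one to an emergency stack slot around the use and reloads it afterwards (the BUFFER_STORE_DWORD_OFFSET/BUFFER_LOAD_DWORD_OFFSET pair in the checks). A very rough conceptual model of that spill-and-restore fallback, not the PrologEpilogInserter or AMDGPU implementation:

// Illustrative sketch only: "no free register -> spill a victim to the
// emergency slot, use it as scratch, reload the victim afterwards".
#include <cstdio>
#include <vector>

struct ToyScavenger {
  std::vector<int> FreeRegs; // registers known to be unused
  int EmergencySlot = 0;     // reserved stack slot (holds one value)

  int scavenge(int VictimReg, int VictimValue, bool &Spilled) {
    if (!FreeRegs.empty()) {
      Spilled = false;
      int R = FreeRegs.back();
      FreeRegs.pop_back();
      return R;
    }
    Spilled = true;
    EmergencySlot = VictimValue; // models the buffer_store to the slot
    return VictimReg;
  }

  int reload() const { return EmergencySlot; } // models the buffer_load
};

int main() {
  ToyScavenger S; // no free registers: forces the spill path
  bool Spilled = false;
  int Scratch = S.scavenge(/*VictimReg=*/3, /*VictimValue=*/42, Spilled);
  std::printf("scratch v%d, spilled=%d\n", Scratch, Spilled);
  if (Spilled)
    std::printf("restored value %d\n", S.reload());
}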
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -246,7 +246,7 @@ ; GFX908-DAG v_accvgpr_read_b32 ; GCN: NumVgprs: 256 -; GFX900: ScratchSize: 644 +; GFX900: ScratchSize: 708 ; GFX908-FIXME: ScratchSize: 0 ; GCN: VGPRBlocks: 63 ; GCN: NumVGPRsForWavesPerEU: 256 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -24,7 +24,7 @@ ; OFFREG is offset system SGPR ; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill ; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload -; GCN: NumVgprs: 256 +; GCN: NumVgprs: 255 ; GCN: ScratchSize: 1536 define amdgpu_vs void @main([9 x <4 x i32>] addrspace(4)* inreg %arg, [17 x <4 x i32>] addrspace(4)* inreg %arg1, [17 x <4 x i32>] addrspace(4)* inreg %arg2, [34 x <8 x i32>] addrspace(4)* inreg %arg3, [16 x <4 x i32>] addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -0,0 +1,1491 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s + +define arm_aapcs_vfpcc void @test_fadd(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fadd: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vadd.f16 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fadd fast <8 x half> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, 
%entry + ret void +} + +define arm_aapcs_vfpcc void @test_fadd_r(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fadd_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vadd.f16 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB1_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fadd fast <8 x half> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmul(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fmul: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vmul.f16 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB2_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; 
preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmul_r(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fmul_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vmul.f16 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB3_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fsub(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fsub: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vsub.f16 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB4_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fsub fast <8 x half> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + 
+for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fsub_r(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fsub_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vsub.f16 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB5_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fsub fast <8 x half> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define arm_aapcs_vfpcc void @test_fmas(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmas: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfma.f16 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: bne .LBB6_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = getelementptr inbounds half, half* %B, i32 %index + %4 = bitcast half* %3 to <8 x half>* + %wide.load12 = load <8 x half>, <8 x half>* %4, align 4 + %5 
= fmul fast <8 x half> %wide.load12, %wide.load + %6 = fadd fast <8 x half> %5, %broadcast.splat14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmas_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmas_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB7_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfma.f16 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: bne .LBB7_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = getelementptr inbounds half, half* %B, i32 %index + %4 = bitcast half* %3 to <8 x half>* + %wide.load12 = load <8 x half>, <8 x half>* %4, align 4 + %5 = fmul fast <8 x half> %wide.load12, %wide.load + %6 = fadd fast <8 x half> %broadcast.splat14, %5 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fma(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fma: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfma.f16 q2, q1, q0 +; CHECK-NEXT: vstrb.8 q2, [r3], #16 +; CHECK-NEXT: bne .LBB8_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label 
%for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %wide.load, %broadcast.splat13 + %4 = getelementptr inbounds half, half* %B, i32 %index + %5 = bitcast half* %4 to <8 x half>* + %wide.load14 = load <8 x half>, <8 x half>* %5, align 4 + %6 = fadd fast <8 x half> %3, %wide.load14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fma_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fma_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB9_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfma.f16 q2, q0, q1 +; CHECK-NEXT: vstrb.8 q2, [r3], #16 +; CHECK-NEXT: bne .LBB9_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %broadcast.splat13, %wide.load + %4 = getelementptr inbounds half, half* %B, i32 %index + %5 = bitcast half* %4 to <8 x half>* + %wide.load14 = load <8 x half>, <8 x half>* %5, align 4 + %6 = fadd fast <8 x half> %3, %wide.load14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define arm_aapcs_vfpcc void @test_fmss(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmss: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: 
bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: vneg.f16 q0, q0 +; CHECK-NEXT: .LBB10_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfma.f16 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: bne .LBB10_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = getelementptr inbounds half, half* %B, i32 %index + %4 = bitcast half* %3 to <8 x half>* + %wide.load12 = load <8 x half>, <8 x half>* %4, align 4 + %5 = fmul fast <8 x half> %wide.load12, %wide.load + %6 = fsub fast <8 x half> %5, %broadcast.splat14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmss_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmss_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB11_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfms.f16 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: bne .LBB11_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = getelementptr inbounds half, half* %B, i32 %index + %4 = bitcast half* %3 to <8 x half>* + %wide.load12 = load <8 x half>, <8 x half>* %4, align 4 + %5 
= fmul fast <8 x half> %wide.load12, %wide.load + %6 = fsub fast <8 x half> %broadcast.splat14, %5 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fms(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fms: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB12_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vneg.f16 q1, q1 +; CHECK-NEXT: vfma.f16 q1, q2, q0 +; CHECK-NEXT: vstrb.8 q1, [r3], #16 +; CHECK-NEXT: bne .LBB12_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %wide.load, %broadcast.splat13 + %4 = getelementptr inbounds half, half* %B, i32 %index + %5 = bitcast half* %4 to <8 x half>* + %wide.load14 = load <8 x half>, <8 x half>* %5, align 4 + %6 = fsub fast <8 x half> %3, %wide.load14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fms_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fms_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB13_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vneg.f16 q1, q1 +; CHECK-NEXT: vfma.f16 q1, q0, q2 +; CHECK-NEXT: vstrb.8 q1, [r3], #16 +; CHECK-NEXT: bne .LBB13_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, 
label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %broadcast.splat13, %wide.load + %4 = getelementptr inbounds half, half* %B, i32 %index + %5 = bitcast half* %4 to <8 x half>* + %wide.load14 = load <8 x half>, <8 x half>* %5, align 4 + %6 = fsub fast <8 x half> %3, %wide.load14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define dso_local void @test_nested(half* noalias nocapture %pInT1, half* noalias nocapture readonly %pOutT1, half* noalias nocapture readonly %pPRT_in, half* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l, half *%ina) local_unnamed_addr #0 { +; CHECK-LABEL: test_nested: +; CHECK: @ %bb.0: @ %for.body.us.preheader +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: ldrd lr, r12, [sp, #20] +; CHECK-NEXT: lsl.w r3, r12, #1 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB14_1: @ %for.body.us +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB14_2 Depth 2 +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: mov r5, r12 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vdup.16 q0, r4 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: .LBB14_2: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: adds r6, r0, r4 +; CHECK-NEXT: adds r7, r2, r4 +; CHECK-NEXT: vldrw.u32 q1, [r7] +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: adds r4, #16 +; CHECK-NEXT: subs r5, #8 +; CHECK-NEXT: vfms.f16 q2, q1, q0 +; CHECK-NEXT: vstrw.32 q2, [r6] +; CHECK-NEXT: bne .LBB14_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us +; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1 +; CHECK-NEXT: add r0, r3 +; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: adds r1, #2 +; CHECK-NEXT: le lr, .LBB14_1 +; CHECK-NEXT: @ %bb.4: @ %for.end14 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +for.body.us.preheader: + %in = load half, half* %ina + %cmp = icmp sgt i32 %numRows, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp1 = icmp sgt i32 %numCols, 0 + tail call void @llvm.assume(i1 %cmp1) + %rem = and i32 %numCols, 7 + %cmp2 = icmp eq i32 %rem, 0 + tail call void @llvm.assume(i1 %cmp2) + %cmp3 = icmp slt i32 %l, %numCols + tail call void @llvm.assume(i1 %cmp3) + br label %for.body.us + +for.body.us: ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader + %pInT1.addr.038.us = phi half* [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ] + %i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ] + %pOutT1.addr.036.us = phi half* [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ] + %pPRT_in.addr.035.us = phi half* [ %scevgep, 
%for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ] + %scevgep = getelementptr half, half* %pPRT_in.addr.035.us, i32 %numCols + %0 = load half, half* %pOutT1.addr.036.us, align 4 + %broadcast.splatinsert47 = insertelement <8 x half> undef, half %0, i32 0 + %broadcast.splat48 = shufflevector <8 x half> %broadcast.splatinsert47, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %for.body.us + %index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ] + %next.gep = getelementptr half, half* %pInT1.addr.038.us, i32 %index + %next.gep45 = getelementptr half, half* %pPRT_in.addr.035.us, i32 %index + %1 = bitcast half* %next.gep to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %1, align 4 + %2 = bitcast half* %next.gep45 to <8 x half>* + %wide.load46 = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %wide.load46, %broadcast.splat48 + %4 = fsub fast <8 x half> %wide.load, %3 + store <8 x half> %4, <8 x half>* %1, align 4 + %index.next = add i32 %index, 8 + %5 = icmp eq i32 %index.next, %numCols + br i1 %5, label %for.cond6.for.end_crit_edge.us, label %vector.body + +for.cond6.for.end_crit_edge.us: ; preds = %vector.body + %incdec.ptr.us = getelementptr inbounds half, half* %pOutT1.addr.036.us, i32 1 + %scevgep40 = getelementptr half, half* %pInT1.addr.038.us, i32 %numCols + %inc13.us = add nuw nsw i32 %i.037.us, 1 + %exitcond41 = icmp eq i32 %inc13.us, %numRows + br i1 %exitcond41, label %for.end14, label %for.body.us + +for.end14: ; preds = %for.cond6.for.end_crit_edge.us + ret void +} + +%struct.arm_fir_instance_f32 = type { i16, half*, half* } +define void @arm_fir_f32_1_4_mve(%struct.arm_fir_instance_f32* nocapture readonly %S, half* nocapture readonly %pSrc, half* %pDst, i32 %blockSize) { +; CHECK-LABEL: arm_fir_f32_1_4_mve: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: ldrh.w r9, [r0] +; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: sub.w r7, r9, #1 +; CHECK-NEXT: cmp r7, #3 +; CHECK-NEXT: bhi .LBB15_6 +; CHECK-NEXT: @ %bb.1: @ %if.then +; CHECK-NEXT: ldr r6, [r0, #8] +; CHECK-NEXT: vldr.16 s0, [r6] +; CHECK-NEXT: vmov lr, s0 +; CHECK-NEXT: vldr.16 s0, [r6, #2] +; CHECK-NEXT: vdup.16 q3, lr +; CHECK-NEXT: lsr.w lr, r3, #2 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vldr.16 s0, [r6, #4] +; CHECK-NEXT: vdup.16 q2, r5 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vldr.16 s0, [r6, #6] +; CHECK-NEXT: vdup.16 q1, r4 +; CHECK-NEXT: add.w r4, r12, r7, lsl #1 +; CHECK-NEXT: vmov r6, s0 +; CHECK-NEXT: vdup.16 q0, r6 +; CHECK-NEXT: wls lr, lr, .LBB15_5 +; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph +; CHECK-NEXT: bic r10, r3, #3 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: add.w r8, r2, r10, lsl #1 +; CHECK-NEXT: .LBB15_3: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r5, r1, r6 +; CHECK-NEXT: vldrw.u32 q4, [r5] +; CHECK-NEXT: adds r5, r4, r6 +; CHECK-NEXT: vstrw.32 q4, [r5] +; CHECK-NEXT: add.w r5, r12, r6 +; CHECK-NEXT: vldrw.u32 q4, [r5] +; CHECK-NEXT: adds r7, r5, #2 +; CHECK-NEXT: vldrw.u32 q5, [r7] +; CHECK-NEXT: vmul.f16 q4, q4, q3 +; CHECK-NEXT: vfma.f16 q4, q5, q2 +; CHECK-NEXT: vldrw.u32 q5, [r5, #4] +; CHECK-NEXT: adds r5, #6 +; CHECK-NEXT: vfma.f16 q4, q5, q1 +; CHECK-NEXT: vldrw.u32 q5, [r5] +; CHECK-NEXT: adds r5, r2, r6 +; CHECK-NEXT: adds r6, #8 +; 
CHECK-NEXT: vfma.f16 q4, q5, q0 +; CHECK-NEXT: vstrw.32 q4, [r5] +; CHECK-NEXT: le lr, .LBB15_3 +; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit +; CHECK-NEXT: add r4, r6 +; CHECK-NEXT: add.w r12, r12, r10, lsl #1 +; CHECK-NEXT: add.w r1, r1, r10, lsl #1 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: .LBB15_5: @ %while.end +; CHECK-NEXT: and r7, r3, #3 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vctp.16 r7 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q4, [r4] +; CHECK-NEXT: vldrw.u32 q4, [r12] +; CHECK-NEXT: add.w r1, r12, #2 +; CHECK-NEXT: vmul.f16 q3, q4, q3 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: add.w r1, r12, #6 +; CHECK-NEXT: vfma.f16 q3, q4, q2 +; CHECK-NEXT: vldrw.u32 q2, [r12, #4] +; CHECK-NEXT: vfma.f16 q3, q2, q1 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vfma.f16 q3, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q3, [r2] +; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: .LBB15_6: @ %if.end +; CHECK-NEXT: add.w r0, r12, r3, lsl #1 +; CHECK-NEXT: lsr.w lr, r9, #2 +; CHECK-NEXT: wls lr, lr, .LBB15_10 +; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader +; CHECK-NEXT: bic r2, r9, #3 +; CHECK-NEXT: adds r1, r2, r3 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: add.w r1, r12, r1, lsl #1 +; CHECK-NEXT: .LBB15_8: @ %while.body51 +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0], #8 +; CHECK-NEXT: vstrb.8 q0, [r3], #8 +; CHECK-NEXT: le lr, .LBB15_8 +; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit +; CHECK-NEXT: add.w r12, r12, r2, lsl #1 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: .LBB15_10: @ %while.end55 +; CHECK-NEXT: ands r1, r9, #3 +; CHECK-NEXT: beq .LBB15_12 +; CHECK-NEXT: @ %bb.11: @ %if.then59 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vctp.16 r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q0, [r12] +; CHECK-NEXT: .LBB15_12: @ %if.end61 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +entry: + %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 + %0 = load half*, half** %pState1, align 4 + %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2 + %1 = load half*, half** %pCoeffs2, align 4 + %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0 + %2 = load i16, i16* %numTaps3, align 4 + %conv = zext i16 %2 to i32 + %sub = add nsw i32 %conv, -1 + %cmp = icmp ult i32 %sub, 4 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds half, half* %0, i32 %sub + %incdec.ptr = getelementptr inbounds half, half* %1, i32 1 + %3 = load half, half* %1, align 4 + %incdec.ptr6 = getelementptr inbounds half, half* %1, i32 2 + %4 = load half, half* %incdec.ptr, align 4 + %incdec.ptr7 = getelementptr inbounds half, half* %1, i32 3 + %5 = load half, half* %incdec.ptr6, align 4 + %6 = load half, half* %incdec.ptr7, align 4 + %shr = lshr i32 %blockSize, 2 + %cmp9146 = icmp eq i32 %shr, 0 + %.pre161 = insertelement <8 x half> undef, half %3, i32 0 + %.pre162 = shufflevector <8 x half> %.pre161, <8 x half> undef, <8 x i32> zeroinitializer + %.pre163 = insertelement <8 x half> undef, half %4, i32 0 + %.pre164 = shufflevector <8 x half> %.pre163, <8 x half> undef, <8 x i32> zeroinitializer + %.pre165 = insertelement <8 x half> undef, half %5, i32 0 + %.pre166 = shufflevector <8 x half> %.pre165, <8 x half> undef, <8 x i32> zeroinitializer + %.pre167 = insertelement <8 x half> undef, half 
%6, i32 0 + %.pre168 = shufflevector <8 x half> %.pre167, <8 x half> undef, <8 x i32> zeroinitializer + br i1 %cmp9146, label %while.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %if.then + %7 = and i32 %blockSize, -4 + %scevgep158 = getelementptr half, half* %pDst, i32 %7 + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %pStateCur.0151 = phi half* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %pSamples.0150 = phi half* [ %0, %while.body.lr.ph ], [ %add.ptr24, %while.body ] + %pOutput.0149 = phi half* [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ] + %pTempSrc.0148 = phi half* [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ] + %blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ] + %8 = bitcast half* %pTempSrc.0148 to <8 x half>* + %9 = load <8 x half>, <8 x half>* %8, align 4 + %10 = bitcast half* %pStateCur.0151 to <8 x half>* + store <8 x half> %9, <8 x half>* %10, align 4 + %add.ptr = getelementptr inbounds half, half* %pStateCur.0151, i32 4 + %add.ptr11 = getelementptr inbounds half, half* %pTempSrc.0148, i32 4 + %11 = bitcast half* %pSamples.0150 to <8 x half>* + %12 = load <8 x half>, <8 x half>* %11, align 4 + %13 = fmul fast <8 x half> %12, %.pre162 + %arrayidx12 = getelementptr inbounds half, half* %pSamples.0150, i32 1 + %14 = bitcast half* %arrayidx12 to <8 x half>* + %15 = load <8 x half>, <8 x half>* %14, align 4 + %mul = fmul fast <8 x half> %15, %.pre164 + %add = fadd fast <8 x half> %mul, %13 + %arrayidx13 = getelementptr inbounds half, half* %pSamples.0150, i32 2 + %16 = bitcast half* %arrayidx13 to <8 x half>* + %17 = load <8 x half>, <8 x half>* %16, align 4 + %mul16 = fmul fast <8 x half> %17, %.pre166 + %add17 = fadd fast <8 x half> %add, %mul16 + %arrayidx18 = getelementptr inbounds half, half* %pSamples.0150, i32 3 + %18 = bitcast half* %arrayidx18 to <8 x half>* + %19 = load <8 x half>, <8 x half>* %18, align 4 + %mul21 = fmul fast <8 x half> %19, %.pre168 + %add22 = fadd fast <8 x half> %add17, %mul21 + %20 = bitcast half* %pOutput.0149 to <8 x half>* + store <8 x half> %add22, <8 x half>* %20, align 4 + %add.ptr23 = getelementptr inbounds half, half* %pOutput.0149, i32 4 + %add.ptr24 = getelementptr inbounds half, half* %pSamples.0150, i32 4 + %dec = add nsw i32 %blkCnt.0147, -1 + %cmp9 = icmp eq i32 %dec, 0 + br i1 %cmp9, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + %scevgep157 = getelementptr half, half* %pSrc, i32 %7 + %scevgep159 = getelementptr half, half* %0, i32 %7 + br label %while.end + +while.end: ; preds = %if.then, %while.end.loopexit + %pTempSrc.0.lcssa = phi half* [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ] + %pOutput.0.lcssa = phi half* [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ] + %pSamples.0.lcssa = phi half* [ %scevgep159, %while.end.loopexit ], [ %0, %if.then ] + %pStateCur.0.lcssa = phi half* [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ] + %and = and i32 %blockSize, 3 + %21 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and) + %22 = bitcast half* %pTempSrc.0.lcssa to <8 x half>* + %23 = load <8 x half>, <8 x half>* %22, align 4 + %24 = bitcast half* %pStateCur.0.lcssa to <8 x half>* + tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %23, <8 x half>* %24, i32 4, <8 x i1> %21) + %25 = bitcast half* %pSamples.0.lcssa to <8 x half>* + %26 = load <8 x half>, <8 x half>* %25, align 4 + %27 = fmul fast <8 x half> %26, %.pre162 + %arrayidx29 = 
getelementptr inbounds half, half* %pSamples.0.lcssa, i32 1 + %28 = bitcast half* %arrayidx29 to <8 x half>* + %29 = load <8 x half>, <8 x half>* %28, align 4 + %mul32 = fmul fast <8 x half> %29, %.pre164 + %add33 = fadd fast <8 x half> %mul32, %27 + %arrayidx34 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 2 + %30 = bitcast half* %arrayidx34 to <8 x half>* + %31 = load <8 x half>, <8 x half>* %30, align 4 + %mul37 = fmul fast <8 x half> %31, %.pre166 + %add38 = fadd fast <8 x half> %add33, %mul37 + %arrayidx39 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 3 + %32 = bitcast half* %arrayidx39 to <8 x half>* + %33 = load <8 x half>, <8 x half>* %32, align 4 + %mul42 = fmul fast <8 x half> %33, %.pre168 + %add43 = fadd fast <8 x half> %add38, %mul42 + %34 = bitcast half* %pOutput.0.lcssa to <8 x half>* + tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %add43, <8 x half>* %34, i32 4, <8 x i1> %21) + %.pre = load half*, half** %pState1, align 4 + br label %if.end + +if.end: ; preds = %while.end, %entry + %35 = phi half* [ %.pre, %while.end ], [ %0, %entry ] + %arrayidx45 = getelementptr inbounds half, half* %35, i32 %blockSize + %shr47 = lshr i32 %conv, 2 + %cmp49141 = icmp eq i32 %shr47, 0 + br i1 %cmp49141, label %while.end55, label %while.body51.preheader + +while.body51.preheader: ; preds = %if.end + %36 = and i32 %conv, 65532 + %37 = add i32 %36, %blockSize + %scevgep = getelementptr half, half* %35, i32 %37 + br label %while.body51 + +while.body51: ; preds = %while.body51.preheader, %while.body51 + %pTempSrc.1144 = phi half* [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ] + %pTempDest.0143 = phi half* [ %add.ptr53, %while.body51 ], [ %35, %while.body51.preheader ] + %blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ] + %38 = bitcast half* %pTempSrc.1144 to <8 x half>* + %39 = load <8 x half>, <8 x half>* %38, align 4 + %40 = bitcast half* %pTempDest.0143 to <8 x half>* + store <8 x half> %39, <8 x half>* %40, align 4 + %add.ptr52 = getelementptr inbounds half, half* %pTempSrc.1144, i32 4 + %add.ptr53 = getelementptr inbounds half, half* %pTempDest.0143, i32 4 + %dec54 = add nsw i32 %blkCnt.1142, -1 + %cmp49 = icmp eq i32 %dec54, 0 + br i1 %cmp49, label %while.end55.loopexit, label %while.body51 + +while.end55.loopexit: ; preds = %while.body51 + %scevgep156 = getelementptr half, half* %35, i32 %36 + br label %while.end55 + +while.end55: ; preds = %while.end55.loopexit, %if.end + %pTempDest.0.lcssa = phi half* [ %35, %if.end ], [ %scevgep156, %while.end55.loopexit ] + %pTempSrc.1.lcssa = phi half* [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ] + %and56 = and i32 %conv, 3 + %cmp57 = icmp eq i32 %and56, 0 + br i1 %cmp57, label %if.end61, label %if.then59 + +if.then59: ; preds = %while.end55 + %41 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and56) + %42 = bitcast half* %pTempSrc.1.lcssa to <8 x half>* + %43 = load <8 x half>, <8 x half>* %42, align 4 + %44 = bitcast half* %pTempDest.0.lcssa to <8 x half>* + tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %43, <8 x half>* %44, i32 4, <8 x i1> %41) + br label %if.end61 + +if.end61: ; preds = %while.end55, %if.then59 + ret void +} + + +define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* nocapture readonly %pSrc, half* nocapture %pDst, i32 %blockSize) { +; CHECK-LABEL: fir: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, 
r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: cmp r3, #8 +; CHECK-NEXT: blo.w .LBB16_12 +; CHECK-NEXT: @ %bb.1: @ %if.then +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: cmp.w r7, r3, lsr #2 +; CHECK-NEXT: beq.w .LBB16_12 +; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph +; CHECK-NEXT: ldrh.w r11, [r0] +; CHECK-NEXT: mov.w r8, #1 +; CHECK-NEXT: ldrd r5, r12, [r0, #4] +; CHECK-NEXT: lsrs r3, r3, #2 +; CHECK-NEXT: sub.w r0, r11, #8 +; CHECK-NEXT: and r10, r0, #7 +; CHECK-NEXT: add.w r7, r0, r0, lsr #29 +; CHECK-NEXT: add.w r0, r10, #1 +; CHECK-NEXT: asrs r6, r7, #3 +; CHECK-NEXT: cmp r6, #1 +; CHECK-NEXT: it gt +; CHECK-NEXT: asrgt.w r8, r7, #3 +; CHECK-NEXT: add.w r7, r5, r11, lsl #1 +; CHECK-NEXT: subs r4, r7, #2 +; CHECK-NEXT: rsb.w r7, r11, #0 +; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: add.w r7, r12, #16 +; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: b .LBB16_4 +; CHECK-NEXT: .LBB16_3: @ %while.end +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: subs r3, #1 +; CHECK-NEXT: vstrb.8 q0, [r2], #8 +; CHECK-NEXT: add.w r0, r9, r0, lsl #1 +; CHECK-NEXT: add.w r5, r0, #8 +; CHECK-NEXT: beq.w .LBB16_12 +; CHECK-NEXT: .LBB16_4: @ %while.body +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB16_6 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 +; CHECK-NEXT: vldrw.u32 q0, [r1], #8 +; CHECK-NEXT: vldr.16 s7, [r12] +; CHECK-NEXT: vldr.16 s4, [r12, #14] +; CHECK-NEXT: vldr.16 s6, [r12, #12] +; CHECK-NEXT: vldr.16 s8, [r12, #10] +; CHECK-NEXT: vldr.16 s10, [r12, #8] +; CHECK-NEXT: vldr.16 s12, [r12, #6] +; CHECK-NEXT: vldr.16 s14, [r12, #4] +; CHECK-NEXT: vldr.16 s5, [r12, #2] +; CHECK-NEXT: vstrb.8 q0, [r4], #8 +; CHECK-NEXT: vldrw.u32 q0, [r5] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: adds r6, r5, #2 +; CHECK-NEXT: add.w r9, r5, #16 +; CHECK-NEXT: vmul.f16 q0, q0, r0 +; CHECK-NEXT: vldrw.u32 q4, [r6] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: adds r6, r5, #6 +; CHECK-NEXT: vfma.f16 q0, q4, r0 +; CHECK-NEXT: vldrw.u32 q4, [r5, #4] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: cmp.w r11, #16 +; CHECK-NEXT: vfma.f16 q0, q4, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vldrw.u32 q3, [r6] +; CHECK-NEXT: add.w r6, r5, #10 +; CHECK-NEXT: vfma.f16 q0, q3, r0 +; CHECK-NEXT: vldrw.u32 q3, [r5, #8] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vfma.f16 q0, q3, r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: add.w r6, r5, #14 +; CHECK-NEXT: vfma.f16 q0, q2, r0 +; CHECK-NEXT: vldrw.u32 q2, [r5, #12] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vfma.f16 q0, q2, r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r6] +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: blo .LBB16_8 +; CHECK-NEXT: @ %bb.5: @ %for.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: dls lr, r8 +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: .LBB16_6: @ %for.body +; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldr.16 s4, [r6] +; CHECK-NEXT: add.w r5, r9, #2 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r9] +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldr.16 s4, [r6, #2] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: add.w r5, r9, #6 +; CHECK-NEXT: 
vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldr.16 s4, [r6, #4] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r9, #4] +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldr.16 s4, [r6, #6] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: add.w r5, r9, #10 +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldr.16 s4, [r6, #8] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r9, #8] +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldr.16 s4, [r6, #10] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: add.w r5, r9, #14 +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldr.16 s4, [r6, #12] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r9, #12] +; CHECK-NEXT: add.w r9, r9, #16 +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldr.16 s4, [r6, #14] +; CHECK-NEXT: adds r6, #16 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: le lr, .LBB16_6 +; CHECK-NEXT: @ %bb.7: @ %for.end +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: cmp.w r10, #0 +; CHECK-NEXT: bne .LBB16_9 +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: cmp.w r10, #0 +; CHECK-NEXT: beq.w .LBB16_3 +; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r5, r9 +; CHECK-NEXT: .LBB16_10: @ %while.body76 +; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldr.16 s4, [r6] +; CHECK-NEXT: subs r0, #1 +; CHECK-NEXT: adds r6, #2 +; CHECK-NEXT: cmp r0, #1 +; CHECK-NEXT: vmov r7, s4 +; CHECK-NEXT: vldrh.u16 q1, [r5], #2 +; CHECK-NEXT: vfma.f16 q0, q1, r7 +; CHECK-NEXT: bgt .LBB16_10 +; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: add.w r9, r9, r10, lsl #1 +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_12: @ %if.end +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +entry: + %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 + %0 = load half*, half** %pState1, align 4 + %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2 + %1 = load half*, half** %pCoeffs2, align 4 + %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0 + %2 = load i16, i16* %numTaps3, align 4 + %conv = zext i16 %2 to i32 + %cmp = icmp ugt i32 %blockSize, 7 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %shr = lshr i32 %blockSize, 2 + %cmp5217 = icmp eq i32 %shr, 0 + br i1 %cmp5217, label %if.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %if.then + %sub = add nsw i32 %conv, -1 + %arrayidx = getelementptr inbounds half, half* %0, i32 %sub + %incdec.ptr = getelementptr inbounds half, half* %1, i32 1 + %incdec.ptr7 = getelementptr inbounds half, half* %1, i32 2 + %incdec.ptr8 = getelementptr inbounds half, half* %1, i32 3 + %incdec.ptr9 = getelementptr inbounds half, half* %1, i32 4 + %incdec.ptr10 = getelementptr inbounds half, half* %1, i32 5 + %incdec.ptr11 = getelementptr inbounds half, half* %1, i32 6 + %incdec.ptr12 = getelementptr inbounds half, half* %1, i32 7 + %sub37 = add nsw i32 %conv, -8 + 
%div = sdiv i32 %sub37, 8 + %pCoeffsCur.0199 = getelementptr inbounds half, half* %1, i32 8 + %cmp38201 = icmp ugt i16 %2, 15 + %and = and i32 %sub37, 7 + %cmp74210 = icmp eq i32 %and, 0 + %idx.neg = sub nsw i32 0, %conv + %3 = icmp sgt i32 %div, 1 + %smax = select i1 %3, i32 %div, i32 1 + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.end + %blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ] + %pStateCur.0221 = phi half* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ] + %pSamples.0220 = phi half* [ %0, %while.body.lr.ph ], [ %add.ptr83, %while.end ] + %pTempSrc.0219 = phi half* [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ] + %pOutput.0218 = phi half* [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ] + %4 = load half, half* %1, align 4 + %5 = load half, half* %incdec.ptr, align 4 + %6 = load half, half* %incdec.ptr7, align 4 + %7 = load half, half* %incdec.ptr8, align 4 + %8 = load half, half* %incdec.ptr9, align 4 + %9 = load half, half* %incdec.ptr10, align 4 + %10 = load half, half* %incdec.ptr11, align 4 + %11 = load half, half* %incdec.ptr12, align 4 + %12 = bitcast half* %pTempSrc.0219 to <8 x half>* + %13 = load <8 x half>, <8 x half>* %12, align 4 + %14 = bitcast half* %pStateCur.0221 to <8 x half>* + store <8 x half> %13, <8 x half>* %14, align 4 + %add.ptr = getelementptr inbounds half, half* %pStateCur.0221, i32 4 + %add.ptr14 = getelementptr inbounds half, half* %pTempSrc.0219, i32 4 + %15 = bitcast half* %pSamples.0220 to <8 x half>* + %16 = load <8 x half>, <8 x half>* %15, align 4 + %.splatinsert = insertelement <8 x half> undef, half %4, i32 0 + %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + %17 = fmul fast <8 x half> %16, %.splat + %arrayidx15 = getelementptr inbounds half, half* %pSamples.0220, i32 1 + %18 = bitcast half* %arrayidx15 to <8 x half>* + %19 = load <8 x half>, <8 x half>* %18, align 4 + %.splatinsert16 = insertelement <8 x half> undef, half %5, i32 0 + %.splat17 = shufflevector <8 x half> %.splatinsert16, <8 x half> undef, <8 x i32> zeroinitializer + %20 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %19, <8 x half> %.splat17, <8 x half> %17) + %arrayidx18 = getelementptr inbounds half, half* %pSamples.0220, i32 2 + %21 = bitcast half* %arrayidx18 to <8 x half>* + %22 = load <8 x half>, <8 x half>* %21, align 4 + %.splatinsert19 = insertelement <8 x half> undef, half %6, i32 0 + %.splat20 = shufflevector <8 x half> %.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer + %23 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %22, <8 x half> %.splat20, <8 x half> %20) + %arrayidx21 = getelementptr inbounds half, half* %pSamples.0220, i32 3 + %24 = bitcast half* %arrayidx21 to <8 x half>* + %25 = load <8 x half>, <8 x half>* %24, align 4 + %.splatinsert22 = insertelement <8 x half> undef, half %7, i32 0 + %.splat23 = shufflevector <8 x half> %.splatinsert22, <8 x half> undef, <8 x i32> zeroinitializer + %26 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %25, <8 x half> %.splat23, <8 x half> %23) + %arrayidx24 = getelementptr inbounds half, half* %pSamples.0220, i32 4 + %27 = bitcast half* %arrayidx24 to <8 x half>* + %28 = load <8 x half>, <8 x half>* %27, align 4 + %.splatinsert25 = insertelement <8 x half> undef, half %8, i32 0 + %.splat26 = shufflevector <8 x half> %.splatinsert25, <8 x half> undef, <8 x i32> zeroinitializer + %29 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %28, <8 x half> 
%.splat26, <8 x half> %26) + %arrayidx27 = getelementptr inbounds half, half* %pSamples.0220, i32 5 + %30 = bitcast half* %arrayidx27 to <8 x half>* + %31 = load <8 x half>, <8 x half>* %30, align 4 + %.splatinsert28 = insertelement <8 x half> undef, half %9, i32 0 + %.splat29 = shufflevector <8 x half> %.splatinsert28, <8 x half> undef, <8 x i32> zeroinitializer + %32 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %31, <8 x half> %.splat29, <8 x half> %29) + %arrayidx30 = getelementptr inbounds half, half* %pSamples.0220, i32 6 + %33 = bitcast half* %arrayidx30 to <8 x half>* + %34 = load <8 x half>, <8 x half>* %33, align 4 + %.splatinsert31 = insertelement <8 x half> undef, half %10, i32 0 + %.splat32 = shufflevector <8 x half> %.splatinsert31, <8 x half> undef, <8 x i32> zeroinitializer + %35 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %34, <8 x half> %.splat32, <8 x half> %32) + %arrayidx33 = getelementptr inbounds half, half* %pSamples.0220, i32 7 + %36 = bitcast half* %arrayidx33 to <8 x half>* + %37 = load <8 x half>, <8 x half>* %36, align 4 + %.splatinsert34 = insertelement <8 x half> undef, half %11, i32 0 + %.splat35 = shufflevector <8 x half> %.splatinsert34, <8 x half> undef, <8 x i32> zeroinitializer + %38 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %37, <8 x half> %.splat35, <8 x half> %35) + %pSamples.1200 = getelementptr inbounds half, half* %pSamples.0220, i32 8 + br i1 %cmp38201, label %for.body, label %for.end + +for.body: ; preds = %while.body, %for.body + %pSamples.1207 = phi half* [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ] + %pCoeffsCur.0206 = phi half* [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ] + %.pn205 = phi half* [ %pCoeffsCur.0206, %for.body ], [ %1, %while.body ] + %i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ] + %vecAcc0.0203 = phi <8 x half> [ %70, %for.body ], [ %38, %while.body ] + %pSamples.0.pn202 = phi half* [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ] + %incdec.ptr40 = getelementptr inbounds half, half* %.pn205, i32 9 + %39 = load half, half* %pCoeffsCur.0206, align 4 + %incdec.ptr41 = getelementptr inbounds half, half* %.pn205, i32 10 + %40 = load half, half* %incdec.ptr40, align 4 + %incdec.ptr42 = getelementptr inbounds half, half* %.pn205, i32 11 + %41 = load half, half* %incdec.ptr41, align 4 + %incdec.ptr43 = getelementptr inbounds half, half* %.pn205, i32 12 + %42 = load half, half* %incdec.ptr42, align 4 + %incdec.ptr44 = getelementptr inbounds half, half* %.pn205, i32 13 + %43 = load half, half* %incdec.ptr43, align 4 + %incdec.ptr45 = getelementptr inbounds half, half* %.pn205, i32 14 + %44 = load half, half* %incdec.ptr44, align 4 + %incdec.ptr46 = getelementptr inbounds half, half* %.pn205, i32 15 + %45 = load half, half* %incdec.ptr45, align 4 + %46 = load half, half* %incdec.ptr46, align 4 + %47 = bitcast half* %pSamples.1207 to <8 x half>* + %48 = load <8 x half>, <8 x half>* %47, align 4 + %.splatinsert48 = insertelement <8 x half> undef, half %39, i32 0 + %.splat49 = shufflevector <8 x half> %.splatinsert48, <8 x half> undef, <8 x i32> zeroinitializer + %49 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %48, <8 x half> %.splat49, <8 x half> %vecAcc0.0203) + %arrayidx50 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 9 + %50 = bitcast half* %arrayidx50 to <8 x half>* + %51 = load <8 x half>, <8 x half>* %50, align 4 + %.splatinsert51 = insertelement <8 x half> undef, half %40, i32 0 + %.splat52 = shufflevector <8 x half> 
%.splatinsert51, <8 x half> undef, <8 x i32> zeroinitializer + %52 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %51, <8 x half> %.splat52, <8 x half> %49) + %arrayidx53 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 10 + %53 = bitcast half* %arrayidx53 to <8 x half>* + %54 = load <8 x half>, <8 x half>* %53, align 4 + %.splatinsert54 = insertelement <8 x half> undef, half %41, i32 0 + %.splat55 = shufflevector <8 x half> %.splatinsert54, <8 x half> undef, <8 x i32> zeroinitializer + %55 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %54, <8 x half> %.splat55, <8 x half> %52) + %arrayidx56 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 11 + %56 = bitcast half* %arrayidx56 to <8 x half>* + %57 = load <8 x half>, <8 x half>* %56, align 4 + %.splatinsert57 = insertelement <8 x half> undef, half %42, i32 0 + %.splat58 = shufflevector <8 x half> %.splatinsert57, <8 x half> undef, <8 x i32> zeroinitializer + %58 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %57, <8 x half> %.splat58, <8 x half> %55) + %arrayidx59 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 12 + %59 = bitcast half* %arrayidx59 to <8 x half>* + %60 = load <8 x half>, <8 x half>* %59, align 4 + %.splatinsert60 = insertelement <8 x half> undef, half %43, i32 0 + %.splat61 = shufflevector <8 x half> %.splatinsert60, <8 x half> undef, <8 x i32> zeroinitializer + %61 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %60, <8 x half> %.splat61, <8 x half> %58) + %arrayidx62 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 13 + %62 = bitcast half* %arrayidx62 to <8 x half>* + %63 = load <8 x half>, <8 x half>* %62, align 4 + %.splatinsert63 = insertelement <8 x half> undef, half %44, i32 0 + %.splat64 = shufflevector <8 x half> %.splatinsert63, <8 x half> undef, <8 x i32> zeroinitializer + %64 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %63, <8 x half> %.splat64, <8 x half> %61) + %arrayidx65 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 14 + %65 = bitcast half* %arrayidx65 to <8 x half>* + %66 = load <8 x half>, <8 x half>* %65, align 4 + %.splatinsert66 = insertelement <8 x half> undef, half %45, i32 0 + %.splat67 = shufflevector <8 x half> %.splatinsert66, <8 x half> undef, <8 x i32> zeroinitializer + %67 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %66, <8 x half> %.splat67, <8 x half> %64) + %arrayidx68 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 15 + %68 = bitcast half* %arrayidx68 to <8 x half>* + %69 = load <8 x half>, <8 x half>* %68, align 4 + %.splatinsert69 = insertelement <8 x half> undef, half %46, i32 0 + %.splat70 = shufflevector <8 x half> %.splatinsert69, <8 x half> undef, <8 x i32> zeroinitializer + %70 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %69, <8 x half> %.splat70, <8 x half> %67) + %inc = add nuw nsw i32 %i.0204, 1 + %pCoeffsCur.0 = getelementptr inbounds half, half* %pCoeffsCur.0206, i32 8 + %pSamples.1 = getelementptr inbounds half, half* %pSamples.1207, i32 8 + %exitcond = icmp eq i32 %inc, %smax + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %while.body + %vecAcc0.0.lcssa = phi <8 x half> [ %38, %while.body ], [ %70, %for.body ] + %pCoeffsCur.0.lcssa = phi half* [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ] + %pSamples.1.lcssa = phi half* [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ] + br i1 %cmp74210, label %while.end, label %while.body76 + +while.body76: ; preds = %for.end, %while.body76 + 
%pCoeffsCur.1214 = phi half* [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ] + %vecAcc0.1213 = phi <8 x half> [ %74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ] + %numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ] + %pSamples.2211 = phi half* [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ] + %incdec.ptr77 = getelementptr inbounds half, half* %pCoeffsCur.1214, i32 1 + %71 = load half, half* %pCoeffsCur.1214, align 4 + %72 = bitcast half* %pSamples.2211 to <8 x half>* + %73 = load <8 x half>, <8 x half>* %72, align 4 + %.splatinsert78 = insertelement <8 x half> undef, half %71, i32 0 + %.splat79 = shufflevector <8 x half> %.splatinsert78, <8 x half> undef, <8 x i32> zeroinitializer + %74 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %73, <8 x half> %.splat79, <8 x half> %vecAcc0.1213) + %incdec.ptr80 = getelementptr inbounds half, half* %pSamples.2211, i32 1 + %dec = add nsw i32 %numCnt.0212, -1 + %cmp74 = icmp sgt i32 %numCnt.0212, 1 + br i1 %cmp74, label %while.body76, label %while.end.loopexit + +while.end.loopexit: ; preds = %while.body76 + %scevgep = getelementptr half, half* %pSamples.1.lcssa, i32 %and + br label %while.end + +while.end: ; preds = %while.end.loopexit, %for.end + %pSamples.2.lcssa = phi half* [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ] + %vecAcc0.1.lcssa = phi <8 x half> [ %vecAcc0.0.lcssa, %for.end ], [ %74, %while.end.loopexit ] + %75 = bitcast half* %pOutput.0218 to <8 x half>* + store <8 x half> %vecAcc0.1.lcssa, <8 x half>* %75, align 4 + %add.ptr81 = getelementptr inbounds half, half* %pOutput.0218, i32 4 + %add.ptr82 = getelementptr inbounds half, half* %pSamples.2.lcssa, i32 4 + %add.ptr83 = getelementptr inbounds half, half* %add.ptr82, i32 %idx.neg + %dec84 = add nsw i32 %blkCnt.0222, -1 + %cmp5 = icmp eq i32 %dec84, 0 + br i1 %cmp5, label %if.end, label %while.body + +if.end: ; preds = %while.end, %if.then, %entry + ret void +} + +declare void @llvm.assume(i1) +declare <8 x i1> @llvm.arm.mve.vctp16(i32) +declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>) +declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32 immarg, <8 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -0,0 +1,1456 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s + +define arm_aapcs_vfpcc void @test_fadd(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fadd: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.f32 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 
0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fadd fast <4 x float> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fadd_r(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fadd_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.f32 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB1_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fadd fast <4 x float> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmul(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fmul: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmul.f32 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB2_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> 
%broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmul_r(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fmul_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmul.f32 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB3_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fsub(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fsub: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vsub.f32 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB4_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> 
zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fsub fast <4 x float> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fsub_r(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fsub_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vsub.f32 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB5_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fsub fast <4 x float> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define arm_aapcs_vfpcc void @test_fmas(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmas: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfma.f32 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: bne .LBB6_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat14 = shufflevector <4 x float> 
%broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = getelementptr inbounds float, float* %B, i32 %index + %4 = bitcast float* %3 to <4 x float>* + %wide.load12 = load <4 x float>, <4 x float>* %4, align 4 + %5 = fmul fast <4 x float> %wide.load12, %wide.load + %6 = fadd fast <4 x float> %5, %broadcast.splat14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmas_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmas_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB7_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfma.f32 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: bne .LBB7_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = getelementptr inbounds float, float* %B, i32 %index + %4 = bitcast float* %3 to <4 x float>* + %wide.load12 = load <4 x float>, <4 x float>* %4, align 4 + %5 = fmul fast <4 x float> %wide.load12, %wide.load + %6 = fadd fast <4 x float> %broadcast.splat14, %5 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fma(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fma: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], 
#16 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfma.f32 q2, q1, q0 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: bne .LBB8_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %wide.load, %broadcast.splat13 + %4 = getelementptr inbounds float, float* %B, i32 %index + %5 = bitcast float* %4 to <4 x float>* + %wide.load14 = load <4 x float>, <4 x float>* %5, align 4 + %6 = fadd fast <4 x float> %3, %wide.load14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fma_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fma_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB9_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfma.f32 q2, q0, q1 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: bne .LBB9_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %broadcast.splat13, %wide.load + %4 = getelementptr inbounds float, float* %B, i32 %index + %5 = bitcast float* %4 to <4 x float>* + %wide.load14 = load <4 x float>, <4 x float>* %5, align 4 + %6 = fadd fast <4 x float> %3, %wide.load14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + 
+define arm_aapcs_vfpcc void @test_fmss(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmss: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: .LBB10_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfma.f32 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: bne .LBB10_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = getelementptr inbounds float, float* %B, i32 %index + %4 = bitcast float* %3 to <4 x float>* + %wide.load12 = load <4 x float>, <4 x float>* %4, align 4 + %5 = fmul fast <4 x float> %wide.load12, %wide.load + %6 = fsub fast <4 x float> %5, %broadcast.splat14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmss_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmss_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB11_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfms.f32 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: bne .LBB11_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = 
getelementptr inbounds float, float* %B, i32 %index + %4 = bitcast float* %3 to <4 x float>* + %wide.load12 = load <4 x float>, <4 x float>* %4, align 4 + %5 = fmul fast <4 x float> %wide.load12, %wide.load + %6 = fsub fast <4 x float> %broadcast.splat14, %5 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fms(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fms: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB12_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vneg.f32 q1, q1 +; CHECK-NEXT: vfma.f32 q1, q2, q0 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB12_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %wide.load, %broadcast.splat13 + %4 = getelementptr inbounds float, float* %B, i32 %index + %5 = bitcast float* %4 to <4 x float>* + %wide.load14 = load <4 x float>, <4 x float>* %5, align 4 + %6 = fsub fast <4 x float> %3, %wide.load14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fms_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fms_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB13_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vneg.f32 q1, q1 +; CHECK-NEXT: vfma.f32 q1, q0, q2 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB13_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label 
%vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %broadcast.splat13, %wide.load + %4 = getelementptr inbounds float, float* %B, i32 %index + %5 = bitcast float* %4 to <4 x float>* + %wide.load14 = load <4 x float>, <4 x float>* %5, align 4 + %6 = fsub fast <4 x float> %3, %wide.load14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define dso_local void @test_nested(float* noalias nocapture %pInT1, float* noalias nocapture readonly %pOutT1, float* noalias nocapture readonly %pPRT_in, float* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l, float %in) local_unnamed_addr #0 { +; CHECK-LABEL: test_nested: +; CHECK: @ %bb.0: @ %for.body.us.preheader +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: ldrd lr, r12, [sp, #20] +; CHECK-NEXT: lsl.w r3, r12, #2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB14_1: @ %for.body.us +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB14_2 Depth 2 +; CHECK-NEXT: vldr s0, [r1] +; CHECK-NEXT: mov r5, r12 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vdup.32 q0, r4 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: .LBB14_2: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: adds r6, r0, r4 +; CHECK-NEXT: adds r7, r2, r4 +; CHECK-NEXT: vldrw.u32 q1, [r7] +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: adds r4, #16 +; CHECK-NEXT: subs r5, #4 +; CHECK-NEXT: vfms.f32 q2, q1, q0 +; CHECK-NEXT: vstrw.32 q2, [r6] +; CHECK-NEXT: bne .LBB14_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us +; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1 +; CHECK-NEXT: add r0, r3 +; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: le lr, .LBB14_1 +; CHECK-NEXT: @ %bb.4: @ %for.end14 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +for.body.us.preheader: + %cmp = icmp sgt i32 %numRows, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp1 = icmp sgt i32 %numCols, 0 + tail call void @llvm.assume(i1 %cmp1) + %rem = and i32 %numCols, 7 + %cmp2 = icmp eq i32 %rem, 0 + tail call void @llvm.assume(i1 %cmp2) + %cmp3 = icmp slt i32 %l, %numCols + tail call void @llvm.assume(i1 %cmp3) + br label %for.body.us + +for.body.us: ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader + %pInT1.addr.038.us = phi float* [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ] + %i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ] + %pOutT1.addr.036.us = phi float* [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ] + %pPRT_in.addr.035.us = phi float* [ %scevgep, 
%for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ] + %scevgep = getelementptr float, float* %pPRT_in.addr.035.us, i32 %numCols + %0 = load float, float* %pOutT1.addr.036.us, align 4 + %broadcast.splatinsert47 = insertelement <4 x float> undef, float %0, i32 0 + %broadcast.splat48 = shufflevector <4 x float> %broadcast.splatinsert47, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %for.body.us + %index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ] + %next.gep = getelementptr float, float* %pInT1.addr.038.us, i32 %index + %next.gep45 = getelementptr float, float* %pPRT_in.addr.035.us, i32 %index + %1 = bitcast float* %next.gep to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %1, align 4 + %2 = bitcast float* %next.gep45 to <4 x float>* + %wide.load46 = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %wide.load46, %broadcast.splat48 + %4 = fsub fast <4 x float> %wide.load, %3 + store <4 x float> %4, <4 x float>* %1, align 4 + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, %numCols + br i1 %5, label %for.cond6.for.end_crit_edge.us, label %vector.body + +for.cond6.for.end_crit_edge.us: ; preds = %vector.body + %incdec.ptr.us = getelementptr inbounds float, float* %pOutT1.addr.036.us, i32 1 + %scevgep40 = getelementptr float, float* %pInT1.addr.038.us, i32 %numCols + %inc13.us = add nuw nsw i32 %i.037.us, 1 + %exitcond41 = icmp eq i32 %inc13.us, %numRows + br i1 %exitcond41, label %for.end14, label %for.body.us + +for.end14: ; preds = %for.cond6.for.end_crit_edge.us + ret void +} + +%struct.arm_fir_instance_f32 = type { i16, float*, float* } +define void @arm_fir_f32_1_4_mve(%struct.arm_fir_instance_f32* nocapture readonly %S, float* nocapture readonly %pSrc, float* %pDst, i32 %blockSize) { +; CHECK-LABEL: arm_fir_f32_1_4_mve: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: ldrh.w r10, [r0] +; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: sub.w r7, r10, #1 +; CHECK-NEXT: cmp r7, #3 +; CHECK-NEXT: bhi .LBB15_6 +; CHECK-NEXT: @ %bb.1: @ %if.then +; CHECK-NEXT: ldr r6, [r0, #8] +; CHECK-NEXT: add.w r4, r12, r7, lsl #2 +; CHECK-NEXT: lsr.w lr, r3, #2 +; CHECK-NEXT: vldr s0, [r6, #12] +; CHECK-NEXT: vldr s4, [r6, #8] +; CHECK-NEXT: vmov r7, s0 +; CHECK-NEXT: vldr s8, [r6, #4] +; CHECK-NEXT: vdup.32 q0, r7 +; CHECK-NEXT: vmov r7, s4 +; CHECK-NEXT: vldr s12, [r6] +; CHECK-NEXT: vdup.32 q1, r7 +; CHECK-NEXT: vmov r7, s8 +; CHECK-NEXT: vdup.32 q2, r7 +; CHECK-NEXT: vmov r7, s12 +; CHECK-NEXT: vdup.32 q3, r7 +; CHECK-NEXT: wls lr, lr, .LBB15_5 +; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph +; CHECK-NEXT: bic r9, r3, #3 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: add.w r8, r2, r9, lsl #2 +; CHECK-NEXT: .LBB15_3: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r5, r1, r6 +; CHECK-NEXT: adds r7, r2, r6 +; CHECK-NEXT: vldrw.u32 q4, [r5] +; CHECK-NEXT: adds r5, r4, r6 +; CHECK-NEXT: vstrw.32 q4, [r5] +; CHECK-NEXT: add.w r5, r12, r6 +; CHECK-NEXT: vldrw.u32 q4, [r5] +; CHECK-NEXT: vldrw.u32 q5, [r5, #4] +; CHECK-NEXT: vldrw.u32 q6, [r5, #12] +; CHECK-NEXT: adds r6, #16 +; CHECK-NEXT: vmul.f32 q4, q4, q3 +; CHECK-NEXT: vfma.f32 q4, q5, q2 +; CHECK-NEXT: vldrw.u32 q5, [r5, #8] +; CHECK-NEXT: vfma.f32 q4, q5, q1 +; CHECK-NEXT: 
vfma.f32 q4, q6, q0 +; CHECK-NEXT: vstrw.32 q4, [r7] +; CHECK-NEXT: le lr, .LBB15_3 +; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit +; CHECK-NEXT: add r4, r6 +; CHECK-NEXT: add.w r12, r12, r9, lsl #2 +; CHECK-NEXT: add.w r1, r1, r9, lsl #2 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: .LBB15_5: @ %while.end +; CHECK-NEXT: and r7, r3, #3 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vctp.32 r7 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q4, [r4] +; CHECK-NEXT: vldrw.u32 q4, [r12] +; CHECK-NEXT: vmul.f32 q3, q4, q3 +; CHECK-NEXT: vldrw.u32 q4, [r12, #4] +; CHECK-NEXT: vfma.f32 q3, q4, q2 +; CHECK-NEXT: vldrw.u32 q2, [r12, #8] +; CHECK-NEXT: vfma.f32 q3, q2, q1 +; CHECK-NEXT: vldrw.u32 q1, [r12, #12] +; CHECK-NEXT: vfma.f32 q3, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q3, [r2] +; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: .LBB15_6: @ %if.end +; CHECK-NEXT: add.w r0, r12, r3, lsl #2 +; CHECK-NEXT: lsr.w lr, r10, #2 +; CHECK-NEXT: wls lr, lr, .LBB15_10 +; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader +; CHECK-NEXT: bic r2, r10, #3 +; CHECK-NEXT: adds r1, r2, r3 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: add.w r1, r12, r1, lsl #2 +; CHECK-NEXT: .LBB15_8: @ %while.body51 +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vstrb.8 q0, [r3], #16 +; CHECK-NEXT: le lr, .LBB15_8 +; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit +; CHECK-NEXT: add.w r12, r12, r2, lsl #2 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: .LBB15_10: @ %while.end55 +; CHECK-NEXT: ands r1, r10, #3 +; CHECK-NEXT: beq .LBB15_12 +; CHECK-NEXT: @ %bb.11: @ %if.then59 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vctp.32 r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q0, [r12] +; CHECK-NEXT: .LBB15_12: @ %if.end61 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +entry: + %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 + %0 = load float*, float** %pState1, align 4 + %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2 + %1 = load float*, float** %pCoeffs2, align 4 + %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0 + %2 = load i16, i16* %numTaps3, align 4 + %conv = zext i16 %2 to i32 + %sub = add nsw i32 %conv, -1 + %cmp = icmp ult i32 %sub, 4 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds float, float* %0, i32 %sub + %incdec.ptr = getelementptr inbounds float, float* %1, i32 1 + %3 = load float, float* %1, align 4 + %incdec.ptr6 = getelementptr inbounds float, float* %1, i32 2 + %4 = load float, float* %incdec.ptr, align 4 + %incdec.ptr7 = getelementptr inbounds float, float* %1, i32 3 + %5 = load float, float* %incdec.ptr6, align 4 + %6 = load float, float* %incdec.ptr7, align 4 + %shr = lshr i32 %blockSize, 2 + %cmp9146 = icmp eq i32 %shr, 0 + %.pre161 = insertelement <4 x float> undef, float %3, i32 0 + %.pre162 = shufflevector <4 x float> %.pre161, <4 x float> undef, <4 x i32> zeroinitializer + %.pre163 = insertelement <4 x float> undef, float %4, i32 0 + %.pre164 = shufflevector <4 x float> %.pre163, <4 x float> undef, <4 x i32> zeroinitializer + %.pre165 = insertelement <4 x float> undef, float %5, i32 0 + %.pre166 = shufflevector <4 x float> %.pre165, <4 x float> undef, <4 x i32> zeroinitializer + %.pre167 = insertelement <4 x float> undef, float %6, i32 0 + %.pre168 
= shufflevector <4 x float> %.pre167, <4 x float> undef, <4 x i32> zeroinitializer + br i1 %cmp9146, label %while.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %if.then + %7 = and i32 %blockSize, -4 + %scevgep158 = getelementptr float, float* %pDst, i32 %7 + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %pStateCur.0151 = phi float* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %pSamples.0150 = phi float* [ %0, %while.body.lr.ph ], [ %add.ptr24, %while.body ] + %pOutput.0149 = phi float* [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ] + %pTempSrc.0148 = phi float* [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ] + %blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ] + %8 = bitcast float* %pTempSrc.0148 to <4 x float>* + %9 = load <4 x float>, <4 x float>* %8, align 4 + %10 = bitcast float* %pStateCur.0151 to <4 x float>* + store <4 x float> %9, <4 x float>* %10, align 4 + %add.ptr = getelementptr inbounds float, float* %pStateCur.0151, i32 4 + %add.ptr11 = getelementptr inbounds float, float* %pTempSrc.0148, i32 4 + %11 = bitcast float* %pSamples.0150 to <4 x float>* + %12 = load <4 x float>, <4 x float>* %11, align 4 + %13 = fmul fast <4 x float> %12, %.pre162 + %arrayidx12 = getelementptr inbounds float, float* %pSamples.0150, i32 1 + %14 = bitcast float* %arrayidx12 to <4 x float>* + %15 = load <4 x float>, <4 x float>* %14, align 4 + %mul = fmul fast <4 x float> %15, %.pre164 + %add = fadd fast <4 x float> %mul, %13 + %arrayidx13 = getelementptr inbounds float, float* %pSamples.0150, i32 2 + %16 = bitcast float* %arrayidx13 to <4 x float>* + %17 = load <4 x float>, <4 x float>* %16, align 4 + %mul16 = fmul fast <4 x float> %17, %.pre166 + %add17 = fadd fast <4 x float> %add, %mul16 + %arrayidx18 = getelementptr inbounds float, float* %pSamples.0150, i32 3 + %18 = bitcast float* %arrayidx18 to <4 x float>* + %19 = load <4 x float>, <4 x float>* %18, align 4 + %mul21 = fmul fast <4 x float> %19, %.pre168 + %add22 = fadd fast <4 x float> %add17, %mul21 + %20 = bitcast float* %pOutput.0149 to <4 x float>* + store <4 x float> %add22, <4 x float>* %20, align 4 + %add.ptr23 = getelementptr inbounds float, float* %pOutput.0149, i32 4 + %add.ptr24 = getelementptr inbounds float, float* %pSamples.0150, i32 4 + %dec = add nsw i32 %blkCnt.0147, -1 + %cmp9 = icmp eq i32 %dec, 0 + br i1 %cmp9, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + %scevgep157 = getelementptr float, float* %pSrc, i32 %7 + %scevgep159 = getelementptr float, float* %0, i32 %7 + br label %while.end + +while.end: ; preds = %if.then, %while.end.loopexit + %pTempSrc.0.lcssa = phi float* [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ] + %pOutput.0.lcssa = phi float* [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ] + %pSamples.0.lcssa = phi float* [ %scevgep159, %while.end.loopexit ], [ %0, %if.then ] + %pStateCur.0.lcssa = phi float* [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ] + %and = and i32 %blockSize, 3 + %21 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %and) + %22 = bitcast float* %pTempSrc.0.lcssa to <4 x float>* + %23 = load <4 x float>, <4 x float>* %22, align 4 + %24 = bitcast float* %pStateCur.0.lcssa to <4 x float>* + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %23, <4 x float>* %24, i32 4, <4 x i1> %21) + %25 = bitcast float* %pSamples.0.lcssa to <4 x float>* + %26 = load <4 x float>, <4 x float>* %25, align 4 + %27 = fmul 
fast <4 x float> %26, %.pre162 + %arrayidx29 = getelementptr inbounds float, float* %pSamples.0.lcssa, i32 1 + %28 = bitcast float* %arrayidx29 to <4 x float>* + %29 = load <4 x float>, <4 x float>* %28, align 4 + %mul32 = fmul fast <4 x float> %29, %.pre164 + %add33 = fadd fast <4 x float> %mul32, %27 + %arrayidx34 = getelementptr inbounds float, float* %pSamples.0.lcssa, i32 2 + %30 = bitcast float* %arrayidx34 to <4 x float>* + %31 = load <4 x float>, <4 x float>* %30, align 4 + %mul37 = fmul fast <4 x float> %31, %.pre166 + %add38 = fadd fast <4 x float> %add33, %mul37 + %arrayidx39 = getelementptr inbounds float, float* %pSamples.0.lcssa, i32 3 + %32 = bitcast float* %arrayidx39 to <4 x float>* + %33 = load <4 x float>, <4 x float>* %32, align 4 + %mul42 = fmul fast <4 x float> %33, %.pre168 + %add43 = fadd fast <4 x float> %add38, %mul42 + %34 = bitcast float* %pOutput.0.lcssa to <4 x float>* + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %add43, <4 x float>* %34, i32 4, <4 x i1> %21) + %.pre = load float*, float** %pState1, align 4 + br label %if.end + +if.end: ; preds = %while.end, %entry + %35 = phi float* [ %.pre, %while.end ], [ %0, %entry ] + %arrayidx45 = getelementptr inbounds float, float* %35, i32 %blockSize + %shr47 = lshr i32 %conv, 2 + %cmp49141 = icmp eq i32 %shr47, 0 + br i1 %cmp49141, label %while.end55, label %while.body51.preheader + +while.body51.preheader: ; preds = %if.end + %36 = and i32 %conv, 65532 + %37 = add i32 %36, %blockSize + %scevgep = getelementptr float, float* %35, i32 %37 + br label %while.body51 + +while.body51: ; preds = %while.body51.preheader, %while.body51 + %pTempSrc.1144 = phi float* [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ] + %pTempDest.0143 = phi float* [ %add.ptr53, %while.body51 ], [ %35, %while.body51.preheader ] + %blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ] + %38 = bitcast float* %pTempSrc.1144 to <4 x float>* + %39 = load <4 x float>, <4 x float>* %38, align 4 + %40 = bitcast float* %pTempDest.0143 to <4 x float>* + store <4 x float> %39, <4 x float>* %40, align 4 + %add.ptr52 = getelementptr inbounds float, float* %pTempSrc.1144, i32 4 + %add.ptr53 = getelementptr inbounds float, float* %pTempDest.0143, i32 4 + %dec54 = add nsw i32 %blkCnt.1142, -1 + %cmp49 = icmp eq i32 %dec54, 0 + br i1 %cmp49, label %while.end55.loopexit, label %while.body51 + +while.end55.loopexit: ; preds = %while.body51 + %scevgep156 = getelementptr float, float* %35, i32 %36 + br label %while.end55 + +while.end55: ; preds = %while.end55.loopexit, %if.end + %pTempDest.0.lcssa = phi float* [ %35, %if.end ], [ %scevgep156, %while.end55.loopexit ] + %pTempSrc.1.lcssa = phi float* [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ] + %and56 = and i32 %conv, 3 + %cmp57 = icmp eq i32 %and56, 0 + br i1 %cmp57, label %if.end61, label %if.then59 + +if.then59: ; preds = %while.end55 + %41 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %and56) + %42 = bitcast float* %pTempSrc.1.lcssa to <4 x float>* + %43 = load <4 x float>, <4 x float>* %42, align 4 + %44 = bitcast float* %pTempDest.0.lcssa to <4 x float>* + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %43, <4 x float>* %44, i32 4, <4 x i1> %41) + br label %if.end61 + +if.end61: ; preds = %while.end55, %if.then59 + ret void +} + + +define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* nocapture readonly %pSrc, float* nocapture %pDst, i32 %blockSize) { +; CHECK-LABEL: fir: +; CHECK: @ %bb.0: 
@ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #88 +; CHECK-NEXT: sub sp, #88 +; CHECK-NEXT: cmp r3, #8 +; CHECK-NEXT: blo.w .LBB16_12 +; CHECK-NEXT: @ %bb.1: @ %if.then +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: cmp.w r7, r3, lsr #2 +; CHECK-NEXT: beq.w .LBB16_12 +; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph +; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: lsr.w r8, r3, #2 +; CHECK-NEXT: ldrd r5, r12, [r0, #4] +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w r0, r4, #8 +; CHECK-NEXT: and r10, r0, #7 +; CHECK-NEXT: add.w r7, r0, r0, lsr #29 +; CHECK-NEXT: add.w r0, r10, #1 +; CHECK-NEXT: asrs r6, r7, #3 +; CHECK-NEXT: cmp r6, #1 +; CHECK-NEXT: it gt +; CHECK-NEXT: asrgt r3, r7, #3 +; CHECK-NEXT: add.w r7, r5, r4, lsl #2 +; CHECK-NEXT: sub.w r11, r7, #4 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: rsbs r3, r4, #0 +; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: add.w r3, r12, #32 +; CHECK-NEXT: str r4, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: b .LBB16_4 +; CHECK-NEXT: .LBB16_3: @ %while.end +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: subs.w r8, r8, #1 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: add.w r0, r9, r0, lsl #2 +; CHECK-NEXT: add.w r5, r0, #16 +; CHECK-NEXT: beq.w .LBB16_12 +; CHECK-NEXT: .LBB16_4: @ %while.body +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB16_6 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 +; CHECK-NEXT: vldr s2, [r12, #12] +; CHECK-NEXT: vldrw.u32 q3, [r1], #16 +; CHECK-NEXT: vldr s8, [r12, #28] +; CHECK-NEXT: add.w r9, r5, #32 +; CHECK-NEXT: vldr s0, [r12] +; CHECK-NEXT: vstr s2, [sp, #64] @ 4-byte Spill +; CHECK-NEXT: vmov r6, s8 +; CHECK-NEXT: vldr s2, [r12, #16] +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldr s4, [r12, #20] +; CHECK-NEXT: vldr s6, [r12, #24] +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vldr s5, [r12, #4] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldr s7, [r12, #8] +; CHECK-NEXT: vstrb.8 q3, [r11], #16 +; CHECK-NEXT: vldrw.u32 q2, [r5, #28] +; CHECK-NEXT: vldrw.u32 q4, [r5] +; CHECK-NEXT: vldrw.u32 q5, [r5, #4] +; CHECK-NEXT: vldrw.u32 q3, [r5, #20] +; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r5, #24] +; CHECK-NEXT: vldrw.u32 q6, [r5, #12] +; CHECK-NEXT: vldrw.u32 q7, [r5, #16] +; CHECK-NEXT: vmul.f32 q0, q4, r3 +; CHECK-NEXT: vldrw.u32 q4, [r5, #8] +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: vfma.f32 q0, q5, r3 +; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: vfma.f32 q0, q4, r3 +; CHECK-NEXT: vldr s4, [sp, #64] @ 4-byte Reload +; CHECK-NEXT: vmov r7, s6 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vfma.f32 q0, q6, r3 +; CHECK-NEXT: vfma.f32 q0, q7, r4 +; CHECK-NEXT: vfma.f32 q0, q3, r0 +; CHECK-NEXT: vfma.f32 q0, q2, r7 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vfma.f32 q0, q1, r6 +; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: cmp r0, #16 +; CHECK-NEXT: blo .LBB16_8 +; CHECK-NEXT: @ %bb.5: @ %for.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr.w lr, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: ldr r6, [sp, #20] @ 4-byte Reload 
+; CHECK-NEXT: .LBB16_6: @ %for.body +; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldrw.u32 q1, [r9, #28] +; CHECK-NEXT: vldr s24, [r6] +; CHECK-NEXT: vldr s26, [r6, #4] +; CHECK-NEXT: vldrw.u32 q3, [r9, #4] +; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r9, #20] +; CHECK-NEXT: vldr s28, [r6, #8] +; CHECK-NEXT: vmov r7, s24 +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r9, #24] +; CHECK-NEXT: vldr s25, [r6, #16] +; CHECK-NEXT: vldrw.u32 q5, [r9, #12] +; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r9] +; CHECK-NEXT: vldr s27, [r6, #20] +; CHECK-NEXT: vldrw.u32 q4, [r9, #16] +; CHECK-NEXT: vldr s29, [r6, #24] +; CHECK-NEXT: vldrw.u32 q2, [r9, #8] +; CHECK-NEXT: vldr s31, [r6, #28] +; CHECK-NEXT: vmov r5, s25 +; CHECK-NEXT: vldr s30, [r6, #12] +; CHECK-NEXT: vfma.f32 q0, q1, r7 +; CHECK-NEXT: vmov r7, s26 +; CHECK-NEXT: add.w r9, r9, #32 +; CHECK-NEXT: vfma.f32 q0, q3, r7 +; CHECK-NEXT: vmov r7, s28 +; CHECK-NEXT: vfma.f32 q0, q2, r7 +; CHECK-NEXT: vmov r7, s30 +; CHECK-NEXT: vfma.f32 q0, q5, r7 +; CHECK-NEXT: vmov r3, s27 +; CHECK-NEXT: vfma.f32 q0, q4, r5 +; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov r4, s29 +; CHECK-NEXT: adds r6, #32 +; CHECK-NEXT: vfma.f32 q0, q1, r3 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov r0, s31 +; CHECK-NEXT: vfma.f32 q0, q1, r4 +; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vfma.f32 q0, q1, r0 +; CHECK-NEXT: le lr, .LBB16_6 +; CHECK-NEXT: @ %bb.7: @ %for.end +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: cmp.w r10, #0 +; CHECK-NEXT: bne .LBB16_9 +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r6, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: cmp.w r10, #0 +; CHECK-NEXT: beq.w .LBB16_3 +; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: mov r5, r9 +; CHECK-NEXT: .LBB16_10: @ %while.body76 +; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldr s4, [r6] +; CHECK-NEXT: vldrw.u32 q2, [r5], #4 +; CHECK-NEXT: subs r0, #1 +; CHECK-NEXT: adds r6, #4 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: cmp r0, #1 +; CHECK-NEXT: vfma.f32 q0, q2, r3 +; CHECK-NEXT: bgt .LBB16_10 +; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: add.w r9, r9, r10, lsl #2 +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_12: @ %if.end +; CHECK-NEXT: add sp, #88 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +entry: + %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 + %0 = load float*, float** %pState1, align 4 + %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2 + %1 = load float*, float** %pCoeffs2, align 4 + %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0 + %2 = load i16, i16* %numTaps3, align 4 + %conv = zext i16 %2 to i32 + %cmp = icmp ugt i32 %blockSize, 7 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %shr = lshr i32 %blockSize, 2 + %cmp5217 = 
icmp eq i32 %shr, 0 + br i1 %cmp5217, label %if.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %if.then + %sub = add nsw i32 %conv, -1 + %arrayidx = getelementptr inbounds float, float* %0, i32 %sub + %incdec.ptr = getelementptr inbounds float, float* %1, i32 1 + %incdec.ptr7 = getelementptr inbounds float, float* %1, i32 2 + %incdec.ptr8 = getelementptr inbounds float, float* %1, i32 3 + %incdec.ptr9 = getelementptr inbounds float, float* %1, i32 4 + %incdec.ptr10 = getelementptr inbounds float, float* %1, i32 5 + %incdec.ptr11 = getelementptr inbounds float, float* %1, i32 6 + %incdec.ptr12 = getelementptr inbounds float, float* %1, i32 7 + %sub37 = add nsw i32 %conv, -8 + %div = sdiv i32 %sub37, 8 + %pCoeffsCur.0199 = getelementptr inbounds float, float* %1, i32 8 + %cmp38201 = icmp ugt i16 %2, 15 + %and = and i32 %sub37, 7 + %cmp74210 = icmp eq i32 %and, 0 + %idx.neg = sub nsw i32 0, %conv + %3 = icmp sgt i32 %div, 1 + %smax = select i1 %3, i32 %div, i32 1 + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.end + %blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ] + %pStateCur.0221 = phi float* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ] + %pSamples.0220 = phi float* [ %0, %while.body.lr.ph ], [ %add.ptr83, %while.end ] + %pTempSrc.0219 = phi float* [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ] + %pOutput.0218 = phi float* [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ] + %4 = load float, float* %1, align 4 + %5 = load float, float* %incdec.ptr, align 4 + %6 = load float, float* %incdec.ptr7, align 4 + %7 = load float, float* %incdec.ptr8, align 4 + %8 = load float, float* %incdec.ptr9, align 4 + %9 = load float, float* %incdec.ptr10, align 4 + %10 = load float, float* %incdec.ptr11, align 4 + %11 = load float, float* %incdec.ptr12, align 4 + %12 = bitcast float* %pTempSrc.0219 to <4 x float>* + %13 = load <4 x float>, <4 x float>* %12, align 4 + %14 = bitcast float* %pStateCur.0221 to <4 x float>* + store <4 x float> %13, <4 x float>* %14, align 4 + %add.ptr = getelementptr inbounds float, float* %pStateCur.0221, i32 4 + %add.ptr14 = getelementptr inbounds float, float* %pTempSrc.0219, i32 4 + %15 = bitcast float* %pSamples.0220 to <4 x float>* + %16 = load <4 x float>, <4 x float>* %15, align 4 + %.splatinsert = insertelement <4 x float> undef, float %4, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + %17 = fmul fast <4 x float> %16, %.splat + %arrayidx15 = getelementptr inbounds float, float* %pSamples.0220, i32 1 + %18 = bitcast float* %arrayidx15 to <4 x float>* + %19 = load <4 x float>, <4 x float>* %18, align 4 + %.splatinsert16 = insertelement <4 x float> undef, float %5, i32 0 + %.splat17 = shufflevector <4 x float> %.splatinsert16, <4 x float> undef, <4 x i32> zeroinitializer + %20 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %19, <4 x float> %.splat17, <4 x float> %17) + %arrayidx18 = getelementptr inbounds float, float* %pSamples.0220, i32 2 + %21 = bitcast float* %arrayidx18 to <4 x float>* + %22 = load <4 x float>, <4 x float>* %21, align 4 + %.splatinsert19 = insertelement <4 x float> undef, float %6, i32 0 + %.splat20 = shufflevector <4 x float> %.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer + %23 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %22, <4 x float> %.splat20, <4 x float> %20) + %arrayidx21 = getelementptr inbounds float, float* %pSamples.0220, i32 3 + %24 = bitcast float* 
%arrayidx21 to <4 x float>* + %25 = load <4 x float>, <4 x float>* %24, align 4 + %.splatinsert22 = insertelement <4 x float> undef, float %7, i32 0 + %.splat23 = shufflevector <4 x float> %.splatinsert22, <4 x float> undef, <4 x i32> zeroinitializer + %26 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %25, <4 x float> %.splat23, <4 x float> %23) + %arrayidx24 = getelementptr inbounds float, float* %pSamples.0220, i32 4 + %27 = bitcast float* %arrayidx24 to <4 x float>* + %28 = load <4 x float>, <4 x float>* %27, align 4 + %.splatinsert25 = insertelement <4 x float> undef, float %8, i32 0 + %.splat26 = shufflevector <4 x float> %.splatinsert25, <4 x float> undef, <4 x i32> zeroinitializer + %29 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %28, <4 x float> %.splat26, <4 x float> %26) + %arrayidx27 = getelementptr inbounds float, float* %pSamples.0220, i32 5 + %30 = bitcast float* %arrayidx27 to <4 x float>* + %31 = load <4 x float>, <4 x float>* %30, align 4 + %.splatinsert28 = insertelement <4 x float> undef, float %9, i32 0 + %.splat29 = shufflevector <4 x float> %.splatinsert28, <4 x float> undef, <4 x i32> zeroinitializer + %32 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %31, <4 x float> %.splat29, <4 x float> %29) + %arrayidx30 = getelementptr inbounds float, float* %pSamples.0220, i32 6 + %33 = bitcast float* %arrayidx30 to <4 x float>* + %34 = load <4 x float>, <4 x float>* %33, align 4 + %.splatinsert31 = insertelement <4 x float> undef, float %10, i32 0 + %.splat32 = shufflevector <4 x float> %.splatinsert31, <4 x float> undef, <4 x i32> zeroinitializer + %35 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %34, <4 x float> %.splat32, <4 x float> %32) + %arrayidx33 = getelementptr inbounds float, float* %pSamples.0220, i32 7 + %36 = bitcast float* %arrayidx33 to <4 x float>* + %37 = load <4 x float>, <4 x float>* %36, align 4 + %.splatinsert34 = insertelement <4 x float> undef, float %11, i32 0 + %.splat35 = shufflevector <4 x float> %.splatinsert34, <4 x float> undef, <4 x i32> zeroinitializer + %38 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %37, <4 x float> %.splat35, <4 x float> %35) + %pSamples.1200 = getelementptr inbounds float, float* %pSamples.0220, i32 8 + br i1 %cmp38201, label %for.body, label %for.end + +for.body: ; preds = %while.body, %for.body + %pSamples.1207 = phi float* [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ] + %pCoeffsCur.0206 = phi float* [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ] + %.pn205 = phi float* [ %pCoeffsCur.0206, %for.body ], [ %1, %while.body ] + %i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ] + %vecAcc0.0203 = phi <4 x float> [ %70, %for.body ], [ %38, %while.body ] + %pSamples.0.pn202 = phi float* [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ] + %incdec.ptr40 = getelementptr inbounds float, float* %.pn205, i32 9 + %39 = load float, float* %pCoeffsCur.0206, align 4 + %incdec.ptr41 = getelementptr inbounds float, float* %.pn205, i32 10 + %40 = load float, float* %incdec.ptr40, align 4 + %incdec.ptr42 = getelementptr inbounds float, float* %.pn205, i32 11 + %41 = load float, float* %incdec.ptr41, align 4 + %incdec.ptr43 = getelementptr inbounds float, float* %.pn205, i32 12 + %42 = load float, float* %incdec.ptr42, align 4 + %incdec.ptr44 = getelementptr inbounds float, float* %.pn205, i32 13 + %43 = load float, float* %incdec.ptr43, align 4 + %incdec.ptr45 = getelementptr inbounds float, float* %.pn205, i32 14 + %44 = load float, 
float* %incdec.ptr44, align 4 + %incdec.ptr46 = getelementptr inbounds float, float* %.pn205, i32 15 + %45 = load float, float* %incdec.ptr45, align 4 + %46 = load float, float* %incdec.ptr46, align 4 + %47 = bitcast float* %pSamples.1207 to <4 x float>* + %48 = load <4 x float>, <4 x float>* %47, align 4 + %.splatinsert48 = insertelement <4 x float> undef, float %39, i32 0 + %.splat49 = shufflevector <4 x float> %.splatinsert48, <4 x float> undef, <4 x i32> zeroinitializer + %49 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %48, <4 x float> %.splat49, <4 x float> %vecAcc0.0203) + %arrayidx50 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 9 + %50 = bitcast float* %arrayidx50 to <4 x float>* + %51 = load <4 x float>, <4 x float>* %50, align 4 + %.splatinsert51 = insertelement <4 x float> undef, float %40, i32 0 + %.splat52 = shufflevector <4 x float> %.splatinsert51, <4 x float> undef, <4 x i32> zeroinitializer + %52 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %51, <4 x float> %.splat52, <4 x float> %49) + %arrayidx53 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 10 + %53 = bitcast float* %arrayidx53 to <4 x float>* + %54 = load <4 x float>, <4 x float>* %53, align 4 + %.splatinsert54 = insertelement <4 x float> undef, float %41, i32 0 + %.splat55 = shufflevector <4 x float> %.splatinsert54, <4 x float> undef, <4 x i32> zeroinitializer + %55 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %54, <4 x float> %.splat55, <4 x float> %52) + %arrayidx56 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 11 + %56 = bitcast float* %arrayidx56 to <4 x float>* + %57 = load <4 x float>, <4 x float>* %56, align 4 + %.splatinsert57 = insertelement <4 x float> undef, float %42, i32 0 + %.splat58 = shufflevector <4 x float> %.splatinsert57, <4 x float> undef, <4 x i32> zeroinitializer + %58 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %57, <4 x float> %.splat58, <4 x float> %55) + %arrayidx59 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 12 + %59 = bitcast float* %arrayidx59 to <4 x float>* + %60 = load <4 x float>, <4 x float>* %59, align 4 + %.splatinsert60 = insertelement <4 x float> undef, float %43, i32 0 + %.splat61 = shufflevector <4 x float> %.splatinsert60, <4 x float> undef, <4 x i32> zeroinitializer + %61 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %60, <4 x float> %.splat61, <4 x float> %58) + %arrayidx62 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 13 + %62 = bitcast float* %arrayidx62 to <4 x float>* + %63 = load <4 x float>, <4 x float>* %62, align 4 + %.splatinsert63 = insertelement <4 x float> undef, float %44, i32 0 + %.splat64 = shufflevector <4 x float> %.splatinsert63, <4 x float> undef, <4 x i32> zeroinitializer + %64 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %63, <4 x float> %.splat64, <4 x float> %61) + %arrayidx65 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 14 + %65 = bitcast float* %arrayidx65 to <4 x float>* + %66 = load <4 x float>, <4 x float>* %65, align 4 + %.splatinsert66 = insertelement <4 x float> undef, float %45, i32 0 + %.splat67 = shufflevector <4 x float> %.splatinsert66, <4 x float> undef, <4 x i32> zeroinitializer + %67 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %66, <4 x float> %.splat67, <4 x float> %64) + %arrayidx68 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 15 + %68 = bitcast float* %arrayidx68 to <4 x float>* + %69 = load <4 x float>, <4 x float>* %68, align 4 + 
%.splatinsert69 = insertelement <4 x float> undef, float %46, i32 0 + %.splat70 = shufflevector <4 x float> %.splatinsert69, <4 x float> undef, <4 x i32> zeroinitializer + %70 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %69, <4 x float> %.splat70, <4 x float> %67) + %inc = add nuw nsw i32 %i.0204, 1 + %pCoeffsCur.0 = getelementptr inbounds float, float* %pCoeffsCur.0206, i32 8 + %pSamples.1 = getelementptr inbounds float, float* %pSamples.1207, i32 8 + %exitcond = icmp eq i32 %inc, %smax + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %while.body + %vecAcc0.0.lcssa = phi <4 x float> [ %38, %while.body ], [ %70, %for.body ] + %pCoeffsCur.0.lcssa = phi float* [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ] + %pSamples.1.lcssa = phi float* [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ] + br i1 %cmp74210, label %while.end, label %while.body76 + +while.body76: ; preds = %for.end, %while.body76 + %pCoeffsCur.1214 = phi float* [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ] + %vecAcc0.1213 = phi <4 x float> [ %74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ] + %numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ] + %pSamples.2211 = phi float* [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ] + %incdec.ptr77 = getelementptr inbounds float, float* %pCoeffsCur.1214, i32 1 + %71 = load float, float* %pCoeffsCur.1214, align 4 + %72 = bitcast float* %pSamples.2211 to <4 x float>* + %73 = load <4 x float>, <4 x float>* %72, align 4 + %.splatinsert78 = insertelement <4 x float> undef, float %71, i32 0 + %.splat79 = shufflevector <4 x float> %.splatinsert78, <4 x float> undef, <4 x i32> zeroinitializer + %74 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %73, <4 x float> %.splat79, <4 x float> %vecAcc0.1213) + %incdec.ptr80 = getelementptr inbounds float, float* %pSamples.2211, i32 1 + %dec = add nsw i32 %numCnt.0212, -1 + %cmp74 = icmp sgt i32 %numCnt.0212, 1 + br i1 %cmp74, label %while.body76, label %while.end.loopexit + +while.end.loopexit: ; preds = %while.body76 + %scevgep = getelementptr float, float* %pSamples.1.lcssa, i32 %and + br label %while.end + +while.end: ; preds = %while.end.loopexit, %for.end + %pSamples.2.lcssa = phi float* [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ] + %vecAcc0.1.lcssa = phi <4 x float> [ %vecAcc0.0.lcssa, %for.end ], [ %74, %while.end.loopexit ] + %75 = bitcast float* %pOutput.0218 to <4 x float>* + store <4 x float> %vecAcc0.1.lcssa, <4 x float>* %75, align 4 + %add.ptr81 = getelementptr inbounds float, float* %pOutput.0218, i32 4 + %add.ptr82 = getelementptr inbounds float, float* %pSamples.2.lcssa, i32 4 + %add.ptr83 = getelementptr inbounds float, float* %add.ptr82, i32 %idx.neg + %dec84 = add nsw i32 %blkCnt.0222, -1 + %cmp5 = icmp eq i32 %dec84, 0 + br i1 %cmp5, label %if.end, label %while.body + +if.end: ; preds = %while.end, %if.then, %entry + ret void +} + +declare void @llvm.assume(i1) +declare <4 x i1> @llvm.arm.mve.vctp32(i32) +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) +declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll +++ 
b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll @@ -18,7 +18,6 @@ ; CHECK-NEXT: invoke void @f() ; CHECK-NEXT: to label [[BLOCK3:%.*]] unwind label [[CATCH_DISPATCH:%.*]] ; CHECK: block3: -; CHECK-NEXT: store i32 30, i32* [[SV]] ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: catch.dispatch: ; CHECK-NEXT: [[CS1:%.*]] = catchswitch within none [label %catch] unwind label [[CLEANUP:%.*]] diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll @@ -27,10 +27,9 @@ define void @test14(i32* noalias %P) { ; CHECK-LABEL: @test14( ; CHECK-NEXT: entry: -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br label [[FOR:%.*]] ; CHECK: for: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: br i1 false, label [[FOR]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret void @@ -77,7 +76,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* -; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i1 false) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i1 false) ; CHECK-NEXT: br label [[FOR:%.*]] ; CHECK: for: ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 @@ -281,3 +281,36 @@ ret void } +%struct.hoge = type { i32, i32 } + +@global = external local_unnamed_addr global %struct.hoge*, align 8 + +define void @widget(i8* %tmp) { +; CHECK-LABEL: @widget( +; CHECK-NEXT: bb: +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[TMP:%.*]], i8* nonnull align 16 undef, i64 64, i1 false) +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP2:%.*]] = load %struct.hoge*, %struct.hoge** @global, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[TMP2]], i64 undef, i32 1 +; CHECK-NEXT: store i32 0, i32* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load %struct.hoge*, %struct.hoge** @global, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HOGE]], %struct.hoge* [[TMP4]], i64 undef, i32 1 +; CHECK-NEXT: store i32 10, i32* [[TMP5]], align 4 +; CHECK-NEXT: br label [[BB1]] +; +bb: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %tmp, i8* nonnull align 16 undef, i64 64, i1 false) + br label %bb1 + +bb1: ; preds = %bb1, %bb + %tmp2 = load %struct.hoge*, %struct.hoge** @global, align 8 + %tmp3 = getelementptr inbounds %struct.hoge, %struct.hoge* %tmp2, i64 undef, i32 1 + store i32 0, i32* %tmp3, align 4 + %tmp4 = load %struct.hoge*, %struct.hoge** @global, align 8 + %tmp5 = getelementptr inbounds %struct.hoge, %struct.hoge* %tmp4, i64 undef, i32 1 + store i32 10, i32* %tmp5, align 4 + br label %bb1 +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll @@ 
-33,13 +33,11 @@ ; CHECK-LABEL: @test5( ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: -; CHECK-NEXT: store i32 1, i32* [[P]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: ret void ; br i1 true, label %bb1, label %bb2 @@ -58,13 +56,12 @@ ; CHECK-LABEL: @test8( ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: store i32 1, i32* [[Q:%.*]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: ret void ; br i1 true, label %bb1, label %bb2 @@ -115,7 +112,6 @@ ; CHECK: bb1: ; CHECK-NEXT: br i1 [[C2:%.*]], label [[BB2:%.*]], label [[BB3]] ; CHECK: bb2: -; CHECK-NEXT: store i32 -1, i32* [[PTR:%.*]], align 4 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: br label [[BB4:%.*]] @@ -126,7 +122,7 @@ ; CHECK-NEXT: i32 2, label [[BB7:%.*]] ; CHECK-NEXT: ] ; CHECK: bb5: -; CHECK-NEXT: store i32 0, i32* [[PTR]], align 4 +; CHECK-NEXT: store i32 0, i32* [[PTR:%.*]], align 4 ; CHECK-NEXT: br label [[BB8]] ; CHECK: bb6: ; CHECK-NEXT: store i32 1, i32* [[PTR]], align 4 @@ -173,3 +169,34 @@ bb8: ; preds = %bb7, %bb6, %bb5, %bb4 br label %bb4 } + + +declare void @fn1_test11() +declare void @fn2_test11() + +define void @test11(i1 %c, i8** %ptr.1) { +; CHECK-LABEL: @test11( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @fn2_test11() #0 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: store i8* null, i8** [[PTR_1:%.*]], align 8 +; CHECK-NEXT: tail call void @fn2_test11() #0 +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %if.then, label %exit + +if.then: ; preds = %entry + tail call void @fn2_test11() #1 + br label %exit + +exit: + store i8* null, i8** %ptr.1, align 8 + tail call void @fn2_test11() #1 + ret void +} + +attributes #1 = { nounwind } diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll @@ -127,7 +127,7 @@ ; CHECK: bb1: ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: -; CHECK-NEXT: ret void +; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: store i32 2, i32* [[P]] ; CHECK-NEXT: ret void @@ -142,8 +142,109 @@ bb1: br label %bb3 bb2: - ret void + br label %bb3 bb3: store i32 2, i32* %P ret void } + +define void @test10(i32* %P) { +; CHECK-LABEL: @test10( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 0, i32* %P + br label %bb3 +bb2: + ret void +bb3: + ret void +} + + +define void @test11() { +; CHECK-LABEL: @test11( +; CHECK-NEXT: [[P:%.*]] = alloca i32 +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 
0, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: ret void +; + %P = alloca i32 + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 0, i32* %P + br label %bb3 +bb2: + ret void +bb3: + ret void +} + + +define void @test12(i32* %P) { +; CHECK-LABEL: @test12( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 1, i32* %P + br label %bb3 +bb2: + store i32 1, i32* %P + ret void +bb3: + ret void +} + + +define void @test13(i32* %P) { +; CHECK-LABEL: @test13( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 1, i32* %P + br label %bb3 +bb2: + store i32 1, i32* %P + br label %bb3 +bb3: + ret void +} diff --git a/llvm/test/Transforms/InstCombine/align-attr.ll b/llvm/test/Transforms/InstCombine/align-attr.ll --- a/llvm/test/Transforms/InstCombine/align-attr.ll +++ b/llvm/test/Transforms/InstCombine/align-attr.ll @@ -20,7 +20,7 @@ ; CHECK-LABEL: @foo2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[V:%.*]] = call i32* @func1(i32* [[A:%.*]]) -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[V]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 32 ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/call-returned.ll b/llvm/test/Transforms/InstCombine/call-returned.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/call-returned.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -instcombine -expensive-combines=0 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-OFF +; RUN: opt -S -instcombine -expensive-combines=1 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-ON + +declare i32 @passthru_i32(i32 returned) +declare i8* @passthru_p8(i8* returned) + +define i32 @returned_const_int_arg() { +; CHECK-LABEL: @returned_const_int_arg( +; CHECK-NEXT: [[X:%.*]] = call i32 @passthru_i32(i32 42) +; CHECK-NEXT: ret i32 42 +; + %x = call i32 @passthru_i32(i32 42) + ret i32 %x +} + +define i8* @returned_const_ptr_arg() { +; CHECK-LABEL: @returned_const_ptr_arg( +; CHECK-NEXT: [[X:%.*]] = call i8* @passthru_p8(i8* null) +; CHECK-NEXT: ret i8* null +; + %x = call i8* @passthru_p8(i8* null) + ret i8* %x +} + +define i32 @returned_var_arg(i32 %arg) { +; CHECK-LABEL: @returned_var_arg( +; CHECK-NEXT: [[X:%.*]] = call i32 @passthru_i32(i32 [[ARG:%.*]]) +; CHECK-NEXT: ret i32 [[ARG]] +; + %x = call i32 @passthru_i32(i32 %arg) + ret i32 %x +} + +define i32 @returned_const_int_arg_musttail(i32 %arg) { +; CHECK-LABEL: @returned_const_int_arg_musttail( +; CHECK-NEXT: [[X:%.*]] = musttail call i32 @passthru_i32(i32 42) +; CHECK-NEXT: ret i32 [[X]] +; + %x = musttail call i32 @passthru_i32(i32 42) + ret i32 %x +} + +define i32 @returned_var_arg_musttail(i32 %arg) { +; CHECK-LABEL: @returned_var_arg_musttail( +; CHECK-NEXT: 
[[X:%.*]] = musttail call i32 @passthru_i32(i32 [[ARG:%.*]]) +; CHECK-NEXT: ret i32 [[X]] +; + %x = musttail call i32 @passthru_i32(i32 %arg) + ret i32 %x +} diff --git a/llvm/test/Transforms/InstCombine/expensive-combines.ll b/llvm/test/Transforms/InstCombine/expensive-combines.ll --- a/llvm/test/Transforms/InstCombine/expensive-combines.ll +++ b/llvm/test/Transforms/InstCombine/expensive-combines.ll @@ -16,7 +16,7 @@ ; ; EXPENSIVE-OFF-LABEL: @test( ; EXPENSIVE-OFF-NEXT: [[CALL:%.*]] = call i32 @passthru(i32 0) -; EXPENSIVE-OFF-NEXT: call void @sink(i32 [[CALL]]) +; EXPENSIVE-OFF-NEXT: call void @sink(i32 0) ; EXPENSIVE-OFF-NEXT: ret void ; %call = call i32 @passthru(i32 0) diff --git a/llvm/test/Transforms/InstCombine/fortify-folding.ll b/llvm/test/Transforms/InstCombine/fortify-folding.ll --- a/llvm/test/Transforms/InstCombine/fortify-folding.ll +++ b/llvm/test/Transforms/InstCombine/fortify-folding.ll @@ -82,7 +82,7 @@ define i8* @test_strcat() { ; CHECK-LABEL: @test_strcat( ; CHECK-NEXT: [[STRCAT:%.*]] = call i8* @strcat(i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @a, i64 0, i64 0), i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @b, i64 0, i64 0)) -; CHECK-NEXT: ret i8* [[STRCAT]] +; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i64 0, i64 0) ; %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0 %src = getelementptr inbounds [60 x i8], [60 x i8]* @b, i32 0, i32 0 @@ -126,7 +126,7 @@ define i8* @test_strncat() { ; CHECK-LABEL: @test_strncat( ; CHECK-NEXT: [[STRNCAT:%.*]] = call i8* @strncat(i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @a, i64 0, i64 0), i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @b, i64 0, i64 0), i64 22) -; CHECK-NEXT: ret i8* [[STRNCAT]] +; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i64 0, i64 0) ; %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0 %src = getelementptr inbounds [60 x i8], [60 x i8]* @b, i32 0, i32 0 diff --git a/llvm/test/Transforms/InstCombine/strcpy_chk-1.ll b/llvm/test/Transforms/InstCombine/strcpy_chk-1.ll --- a/llvm/test/Transforms/InstCombine/strcpy_chk-1.ll +++ b/llvm/test/Transforms/InstCombine/strcpy_chk-1.ll @@ -53,7 +53,7 @@ define i8* @test_simplify4() { ; CHECK-LABEL: @test_simplify4( ; CHECK-NEXT: [[STRCPY:%.*]] = call i8* @strcpy(i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @b, i32 0, i32 0)) -; CHECK-NEXT: ret i8* [[STRCPY]] +; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0) ; %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0 %src = getelementptr inbounds [60 x i8], [60 x i8]* @b, i32 0, i32 0 diff --git a/llvm/test/Transforms/InstCombine/strncpy_chk-1.ll b/llvm/test/Transforms/InstCombine/strncpy_chk-1.ll --- a/llvm/test/Transforms/InstCombine/strncpy_chk-1.ll +++ b/llvm/test/Transforms/InstCombine/strncpy_chk-1.ll @@ -39,7 +39,7 @@ define i8* @test_simplify3() { ; CHECK-LABEL: @test_simplify3( ; CHECK-NEXT: [[STRNCPY:%.*]] = call i8* @strncpy(i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @b, i32 0, i32 0), i32 12) -; CHECK-NEXT: ret i8* [[STRNCPY]] +; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8], [60 x i8]* 
@a, i32 0, i32 0) ; %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0 %src = getelementptr inbounds [60 x i8], [60 x i8]* @b, i32 0, i32 0 diff --git a/llvm/test/Transforms/InstCombine/unused-nonnull.ll b/llvm/test/Transforms/InstCombine/unused-nonnull.ll --- a/llvm/test/Transforms/InstCombine/unused-nonnull.ll +++ b/llvm/test/Transforms/InstCombine/unused-nonnull.ll @@ -12,13 +12,8 @@ ; CHECK-SAME: (i32 [[ARGC:%.*]], i8** nocapture readnone [[ARGV:%.*]]) local_unnamed_addr #0 ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = icmp slt i32 [[ARGC]], 2 -; CHECK-NEXT: br i1 [[TMP0]], label [[DONE:%.*]], label [[DO_WORK:%.*]] -; CHECK: do_work: -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @compute(i8* undef, i32 [[ARGC]]) -; CHECK-NEXT: br label [[DONE]] -; CHECK: done: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1]], [[DO_WORK]] ] -; CHECK-NEXT: ret i32 [[RETVAL]] +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP0]], i32 0, i32 [[ARGC]] +; CHECK-NEXT: ret i32 [[SPEC_SELECT]] ; entry: %0 = getelementptr inbounds i8*, i8** %argv, i32 0 diff --git a/llvm/test/Transforms/InstSimplify/call.ll b/llvm/test/Transforms/InstSimplify/call.ll --- a/llvm/test/Transforms/InstSimplify/call.ll +++ b/llvm/test/Transforms/InstSimplify/call.ll @@ -978,6 +978,10 @@ ret <2 x double> %r } +; We handle the "returned" attribute only in InstCombine, because the fact +; that this simplification may replace one call with another may cause issues +; for call graph passes. + declare i32 @passthru_i32(i32 returned) declare i8* @passthru_p8(i8* returned) diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-alloca.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-alloca.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-alloca.ll @@ -0,0 +1,94 @@ +;RUN: opt -mergesimilarfunc -mergesimilarfunc-level=all -S < %s | FileCheck %s +; +; Test whether mergefunc merges allocas of different sizes correctly +; + +target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32" + +%struct.A = type { i32, i32 } +%struct.B = type { i32, i32, i32 } + +; Function Attrs: nounwind optsize +define void @f1() #0 { +; CHECK-LABEL: @f1__merged( +; CHECK: alloca %struct.A +; CHECK: alloca %struct.B +entry: + %a = alloca %struct.A, align 4 + %0 = bitcast %struct.A* %a to i8* + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + ret void +} + +; Function Attrs: optsize +declare void @externalFun(i8*) #1 + +; Function Attrs: nounwind optsize +define void @f2() #0 { +entry: + %a = alloca %struct.B, align 4 + %0 = bitcast %struct.B* %a to i8* + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void 
@externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + ret void +} + +; Function Attrs: nounwind optsize +define void @f3() #0 { +entry: + %a = alloca i8, align 1 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + ret void +} + +attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind optsize } + diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info-2.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info-2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info-2.ll @@ -0,0 +1,101 @@ +; RUN: opt -S -mergesimilarfunc -mergesimilarfunc-diff-min-insts=5 < %s | FileCheck %s +; This used to fail with assertion in CloneFunction +; REQUIRES: asserts +; CHECK-LABEL: @foo__merged( + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios8.0.0" + +%struct.wibble = type { %struct.wibble.0*, %struct.wibble* } +%struct.wibble.0 = type { i64*, i8*, i64*, i64*, %struct.eggs*, %struct.wibble* } +%struct.eggs = type { %struct.wombat*, %struct.eggs* } +%struct.wombat = type { i8*, %struct.blam*, %struct.blam* } +%struct.blam = type { i8*, %struct.blam* } +%struct.snork = type { %struct.bar*, %struct.snork* } +%struct.bar = type { i64*, i8* } + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0 + +; Function Attrs: minsize nounwind optsize ssp uwtable +define hidden void @foo(%struct.wibble* %arg) #1 align 2 !dbg !4 { +bb: + %tmp = alloca %struct.wibble*, align 8 + %tmp2 = load %struct.wibble*, %struct.wibble** %tmp, align 8, !dbg !12 + %tmp3 = icmp ne %struct.wibble* %tmp2, null, !dbg !13 + br i1 %tmp3, label %bb4, label %bb13, !dbg !14 + +bb4: ; preds = %bb + call void @foo.1() #3, !dbg !26 + unreachable + +bb13: ; preds = %bb + ret void, !dbg !27 +} + +; Function Attrs: minsize nounwind optsize ssp uwtable +declare hidden void @foo.1() #1 align 2 + +; Function Attrs: minsize nounwind optsize ssp uwtable +define void @quux() unnamed_addr #1 align 2 !dbg !28 { +bb: + ret void +} + +; Function Attrs: minsize nounwind optsize ssp uwtable +define hidden void @baz(%struct.snork* %arg) #1 align 2 !dbg !30 { +bb: + %tmp = alloca %struct.snork*, align 8 + %tmp2 = load %struct.snork*, %struct.snork** %tmp, align 8, !dbg !31 + %tmp3 = icmp ne 
%struct.snork* %tmp2, null, !dbg !32 + br i1 %tmp3, label %bb4, label %bb13, !dbg !33 + +bb4: ; preds = %bb + call void @blam() #3, !dbg !42 + unreachable + +bb13: ; preds = %bb + ret void, !dbg !43 +} + +; Function Attrs: minsize nounwind optsize ssp uwtable +declare hidden void @blam() #1 align 2 + +attributes #0 = { argmemonly nounwind } +attributes #1 = { minsize nounwind optsize ssp uwtable } +attributes #2 = { nounwind } +attributes #3 = { minsize optsize } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "(based on LLVM 5.0.0)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "foo.cpp", directory: "/") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = distinct !DISubprogram(name: "delete", scope: !5, file: !5, line: 31, type: !6, isLocal: false, isDefinition: true, scopeLine: 32, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!5 = !DIFile(filename: "foo.h", directory: "/") +!6 = !DISubroutineType(types: !7) +!7 = !{} +!9 = !{!"any pointer", !10, i64 0} +!10 = !{!"omnipotent char", !11, i64 0} +!11 = !{!"Simple C++ TBAA"} +!12 = !DILocation(line: 33, column: 11, scope: !4) +!13 = !DILocation(line: 33, column: 16, scope: !4) +!14 = !DILocation(line: 33, column: 5, scope: !4) +!21 = !{!"_ZTSN", !9, i64 0, !9, i64 8} +!23 = !DILocation(line: 37, column: 25, scope: !4) +!24 = !DILocation(line: 37, column: 31, scope: !4) +!25 = !{!21, !9, i64 0} +!26 = !DILocation(line: 37, column: 7, scope: !4) +!27 = !DILocation(line: 41, column: 3, scope: !4) +!28 = distinct !DISubprogram(name: "~destruct", scope: !29, file: !29, line: 31, type: !6, isLocal: false, isDefinition: true, scopeLine: 32, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!29 = !DIFile(filename: "bar.h", directory: "/") +!30 = distinct !DISubprogram(name: "delete", scope: !5, file: !5, line: 31, type: !6, isLocal: false, isDefinition: true, scopeLine: 32, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!31 = !DILocation(line: 33, column: 11, scope: !30) +!32 = !DILocation(line: 33, column: 16, scope: !30) +!33 = !DILocation(line: 33, column: 5, scope: !30) +!40 = !DILocation(line: 37, column: 25, scope: !30) +!41 = !DILocation(line: 37, column: 31, scope: !30) +!42 = !DILocation(line: 37, column: 7, scope: !30) +!43 = !DILocation(line: 41, column: 3, scope: !30) diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info.ll @@ -0,0 +1,243 @@ +; This used to fail the verifier with the following error: +; "dbg attachment points at wrong subprogram for function" +; RUN: opt -S -mergesimilarfunc < %s | FileCheck %s +; REQUIRES: asserts +; CHECK-LABEL: @bar__merged( +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" + +%struct.str_type = type { i8*, i32, i32 } + +; Function Attrs: nounwind optsize +define i32 @bar(i8* %b) #0 !dbg !14 { +entry: + %retval = alloca i32, align 4 + %b.addr = alloca i8*, align 4 + %res = alloca i32, align 4 + %ee = alloca %struct.str_type*, align 4 + %cleanup.dest.slot = alloca i32 + store i8* %b, i8** %b.addr, align 4, !tbaa !31 + call void @llvm.dbg.declare(metadata i8** %b.addr, metadata !18, metadata !35), !dbg !36 + %0 = 
bitcast i32* %res to i8*, !dbg !37 + call void @llvm.lifetime.start(i64 4, i8* %0) #4, !dbg !37 + call void @llvm.dbg.declare(metadata i32* %res, metadata !19, metadata !35), !dbg !38 + %1 = bitcast %struct.str_type** %ee to i8*, !dbg !39 + call void @llvm.lifetime.start(i64 4, i8* %1) #4, !dbg !39 + call void @llvm.dbg.declare(metadata %struct.str_type** %ee, metadata !20, metadata !35), !dbg !40 + %2 = load i8*, i8** %b.addr, align 4, !dbg !41, !tbaa !31 + %3 = bitcast i8* %2 to %struct.str_type*, !dbg !42 + store %struct.str_type* %3, %struct.str_type** %ee, align 4, !dbg !40, !tbaa !31 + %4 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !43, !tbaa !31 + %set = getelementptr inbounds %struct.str_type, %struct.str_type* %4, i32 0, i32 1, !dbg !45 + %5 = load i32, i32* %set, align 4, !dbg !45, !tbaa !46 + %tobool = icmp ne i32 %5, 0, !dbg !43 + br i1 %tobool, label %if.end4, label %if.then, !dbg !49 + +if.then: ; preds = %entry + %6 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !50, !tbaa !31 + %set1 = getelementptr inbounds %struct.str_type, %struct.str_type* %6, i32 0, i32 1, !dbg !52 + store i32 1, i32* %set1, align 4, !dbg !53, !tbaa !46 + %7 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !54, !tbaa !31 + %x = getelementptr inbounds %struct.str_type, %struct.str_type* %7, i32 0, i32 0, !dbg !55 + %8 = load i8*, i8** %x, align 4, !dbg !55, !tbaa !56 + %call = call i32 @foo(i8* %8) #5, !dbg !57 + store i32 %call, i32* %res, align 4, !dbg !58, !tbaa !59 + %9 = load i32, i32* %res, align 4, !dbg !60, !tbaa !59 + %tobool2 = icmp ne i32 %9, 0, !dbg !60 + br i1 %tobool2, label %if.then3, label %if.end, !dbg !62 + +if.then3: ; preds = %if.then + store i32 1, i32* %retval, align 4, !dbg !63 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup, !dbg !63 + +if.end: ; preds = %if.then + br label %if.end4, !dbg !64 + +if.end4: ; preds = %if.end, %entry + store i32 0, i32* %retval, align 4, !dbg !65 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup, !dbg !65 + +cleanup: ; preds = %if.end4, %if.then3 + %10 = bitcast %struct.str_type** %ee to i8*, !dbg !66 + call void @llvm.lifetime.end(i64 4, i8* %10) #4, !dbg !66 + %11 = bitcast i32* %res to i8*, !dbg !66 + call void @llvm.lifetime.end(i64 4, i8* %11) #4, !dbg !66 + %12 = load i32, i32* %retval, align 4, !dbg !66 + ret i32 %12, !dbg !66 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #2 + +; Function Attrs: optsize +declare i32 @foo(i8*) #3 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #2 + +; Function Attrs: nounwind optsize +define i32 @bar1(i8* %b) #0 !dbg !21 { +entry: + %retval = alloca i32, align 4 + %b.addr = alloca i8*, align 4 + %res = alloca i32, align 4 + %ee = alloca %struct.str_type*, align 4 + %cleanup.dest.slot = alloca i32 + store i8* %b, i8** %b.addr, align 4, !tbaa !31 + call void @llvm.dbg.declare(metadata i8** %b.addr, metadata !23, metadata !35), !dbg !67 + %0 = bitcast i32* %res to i8*, !dbg !68 + call void @llvm.lifetime.start(i64 4, i8* %0) #4, !dbg !68 + call void @llvm.dbg.declare(metadata i32* %res, metadata !24, metadata !35), !dbg !69 + %1 = bitcast %struct.str_type** %ee to i8*, !dbg !70 + call void @llvm.lifetime.start(i64 4, i8* %1) #4, !dbg !70 + call void @llvm.dbg.declare(metadata %struct.str_type** %ee, metadata !25, metadata 
!35), !dbg !71 + %2 = load i8*, i8** %b.addr, align 4, !dbg !72, !tbaa !31 + %3 = bitcast i8* %2 to %struct.str_type*, !dbg !73 + store %struct.str_type* %3, %struct.str_type** %ee, align 4, !dbg !71, !tbaa !31 + %4 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !74, !tbaa !31 + %get = getelementptr inbounds %struct.str_type, %struct.str_type* %4, i32 0, i32 2, !dbg !76 + %5 = load i32, i32* %get, align 4, !dbg !76, !tbaa !77 + %tobool = icmp ne i32 %5, 0, !dbg !74 + br i1 %tobool, label %if.end4, label %if.then, !dbg !78 + +if.then: ; preds = %entry + %6 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !79, !tbaa !31 + %get1 = getelementptr inbounds %struct.str_type, %struct.str_type* %6, i32 0, i32 2, !dbg !81 + store i32 1, i32* %get1, align 4, !dbg !82, !tbaa !77 + %7 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !83, !tbaa !31 + %x = getelementptr inbounds %struct.str_type, %struct.str_type* %7, i32 0, i32 0, !dbg !84 + %8 = load i8*, i8** %x, align 4, !dbg !84, !tbaa !56 + %call = call i32 @foo(i8* %8) #5, !dbg !85 + store i32 %call, i32* %res, align 4, !dbg !86, !tbaa !59 + %9 = load i32, i32* %res, align 4, !dbg !87, !tbaa !59 + %tobool2 = icmp ne i32 %9, 0, !dbg !87 + br i1 %tobool2, label %if.then3, label %if.end, !dbg !89 + +if.then3: ; preds = %if.then + store i32 1, i32* %retval, align 4, !dbg !90 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup, !dbg !90 + +if.end: ; preds = %if.then + br label %if.end4, !dbg !91 + +if.end4: ; preds = %if.end, %entry + store i32 0, i32* %retval, align 4, !dbg !92 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup, !dbg !92 + +cleanup: ; preds = %if.end4, %if.then3 + %10 = bitcast %struct.str_type** %ee to i8*, !dbg !93 + call void @llvm.lifetime.end(i64 4, i8* %10) #4, !dbg !93 + %11 = bitcast i32* %res to i8*, !dbg !93 + call void @llvm.lifetime.end(i64 4, i8* %11) #4, !dbg !93 + %12 = load i32, i32* %retval, align 4, !dbg !93 + ret i32 %12, !dbg !93 +} + +attributes #0 = { nounwind optsize } +attributes #1 = { nounwind readnone } +attributes #2 = { argmemonly nounwind } +attributes #3 = { optsize } +attributes #4 = { nounwind } +attributes #5 = { optsize } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!26, !27} +!llvm.ident = !{!28} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Clang $LLVM_VERSION_MAJOR.$LLVM_VERSION_MINOR (based on LLVM 3.9.0)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !3) +!1 = !DIFile(filename: "test.i", directory: "/local/mnt/") +!2 = !{} +!3 = !{!4} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 32, align: 32) +!5 = !DIDerivedType(tag: DW_TAG_typedef, name: "str_type", file: !1, line: 8, baseType: !6) +!6 = !DICompositeType(tag: DW_TAG_structure_type, name: "str_type", file: !1, line: 3, size: 96, align: 32, elements: !7) +!7 = !{!8, !10, !12} +!8 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !6, file: !1, line: 5, baseType: !9, size: 32, align: 32) +!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 32, align: 32) +!10 = !DIDerivedType(tag: DW_TAG_member, name: "set", scope: !6, file: !1, line: 6, baseType: !11, size: 32, align: 32, offset: 32) +!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!12 = !DIDerivedType(tag: DW_TAG_member, name: "get", scope: !6, file: !1, line: 7, baseType: !11, size: 32, align: 32, offset: 64) +!14 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, 
line: 10, type: !15, isLocal: false, isDefinition: true, scopeLine: 10, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !17) +!15 = !DISubroutineType(types: !16) +!16 = !{!11, !9} +!17 = !{!18, !19, !20} +!18 = !DILocalVariable(name: "b", arg: 1, scope: !14, file: !1, line: 10, type: !9) +!19 = !DILocalVariable(name: "res", scope: !14, file: !1, line: 11, type: !11) +!20 = !DILocalVariable(name: "ee", scope: !14, file: !1, line: 12, type: !4) +!21 = distinct !DISubprogram(name: "bar1", scope: !1, file: !1, line: 24, type: !15, isLocal: false, isDefinition: true, scopeLine: 24, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !22) +!22 = !{!23, !24, !25} +!23 = !DILocalVariable(name: "b", arg: 1, scope: !21, file: !1, line: 24, type: !9) +!24 = !DILocalVariable(name: "res", scope: !21, file: !1, line: 25, type: !11) +!25 = !DILocalVariable(name: "ee", scope: !21, file: !1, line: 26, type: !4) +!26 = !{i32 2, !"Dwarf Version", i32 4} +!27 = !{i32 2, !"Debug Info Version", i32 3} +!28 = !{!"Clang $LLVM_VERSION_MAJOR.$LLVM_VERSION_MINOR (based on LLVM 3.9.0)"} +!31 = !{!32, !32, i64 0} +!32 = !{!"any pointer", !33, i64 0} +!33 = !{!"omnipotent char", !34, i64 0} +!34 = !{!"Simple C/C++ TBAA"} +!35 = !DIExpression() +!36 = !DILocation(line: 10, column: 23, scope: !14) +!37 = !DILocation(line: 11, column: 3, scope: !14) +!38 = !DILocation(line: 11, column: 7, scope: !14) +!39 = !DILocation(line: 12, column: 3, scope: !14) +!40 = !DILocation(line: 12, column: 13, scope: !14) +!41 = !DILocation(line: 12, column: 31, scope: !14) +!42 = !DILocation(line: 12, column: 19, scope: !14) +!43 = !DILocation(line: 14, column: 8, scope: !44) +!44 = distinct !DILexicalBlock(scope: !14, file: !1, line: 14, column: 7) +!45 = !DILocation(line: 14, column: 12, scope: !44) +!46 = !{!47, !48, i64 4} +!47 = !{!"str_type", !32, i64 0, !48, i64 4, !48, i64 8} +!48 = !{!"int", !33, i64 0} +!49 = !DILocation(line: 14, column: 7, scope: !14) +!50 = !DILocation(line: 15, column: 5, scope: !51) +!51 = distinct !DILexicalBlock(scope: !44, file: !1, line: 14, column: 18) +!52 = !DILocation(line: 15, column: 9, scope: !51) +!53 = !DILocation(line: 15, column: 13, scope: !51) +!54 = !DILocation(line: 17, column: 16, scope: !51) +!55 = !DILocation(line: 17, column: 20, scope: !51) +!56 = !{!47, !32, i64 0} +!57 = !DILocation(line: 17, column: 11, scope: !51) +!58 = !DILocation(line: 17, column: 9, scope: !51) +!59 = !{!48, !48, i64 0} +!60 = !DILocation(line: 18, column: 9, scope: !61) +!61 = distinct !DILexicalBlock(scope: !51, file: !1, line: 18, column: 9) +!62 = !DILocation(line: 18, column: 9, scope: !51) +!63 = !DILocation(line: 19, column: 7, scope: !61) +!64 = !DILocation(line: 20, column: 3, scope: !51) +!65 = !DILocation(line: 21, column: 3, scope: !14) +!66 = !DILocation(line: 22, column: 1, scope: !14) +!67 = !DILocation(line: 24, column: 24, scope: !21) +!68 = !DILocation(line: 25, column: 3, scope: !21) +!69 = !DILocation(line: 25, column: 7, scope: !21) +!70 = !DILocation(line: 26, column: 3, scope: !21) +!71 = !DILocation(line: 26, column: 13, scope: !21) +!72 = !DILocation(line: 26, column: 31, scope: !21) +!73 = !DILocation(line: 26, column: 19, scope: !21) +!74 = !DILocation(line: 28, column: 8, scope: !75) +!75 = distinct !DILexicalBlock(scope: !21, file: !1, line: 28, column: 7) +!76 = !DILocation(line: 28, column: 12, scope: !75) +!77 = !{!47, !48, i64 8} +!78 = !DILocation(line: 28, column: 7, scope: !21) +!79 = !DILocation(line: 29, column: 5, scope: !80) +!80 = distinct 
!DILexicalBlock(scope: !75, file: !1, line: 28, column: 17) +!81 = !DILocation(line: 29, column: 9, scope: !80) +!82 = !DILocation(line: 29, column: 13, scope: !80) +!83 = !DILocation(line: 31, column: 16, scope: !80) +!84 = !DILocation(line: 31, column: 20, scope: !80) +!85 = !DILocation(line: 31, column: 11, scope: !80) +!86 = !DILocation(line: 31, column: 9, scope: !80) +!87 = !DILocation(line: 32, column: 9, scope: !88) +!88 = distinct !DILexicalBlock(scope: !80, file: !1, line: 32, column: 9) +!89 = !DILocation(line: 32, column: 9, scope: !80) +!90 = !DILocation(line: 33, column: 7, scope: !88) +!91 = !DILocation(line: 34, column: 3, scope: !80) +!92 = !DILocation(line: 35, column: 3, scope: !21) +!93 = !DILocation(line: 36, column: 1, scope: !21) diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent-template.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent-template.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent-template.ll @@ -0,0 +1,138 @@ +; RUN: opt -S -mergesimilarfunc %s -o - | FileCheck %s + +; CHECK: define linkonce_odr void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* nocapture readnone %this, %struct.FooStruct* %arg0) #1 align 2 { +; CHECK-NEXT: entry: +; CHECK-NEXT: %field27 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0, i32 0, i32 2 +; CHECK-NEXT: %0 = load i8, i8* %field27, align 1 +; CHECK-NEXT: %lnot8 = icmp eq i8 %0, 0 +; CHECK-NEXT: br i1 %lnot8, label %for.body, label %for.end + +; CHECK: for.body: ; preds = %for.body, %entry +; CHECK-NEXT: %arg0.addr.09 = phi %struct.FooStruct* [ %2, %for.body ], [ %arg0, %entry ] +; CHECK-NEXT: %field1 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 1 +; CHECK-NEXT: %1 = load %struct.FooStruct*, %struct.FooStruct** %field1, align 4 +; CHECK-NEXT: tail call void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* %this, %struct.FooStruct* %1) #2 +; CHECK-NEXT: %field0 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 0 +; CHECK-NEXT: %2 = load %struct.FooStruct*, %struct.FooStruct** %field0, align 4 +; CHECK-NEXT: %3 = bitcast %struct.FooStruct* %arg0.addr.09 to i8* +; CHECK-NEXT: tail call void @_Z4bar0Pv(i8* %3) #3 +; CHECK-NEXT: tail call void @_Z4bar1Pv(i8* %3) #3 +; CHECK-NEXT: %field2 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %2, i32 0, i32 2 +; CHECK-NEXT: %4 = load i8, i8* %field2, align 1 +; CHECK-NEXT: %lnot = icmp eq i8 %4, 0 +; CHECK-NEXT: br i1 %lnot, label %for.body, label %for.end + +; CHECK: for.end: ; preds = %for.body, %entry +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define linkonce_odr void @_ZN11FooTemplateIiE7method1EP9FooStruct(%class.FooTemplate* nocapture readnone, %struct.FooStruct*) #1 align 2 { +; CHECK-NEXT: %3 = bitcast %class.FooTemplate* %0 to %class.FooTemplate.0* +; CHECK-NEXT: tail call void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* nocapture readnone %3, %struct.FooStruct* %1) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: attributes #0 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +; CHECK-NEXT: attributes #1 = { nounwind optsize ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" 
"no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +; CHECK-NEXT: attributes #2 = { optsize } +; CHECK-NEXT: attributes #3 = { nounwind optsize } + +target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32" + +%class.Foo = type { %class.FooTemplate, %class.FooTemplate.0 } +%class.FooTemplate = type { i8 } +%class.FooTemplate.0 = type { i8 } +%struct.FooStruct = type { %struct.FooStruct*, %struct.FooStruct*, i8 } + +@_ZN3FooD1Ev = alias void(%class.Foo*), void (%class.Foo*)* @_ZN3FooD2Ev + +declare %struct.FooStruct* @_ZNK11FooTemplateIPvE7method2Ev(%class.FooTemplate.0*) #1 +declare void @_Z4bar0Pv(i8*) #1 +declare void @_Z4bar1Pv(i8*) #1 +declare %struct.FooStruct* @_ZNK11FooTemplateIiE7method2Ev(%class.FooTemplate*) #1 + +; Function Attrs: nounwind optsize ssp +define void @_ZN3FooD2Ev(%class.Foo* %this) unnamed_addr #0 align 2 { +entry: + %bar_things = getelementptr inbounds %class.Foo, %class.Foo* %this, i32 0, i32 0 + tail call void @_ZN11FooTemplateIiE7method0Ev(%class.FooTemplate* %bar_things) #2 + %baz_things = getelementptr inbounds %class.Foo, %class.Foo* %this, i32 0, i32 1 + tail call void @_ZN11FooTemplateIPvE7method0Ev(%class.FooTemplate.0* %baz_things) #2 + ret void +} + +; Function Attrs: nounwind optsize ssp +define linkonce_odr void @_ZN11FooTemplateIiE7method0Ev(%class.FooTemplate* %this) #0 align 2 { +entry: + %call = tail call %struct.FooStruct* @_ZNK11FooTemplateIiE7method2Ev(%class.FooTemplate* %this) #3 + tail call void @_ZN11FooTemplateIiE7method1EP9FooStruct(%class.FooTemplate* %this, %struct.FooStruct* %call) #2 + ret void +} + +; Function Attrs: nounwind optsize ssp +define linkonce_odr void @_ZN11FooTemplateIPvE7method0Ev(%class.FooTemplate.0* %this) #0 align 2 { +entry: + %call = tail call %struct.FooStruct* @_ZNK11FooTemplateIPvE7method2Ev(%class.FooTemplate.0* %this) #3 + tail call void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* %this, %struct.FooStruct* %call) #2 + ret void +} + +; Function Attrs: nounwind optsize ssp +define linkonce_odr void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* nocapture readnone %this, %struct.FooStruct* %arg0) #0 align 2 { +entry: + %field27 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0, i32 0, i32 2 + %0 = load i8, i8* %field27, align 1 + %lnot8 = icmp eq i8 %0, 0 + br i1 %lnot8, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %arg0.addr.09 = phi %struct.FooStruct* [ %2, %for.body ], [ %arg0, %entry ] + %field1 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 1 + %1 = load %struct.FooStruct*, %struct.FooStruct** %field1, align 4 + tail call void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* %this, %struct.FooStruct* %1) #2 + %field0 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 0 + %2 = load %struct.FooStruct*, %struct.FooStruct** %field0, align 4 + %3 = bitcast %struct.FooStruct* %arg0.addr.09 to i8* + tail call void @_Z4bar0Pv(i8* %3) #3 + tail call void @_Z4bar1Pv(i8* %3) #3 + %field2 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %2, i32 0, i32 2 + %4 = load i8, i8* %field2, align 1 + %lnot = icmp eq i8 %4, 0 + br i1 %lnot, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + +; Function Attrs: nounwind optsize ssp +define linkonce_odr void 
@_ZN11FooTemplateIiE7method1EP9FooStruct(%class.FooTemplate* nocapture readnone %this, %struct.FooStruct* %arg0) #0 align 2 { +entry: + %field27 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0, i32 0, i32 2 + %0 = load i8, i8* %field27, align 1 + %lnot8 = icmp eq i8 %0, 0 + br i1 %lnot8, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %arg0.addr.09 = phi %struct.FooStruct* [ %2, %for.body ], [ %arg0, %entry ] + %field1 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 1 + %1 = load %struct.FooStruct*, %struct.FooStruct** %field1, align 4 + tail call void @_ZN11FooTemplateIiE7method1EP9FooStruct(%class.FooTemplate* %this, %struct.FooStruct* %1) #2 + %field0 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 0 + %2 = load %struct.FooStruct*, %struct.FooStruct** %field0, align 4 + %3 = bitcast %struct.FooStruct* %arg0.addr.09 to i8* + tail call void @_Z4bar0Pv(i8* %3) #3 + tail call void @_Z4bar1Pv(i8* %3) #3 + %field2 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %2, i32 0, i32 2 + %4 = load i8, i8* %field2, align 1 + %lnot = icmp eq i8 %4, 0 + br i1 %lnot, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + +attributes #0 = { nounwind optsize ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { optsize } +attributes #3 = { nounwind optsize } + diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent.ll @@ -0,0 +1,226 @@ +; RUN: opt -mergesimilarfunc -S %s -o - | FileCheck %s +; +; CHECK: define i8* @foo_a(%struct.a_type* %arg0) #1 { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label %for.cond + +; CHECK: for.cond: ; preds = %for.inc24, %entry +; CHECK-NEXT: %i.0 = phi i8 [ 0, %entry ], [ %inc25, %for.inc24 ] +; CHECK-NEXT: %ptr0.0 = phi i8* [ null, %entry ], [ %ptr0.3, %for.inc24 ] +; CHECK-NEXT: %conv = zext i8 %i.0 to i32 +; CHECK-NEXT: %cmp = icmp slt i32 %conv, 16 +; CHECK-NEXT: br i1 %cmp, label %for.body, label %for.end26 + +; CHECK: for.body: ; preds = %for.cond +; CHECK-NEXT: %call = call i8* @bar4(i32 %conv) #2 +; CHECK-NEXT: %call4 = call i8* @bar0(i32 %conv) #2 +; CHECK-NEXT: %0 = bitcast i8* %call4 to %struct.a_type* +; CHECK-NEXT: %call5 = call i32 @bar1(i8* %call) #2 +; CHECK-NEXT: %tobool = icmp ne i32 %call5, 0 +; CHECK-NEXT: br i1 %tobool, label %for.cond6, label %for.inc24 + +; CHECK: for.cond6: ; preds = %for.inc, %for.body +; CHECK-NEXT: %k.0 = phi i8 [ %inc, %for.inc ], [ 0, %for.body ] +; CHECK-NEXT: %ptr0.1 = phi i8* [ %ptr0.2, %for.inc ], [ %call, %for.body ] +; CHECK-NEXT: %idxprom = zext i8 %k.0 to i32 +; CHECK-NEXT: %field4 = getelementptr inbounds %struct.a_type, %struct.a_type* %arg0, i32 0, i32 4 +; CHECK-NEXT: %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* %field4, i32 0, i32 %idxprom +; CHECK-NEXT: %1 = load i8*, i8** %arrayidx, align 4 +; CHECK-NEXT: %cmp7 = 
icmp ne i8* %1, null +; CHECK-NEXT: br i1 %cmp7, label %land.rhs, label %for.inc24 + +; CHECK: land.rhs: ; preds = %for.cond6 +; CHECK-NEXT: %field410 = getelementptr inbounds %struct.a_type, %struct.a_type* %0, i32 0, i32 4 +; CHECK-NEXT: %arrayidx11 = getelementptr inbounds [2 x i8*], [2 x i8*]* %field410, i32 0, i32 %idxprom +; CHECK-NEXT: %2 = load i8*, i8** %arrayidx11, align 4 +; CHECK-NEXT: %cmp12 = icmp ne i8* %2, null +; CHECK-NEXT: br i1 %cmp12, label %for.body14, label %for.inc24 + +; CHECK: for.body14: ; preds = %land.rhs +; CHECK-NEXT: %3 = bitcast %struct.a_type* %0 to i8* +; CHECK-NEXT: %4 = bitcast %struct.a_type* %arg0 to i8* +; CHECK-NEXT: %call15 = call i32 @bar2(i8* %3, i8* %4) #2 +; CHECK-NEXT: %tobool16 = icmp ne i32 %call15, 0 +; CHECK-NEXT: br i1 %tobool16, label %if.then17, label %for.inc + +; CHECK: if.then17: ; preds = %for.body14 +; CHECK-NEXT: %call18 = call i32 @bar3(i8* %3, i8* %4) #2 +; CHECK-NEXT: %tobool19 = icmp ne i32 %call18, 0 +; CHECK-NEXT: br i1 %tobool19, label %if.then20, label %for.inc + +; CHECK: if.then20: ; preds = %if.then17 +; CHECK-NEXT: br label %for.inc + +; CHECK: for.inc: ; preds = %if.then20, %if.then17, %for.body14 +; CHECK-NEXT: %ptr0.2 = phi i8* [ null, %if.then20 ], [ %ptr0.1, %if.then17 ], [ null, %for.body14 ] +; CHECK-NEXT: %inc = add i8 %k.0, 1 +; CHECK-NEXT: br label %for.cond6 + +; CHECK: for.inc24: ; preds = %land.rhs, %for.cond6, %for.body +; CHECK-NEXT: %ptr0.3 = phi i8* [ %ptr0.1, %land.rhs ], [ %ptr0.1, %for.cond6 ], [ null, %for.body ] +; CHECK-NEXT: %inc25 = add i8 %i.0, 1 +; CHECK-NEXT: br label %for.cond + +; CHECK: for.end26: ; preds = %for.cond +; CHECK-NEXT: ret i8* %ptr0.0 +; CHECK-NEXT: } + +; CHECK: ; Function Attrs: nounwind optsize ssp +; CHECK-NEXT: define i8* @foo_b(%struct.b_type*) #1 { +; CHECK-NEXT: %2 = bitcast %struct.b_type* %0 to %struct.a_type* +; CHECK-NEXT: %3 = tail call i8* @foo_a(%struct.a_type* %2) +; CHECK-NEXT: ret i8* %3 +; CHECK-NEXT: } + +; CHECK: attributes #0 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +; CHECK-NEXT: attributes #1 = { nounwind optsize ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +; CHECK-NEXT: attributes #2 = { optsize } + +target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32" + +%struct.a_type = type { i32, i32, i32, i8*, [2 x i8*] } +%struct.b_type = type { i32, i32, i32, i8*, [2 x i8*], i32 } + +; Function Attrs: optsize +declare i8* @bar4(i32) #1 +declare i8* @bar0(i32) #1 +declare i32 @bar1(i8*) #1 +declare i32 @bar2(i8*, i8*) #1 +declare i32 @bar3(i8*, i8*) #1 + +; Function Attrs: nounwind optsize ssp +define i8* @foo_a(%struct.a_type* %arg0) #0 { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc24, %entry + %i.0 = phi i8 [ 0, %entry ], [ %inc25, %for.inc24 ] + %ptr0.0 = phi i8* [ null, %entry ], [ %ptr0.3, %for.inc24 ] + %conv = zext i8 %i.0 to i32 + %cmp = icmp slt i32 %conv, 16 + br i1 %cmp, label %for.body, label %for.end26 + +for.body: ; preds = %for.cond + %call = call i8* @bar4(i32 %conv) #2 + %call4 = call i8* @bar0(i32 %conv) #2 + %0 = bitcast i8* %call4 to %struct.a_type* + %call5 = call i32 @bar1(i8* %call) #2 + %tobool = icmp ne i32 %call5, 0 + br 
i1 %tobool, label %for.cond6, label %for.inc24 + +for.cond6: ; preds = %for.body, %for.inc + %k.0 = phi i8 [ %inc, %for.inc ], [ 0, %for.body ] + %ptr0.1 = phi i8* [ %ptr0.2, %for.inc ], [ %call, %for.body ] + %idxprom = zext i8 %k.0 to i32 + %field4 = getelementptr inbounds %struct.a_type, %struct.a_type* %arg0, i32 0, i32 4 + %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* %field4, i32 0, i32 %idxprom + %1 = load i8*, i8** %arrayidx, align 4 + %cmp7 = icmp ne i8* %1, null + br i1 %cmp7, label %land.rhs, label %for.inc24 + +land.rhs: ; preds = %for.cond6 + %field410 = getelementptr inbounds %struct.a_type, %struct.a_type* %0, i32 0, i32 4 + %arrayidx11 = getelementptr inbounds [2 x i8*], [2 x i8*]* %field410, i32 0, i32 %idxprom + %2 = load i8*, i8** %arrayidx11, align 4 + %cmp12 = icmp ne i8* %2, null + br i1 %cmp12, label %for.body14, label %for.inc24 + +for.body14: ; preds = %land.rhs + %3 = bitcast %struct.a_type* %0 to i8* + %4 = bitcast %struct.a_type* %arg0 to i8* + %call15 = call i32 @bar2(i8* %3, i8* %4) #2 + %tobool16 = icmp ne i32 %call15, 0 + br i1 %tobool16, label %if.then17, label %for.inc + +if.then17: ; preds = %for.body14 + %call18 = call i32 @bar3(i8* %3, i8* %4) #2 + %tobool19 = icmp ne i32 %call18, 0 + br i1 %tobool19, label %if.then20, label %for.inc + +if.then20: ; preds = %if.then17 + br label %for.inc + +for.inc: ; preds = %for.body14, %if.then17, %if.then20 + %ptr0.2 = phi i8* [ null, %if.then20 ], [ %ptr0.1, %if.then17 ], [ null, %for.body14 ] + %inc = add i8 %k.0, 1 + br label %for.cond6 + +for.inc24: ; preds = %for.body, %for.cond6, %land.rhs + %ptr0.3 = phi i8* [ %ptr0.1, %land.rhs ], [ %ptr0.1, %for.cond6 ], [ null, %for.body ] + %inc25 = add i8 %i.0, 1 + br label %for.cond + +for.end26: ; preds = %for.cond + ret i8* %ptr0.0 +} + +; Function Attrs: nounwind optsize ssp +define i8* @foo_b(%struct.b_type* %arg0) #0 { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc24, %entry + %i.0 = phi i8 [ 0, %entry ], [ %inc25, %for.inc24 ] + %ptr0.0 = phi i8* [ null, %entry ], [ %ptr0.3, %for.inc24 ] + %conv = zext i8 %i.0 to i32 + %cmp = icmp slt i32 %conv, 16 + br i1 %cmp, label %for.body, label %for.end26 + +for.body: ; preds = %for.cond + %call = call i8* @bar4(i32 %conv) #2 + %call4 = call i8* @bar0(i32 %conv) #2 + %0 = bitcast i8* %call4 to %struct.b_type* + %call5 = call i32 @bar1(i8* %call) #2 + %tobool = icmp ne i32 %call5, 0 + br i1 %tobool, label %for.cond6, label %for.inc24 + +for.cond6: ; preds = %for.body, %for.inc + %k.0 = phi i8 [ %inc, %for.inc ], [ 0, %for.body ] + %ptr0.1 = phi i8* [ %ptr0.2, %for.inc ], [ %call, %for.body ] + %idxprom = zext i8 %k.0 to i32 + %field4 = getelementptr inbounds %struct.b_type, %struct.b_type* %arg0, i32 0, i32 4 + %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* %field4, i32 0, i32 %idxprom + %1 = load i8*, i8** %arrayidx, align 4 + %cmp7 = icmp ne i8* %1, null + br i1 %cmp7, label %land.rhs, label %for.inc24 + +land.rhs: ; preds = %for.cond6 + %field410 = getelementptr inbounds %struct.b_type, %struct.b_type* %0, i32 0, i32 4 + %arrayidx11 = getelementptr inbounds [2 x i8*], [2 x i8*]* %field410, i32 0, i32 %idxprom + %2 = load i8*, i8** %arrayidx11, align 4 + %cmp12 = icmp ne i8* %2, null + br i1 %cmp12, label %for.body14, label %for.inc24 + +for.body14: ; preds = %land.rhs + %3 = bitcast %struct.b_type* %0 to i8* + %4 = bitcast %struct.b_type* %arg0 to i8* + %call15 = call i32 @bar2(i8* %3, i8* %4) #2 + %tobool16 = icmp ne i32 %call15, 0 + br i1 %tobool16, label %if.then17, label %for.inc + 
+if.then17: ; preds = %for.body14 + %call18 = call i32 @bar3(i8* %3, i8* %4) #2 + %tobool19 = icmp ne i32 %call18, 0 + br i1 %tobool19, label %if.then20, label %for.inc + +if.then20: ; preds = %if.then17 + br label %for.inc + +for.inc: ; preds = %for.body14, %if.then17, %if.then20 + %ptr0.2 = phi i8* [ null, %if.then20 ], [ %ptr0.1, %if.then17 ], [ null, %for.body14 ] + %inc = add i8 %k.0, 1 + br label %for.cond6 + +for.inc24: ; preds = %for.body, %for.cond6, %land.rhs + %ptr0.3 = phi i8* [ %ptr0.1, %land.rhs ], [ %ptr0.1, %for.cond6 ], [ null, %for.body ] + %inc25 = add i8 %i.0, 1 + br label %for.cond + +for.end26: ; preds = %for.cond + ret i8* %ptr0.0 +} + +attributes #0 = { nounwind optsize ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { optsize } + diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-noinline.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-noinline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-noinline.ll @@ -0,0 +1,25 @@ +; RUN: opt -S -mergesimilarfunc -mergesimilarfunc-level=all < %s | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +declare void @stuff() + +; CHECK-LABEL: @f0( +define void @f0(i64 %p0) { +entry: + call void @stuff() + call void @stuff() + call void @stuff() + ret void +} + +; CHECK-LABEL: @f1( +; CHECK: call void @f0{{.*}} #[[ATTR:[0-9]+]] +; CHECK: attributes #[[ATTR]] = { {{.*}}noinline +define void @f1(i64 %p0) { +entry: + call void @stuff() + call void @stuff() + call void @stuff() + ret void +} + diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-ret.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-ret.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-ret.ll @@ -0,0 +1,207 @@ +; Check that the ret instructions are merged correctly. A bug caused +; an incorrect merge and a verifier failure for this input. 
+; +; RUN: opt -S -mergesimilarfunc < %s | FileCheck %s +; +; CHECK-LABEL: define internal %0* @LLVMGetReturnType__merged +; CHECK: phi %0* [ +; CHECK-NEXT: ret %0* +; CHECK-NEXT: } +; + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" + +%0 = type opaque +%1 = type { %2*, i32, i32, %1** } +%2 = type { %3* } +%3 = type opaque +%4 = type { %1 } +%5 = type opaque +%6 = type { i32 (...)**, %1*, %7*, i8, i8, i16, i32 } +%7 = type { %6*, %7*, %8 } +%8 = type { i32 } +%9 = type { %10, %1*, i32, %12, i32, i8, %18* } +%10 = type { %11 } +%11 = type { %6 } +%12 = type { %13 } +%13 = type { %14 } +%14 = type { %15 } +%15 = type { %16 } +%16 = type { %17 } +%17 = type { i32, i32, i8* } +%18 = type <{ %2*, %19, %30, %70, %79, %87, %12, %93*, %94, %98, %12, %12, %12, i8*, %102, i8, [3 x i8] }> +%19 = type { %20 } +%20 = type { %21, %26* } +%21 = type { %22 } +%22 = type { %23 } +%23 = type { %24 } +%24 = type { %25, %26* } +%25 = type { %26* } +%26 = type <{ %27, [3 x i8], %24, i8, [3 x i8] }> +%27 = type <{ %9, %12, %28*, %12, %12, i8 }> +%28 = type <{ %29*, i8, [3 x i8] }> +%29 = type opaque +%30 = type { %31 } +%31 = type { %32, %37* } +%32 = type { %33 } +%33 = type { %34 } +%34 = type { %35 } +%35 = type { %36, %37* } +%36 = type { %37* } +%37 = type { %27, %35, %38, %60, %93*, %68, %4* } +%38 = type { %39 } +%39 = type { %40, %44* } +%40 = type { %41 } +%41 = type { %42 } +%42 = type { %43 } +%43 = type { %44* } +%44 = type { %6, %45, i32, %47, %37* } +%45 = type { %46 } +%46 = type { %43, %44* } +%47 = type { %48 } +%48 = type { %49, %53* } +%49 = type { %50 } +%50 = type { %51 } +%51 = type { %52 } +%52 = type { %53* } +%53 = type { %11, %54, %44*, %56 } +%54 = type { %55 } +%55 = type { %52, %53* } +%56 = type { %57 } +%57 = type { %58 } +%58 = type { %59* } +%59 = type { i8, i8, i16, i32 } +%60 = type { %61 } +%61 = type { %62, %66* } +%62 = type { %63 } +%63 = type { %64 } +%64 = type { %65 } +%65 = type { %66* } +%66 = type { %6, %67, %37* } +%67 = type { %65, %66* } +%68 = type { %69* } +%69 = type opaque +%70 = type { %71 } +%71 = type { %72, %77* } +%72 = type { %73 } +%73 = type { %74 } +%74 = type { %75 } +%75 = type { %76, %77* } +%76 = type { %77* } +%77 = type { %78, %75 } +%78 = type { %9 } +%79 = type { %80 } +%80 = type { %81, %86* } +%81 = type { %82 } +%82 = type { %83 } +%83 = type { %84 } +%84 = type { %85, %86* } +%85 = type { %86* } +%86 = type { %78, %84 } +%87 = type { %88 } +%88 = type { %89, %92* } +%89 = type { %90 } +%90 = type { %91, %92* } +%91 = type { %92* } +%92 = type { %90, %12, %18*, i8* } +%93 = type opaque +%94 = type <{ %95, %97, [3 x i8] }> +%95 = type { %96**, i32, i32, i32, i32 } +%96 = type { i32 } +%97 = type { i8 } +%98 = type { %99 } +%99 = type { %100 } +%100 = type { %101* } +%101 = type opaque +%102 = type { i8, i32, i8, [3 x i8], %103, %111, %12, %118, i8* } +%103 = type { %104, %110 } +%104 = type { %105 } +%105 = type { %106 } +%106 = type <{ %107, %108 }> +%107 = type { i8*, i8*, i8* } +%108 = type { %109 } +%109 = type { [1 x i8] } +%110 = type { [7 x %108] } +%111 = type { %112, %117 } +%112 = type { %113 } +%113 = type { %114 } +%114 = type { %107, %115 } +%115 = type { %116 } +%116 = type { [8 x i8] } +%117 = type { [15 x %115] } +%118 = type { %119, %124 } +%119 = type { %120 } +%120 = type { %121 } +%121 = type { %107, %122 } +%122 = type { %123 } +%123 = type { [16 x i8] } +%124 = type { [7 x %122] } + +; Function 
Attrs: argmemonly nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: optsize +define %0* @LLVMGetReturnType(%0*) #1 { + %2 = alloca %1*, align 4 + %3 = bitcast %1** %2 to i8* + call void @llvm.lifetime.start(i64 4, i8* %3) + %4 = bitcast %1** %2 to %0** + store %0* %0, %0** %4, align 4 + %5 = call zeroext i1 @_ZN4llvm13isa_impl_wrapINS_12FunctionTypeEKPNS_4TypeEPKS2_E4doitERS4_(%1** nonnull dereferenceable(4) %2) #3 + %6 = bitcast %0* %0 to %4* + br i1 %5, label %8, label %7 + +;