diff --git a/clang/unittests/Tooling/Syntax/TreeTest.cpp b/clang/unittests/Tooling/Syntax/TreeTest.cpp --- a/clang/unittests/Tooling/Syntax/TreeTest.cpp +++ b/clang/unittests/Tooling/Syntax/TreeTest.cpp @@ -121,6 +121,16 @@ return Root; } + void expectTreeDumpEqual(StringRef code, StringRef tree) { + SCOPED_TRACE(code); + + auto *Root = buildTree(code); + std::string Expected = tree.trim().str(); + std::string Actual = + std::string(llvm::StringRef(Root->dump(*Arena)).trim()); + EXPECT_EQ(Expected, Actual) << "the resulting dump is:\n" << Actual; + } + // Adds a file to the test VFS. void addFile(llvm::StringRef Path, llvm::StringRef Contents) { if (!FS->addFile(Path, time_t(), @@ -164,14 +174,13 @@ std::unique_ptr Arena; }; -TEST_F(SyntaxTreeTest, Basic) { - std::pair Cases[] = { - { - R"cpp( +TEST_F(SyntaxTreeTest, Simple) { + expectTreeDumpEqual( + R"cpp( int main() {} void foo() {} )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-int @@ -193,16 +202,18 @@ `-CompoundStatement |-{ `-} -)txt"}, - // if. - { - R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, If) { + expectTreeDumpEqual( + R"cpp( int main() { if (true) {} if (true) {} else if (false) {} } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-int @@ -242,14 +253,17 @@ | |-{ | `-} `-} - )txt"}, - // for. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, For) { + expectTreeDumpEqual( + R"cpp( void test() { for (;;) {} } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -270,10 +284,18 @@ | |-{ | `-} `-} - )txt"}, - // declaration statement. - {"void test() { int a = 10; }", - R"txt( + )txt"); +} + +TEST_F(SyntaxTreeTest, RangeBasedFor) { + expectTreeDumpEqual( + R"cpp( +void test() { + int a[3]; + for (int x : a) ; +} + )cpp", + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -289,13 +311,32 @@ | | |-int | | `-SimpleDeclarator | | |-a - | | |-= - | | `-UnknownExpression - | | `-10 + | | `-ArraySubscript + | | |-[ + | | |-UnknownExpression + | | | `-3 + | | `-] | `-; + |-RangeBasedForStatement + | |-for + | |-( + | |-SimpleDeclaration + | | |-int + | | |-SimpleDeclarator + | | | `-x + | | `-: + | |-UnknownExpression + | | `-a + | |-) + | `-EmptyStatement + | `-; `-} -)txt"}, - {"void test() { ; }", R"txt( + )txt"); +} + +TEST_F(SyntaxTreeTest, DeclarationStatement) { + expectTreeDumpEqual("void test() { int a = 10; }", + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -306,12 +347,22 @@ | `-) `-CompoundStatement |-{ - |-EmptyStatement + |-DeclarationStatement + | |-SimpleDeclaration + | | |-int + | | `-SimpleDeclarator + | | |-a + | | |-= + | | `-UnknownExpression + | | `-10 | `-; `-} -)txt"}, - // switch, case and default. - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, Switch) { + expectTreeDumpEqual( + R"cpp( void test() { switch (true) { case 0: @@ -319,7 +370,7 @@ } } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -350,14 +401,17 @@ | | `-; | `-} `-} -)txt"}, - // while. - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, While) { + expectTreeDumpEqual( + R"cpp( void test() { while (true) { continue; break; } } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -384,77 +438,15 @@ | | `-; | `-} `-} -)txt"}, - // return. 
- {R"cpp( -int test() { return 1; } - )cpp", - R"txt( -*: TranslationUnit -`-SimpleDeclaration - |-int - |-SimpleDeclarator - | |-test - | `-ParametersAndQualifiers - | |-( - | `-) - `-CompoundStatement - |-{ - |-ReturnStatement - | |-return - | |-UnknownExpression - | | `-1 - | `-; - `-} -)txt"}, - // Range-based for. - {R"cpp( -void test() { - int a[3]; - for (int x : a) ; +)txt"); } - )cpp", - R"txt( -*: TranslationUnit -`-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-test - | `-ParametersAndQualifiers - | |-( - | `-) - `-CompoundStatement - |-{ - |-DeclarationStatement - | |-SimpleDeclaration - | | |-int - | | `-SimpleDeclarator - | | |-a - | | `-ArraySubscript - | | |-[ - | | |-UnknownExpression - | | | `-3 - | | `-] - | `-; - |-RangeBasedForStatement - | |-for - | |-( - | |-SimpleDeclaration - | | |-int - | | |-SimpleDeclarator - | | | `-x - | | `-: - | |-UnknownExpression - | | `-a - | |-) - | `-EmptyStatement - | `-; - `-} - )txt"}, - // Unhandled statements should end up as 'unknown statement'. - // This example uses a 'label statement', which does not yet have a syntax - // counterpart. - {"void main() { foo: return 100; }", R"txt( + +TEST_F(SyntaxTreeTest, UnhandledStatement) { + // Unhandled statements should end up as 'unknown statement'. + // This example uses a 'label statement', which does not yet have a syntax + // counterpart. + expectTreeDumpEqual("void main() { foo: return 100; }", + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -474,16 +466,20 @@ | | `-100 | `-; `-} -)txt"}, - // expressions should be wrapped in 'ExpressionStatement' when they appear - // in a statement position. - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, Expressions) { + // expressions should be wrapped in 'ExpressionStatement' when they appear + // in a statement position. + expectTreeDumpEqual( + R"cpp( void test() { test(); if (true) test(); else test(); } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -520,12 +516,15 @@ | | `-) | `-; `-} -)txt"}, - // Multiple declarators group into a single SimpleDeclaration. - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, MultipleDeclaratorsGrouping) { + expectTreeDumpEqual( + R"cpp( int *a, b; )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-int @@ -536,11 +535,12 @@ |-SimpleDeclarator | `-b `-; - )txt"}, - {R"cpp( + )txt"); + expectTreeDumpEqual( + R"cpp( typedef int *a, b; )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-typedef @@ -552,15 +552,18 @@ |-SimpleDeclarator | `-b `-; - )txt"}, - // Multiple declarators inside a statement. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, MultipleDeclaratorsInsideStatement) { + expectTreeDumpEqual( + R"cpp( void foo() { int *a, b; typedef int *ta, tb; } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -593,15 +596,19 @@ | | `-tb | `-; `-} - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, Namespaces) { + expectTreeDumpEqual( + R"cpp( namespace a { namespace b {} } namespace a::b {} namespace {} namespace foo = a; )cpp", - R"txt( + R"txt( *: TranslationUnit |-NamespaceDefinition | |-namespace @@ -630,9 +637,62 @@ |-= |-a `-; -)txt"}, - // Free-standing classes, must live inside a SimpleDeclaration. 
- {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, UsingDirective) { + expectTreeDumpEqual( + R"cpp( +namespace ns {} +using namespace ::ns; + )cpp", + R"txt( +*: TranslationUnit +|-NamespaceDefinition +| |-namespace +| |-ns +| |-{ +| `-} +`-UsingNamespaceDirective + |-using + |-namespace + |-:: + |-ns + `-; + )txt"); +} + +TEST_F(SyntaxTreeTest, UsingDeclaration) { + expectTreeDumpEqual( + R"cpp( +namespace ns { int a; } +using ns::a; + )cpp", + R"txt( +*: TranslationUnit +|-NamespaceDefinition +| |-namespace +| |-ns +| |-{ +| |-SimpleDeclaration +| | |-int +| | |-SimpleDeclarator +| | | `-a +| | `-; +| `-} +`-UsingDeclaration + |-using + |-ns + |-:: + |-a + `-; + )txt"); +} + +TEST_F(SyntaxTreeTest, FreeStandingClasses) { + // Free-standing classes, must live inside a SimpleDeclaration. + expectTreeDumpEqual( + R"cpp( sturct X; struct X {}; @@ -641,7 +701,7 @@ struct {} *a1; )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-sturct @@ -677,13 +737,17 @@ | |-* | `-a1 `-; -)txt"}, - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, Templates) { + expectTreeDumpEqual( + R"cpp( template struct cls {}; template int var = 10; template int fun() {} )cpp", - R"txt( + R"txt( *: TranslationUnit |-TemplateDeclaration | |-template @@ -730,15 +794,19 @@ `-CompoundStatement |-{ `-} -)txt"}, - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, NestedTemplates) { + expectTreeDumpEqual( + R"cpp( template struct X { template U foo(); }; )cpp", - R"txt( + R"txt( *: TranslationUnit `-TemplateDeclaration |-template @@ -768,85 +836,16 @@ | `-; |-} `-; -)txt"}, - {R"cpp( -template struct X {}; -template struct X {}; -template <> struct X {}; +)txt"); +} -template struct X; -extern template struct X; -)cpp", - R"txt( -*: TranslationUnit -|-TemplateDeclaration -| |-template -| |-< -| |-UnknownDeclaration -| | |-class -| | `-T -| |-> -| `-SimpleDeclaration -| |-struct -| |-X -| |-{ -| |-} -| `-; -|-TemplateDeclaration -| |-template -| |-< -| |-UnknownDeclaration -| | |-class -| | `-T -| |-> -| `-SimpleDeclaration -| |-struct -| |-X -| |-< -| |-T -| |-* -| |-> -| |-{ -| |-} -| `-; -|-TemplateDeclaration -| |-template -| |-< -| |-> -| `-SimpleDeclaration -| |-struct -| |-X -| |-< -| |-int -| |-> -| |-{ -| |-} -| `-; -|-ExplicitTemplateInstantiation -| |-template -| `-SimpleDeclaration -| |-struct -| |-X -| |-< -| |-double -| |-> -| `-; -`-ExplicitTemplateInstantiation - |-extern - |-template - `-SimpleDeclaration - |-struct - |-X - |-< - |-float - |-> - `-; -)txt"}, - {R"cpp( +TEST_F(SyntaxTreeTest, Templates2) { + expectTreeDumpEqual( + R"cpp( template struct X { struct Y; }; template struct X::Y {}; )cpp", - R"txt( + R"txt( *: TranslationUnit |-TemplateDeclaration | |-template @@ -883,55 +882,18 @@ |-{ |-} `-; - )txt"}, - {R"cpp( -namespace ns {} -using namespace ::ns; - )cpp", - R"txt( -*: TranslationUnit -|-NamespaceDefinition -| |-namespace -| |-ns -| |-{ -| `-} -`-UsingNamespaceDirective - |-using - |-namespace - |-:: - |-ns - `-; - )txt"}, - {R"cpp( -namespace ns { int a; } -using ns::a; - )cpp", - R"txt( -*: TranslationUnit -|-NamespaceDefinition -| |-namespace -| |-ns -| |-{ -| |-SimpleDeclaration -| | |-int -| | |-SimpleDeclarator -| | | `-a -| | `-; -| `-} -`-UsingDeclaration - |-using - |-ns - |-:: - |-a - `-; - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, TemplatesUsingUsing) { + expectTreeDumpEqual( + R"cpp( template struct X { using T::foo; using typename T::bar; }; )cpp", - R"txt( + R"txt( *: TranslationUnit `-TemplateDeclaration |-template @@ -959,11 +921,92 @@ | `-; |-} `-; - 
)txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ExplicitTemplateInstantations) { + expectTreeDumpEqual( + R"cpp( +template struct X {}; +template struct X {}; +template <> struct X {}; + +template struct X; +extern template struct X; +)cpp", + R"txt( +*: TranslationUnit +|-TemplateDeclaration +| |-template +| |-< +| |-UnknownDeclaration +| | |-class +| | `-T +| |-> +| `-SimpleDeclaration +| |-struct +| |-X +| |-{ +| |-} +| `-; +|-TemplateDeclaration +| |-template +| |-< +| |-UnknownDeclaration +| | |-class +| | `-T +| |-> +| `-SimpleDeclaration +| |-struct +| |-X +| |-< +| |-T +| |-* +| |-> +| |-{ +| |-} +| `-; +|-TemplateDeclaration +| |-template +| |-< +| |-> +| `-SimpleDeclaration +| |-struct +| |-X +| |-< +| |-int +| |-> +| |-{ +| |-} +| `-; +|-ExplicitTemplateInstantiation +| |-template +| `-SimpleDeclaration +| |-struct +| |-X +| |-< +| |-double +| |-> +| `-; +`-ExplicitTemplateInstantiation + |-extern + |-template + `-SimpleDeclaration + |-struct + |-X + |-< + |-float + |-> + `-; +)txt"); +} + +TEST_F(SyntaxTreeTest, UsingType) { + expectTreeDumpEqual( + R"cpp( using type = int; )cpp", - R"txt( + R"txt( *: TranslationUnit `-TypeAliasDeclaration |-using @@ -971,20 +1014,28 @@ |-= |-int `-; - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, EmptyDeclaration) { + expectTreeDumpEqual( + R"cpp( ; )cpp", - R"txt( + R"txt( *: TranslationUnit `-EmptyDeclaration `-; - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, StaticAssert) { + expectTreeDumpEqual( + R"cpp( static_assert(true, "message"); static_assert(true); )cpp", - R"txt( + R"txt( *: TranslationUnit |-StaticAssertDeclaration | |-static_assert @@ -1003,12 +1054,16 @@ | `-true |-) `-; - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ExternC) { + expectTreeDumpEqual( + R"cpp( extern "C" int a; extern "C" { int b; int c; } )cpp", - R"txt( + R"txt( *: TranslationUnit |-LinkageSpecificationDeclaration | |-extern @@ -1033,15 +1088,19 @@ | | `-c | `-; `-} - )txt"}, - // Some nodes are non-modifiable, they are marked with 'I:'. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, NonModifiableNodes) { + // Some nodes are non-modifiable, they are marked with 'I:'. + expectTreeDumpEqual( + R"cpp( #define HALF_IF if (1+ #define HALF_IF_2 1) {} void test() { HALF_IF HALF_IF_2 else {} })cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -1068,9 +1127,10 @@ | |-{ | `-} `-} - )txt"}, - // All nodes can be mutated. - {R"cpp( + )txt"); + // All nodes can be mutated. + expectTreeDumpEqual( + R"cpp( #define OPEN { #define CLOSE } @@ -1084,7 +1144,7 @@ } } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -1110,15 +1170,18 @@ | | `-; | `-} `-} - )txt"}, - // Array subscripts in declarators. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ArraySubscriptsInDeclarators) { + expectTreeDumpEqual( + R"cpp( int a[10]; int b[1][2][3]; int c[] = {1,2,3}; void f(int xs[static 10]); )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-int @@ -1185,9 +1248,12 @@ | | `-] | `-) `-; - )txt"}, - // Parameter lists in declarators. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ParameterListsInDeclarators) { + expectTreeDumpEqual( + R"cpp( int a() const; int b() volatile; int c() &; @@ -1202,7 +1268,7 @@ int&& f ); )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-int @@ -1301,14 +1367,17 @@ | | `-f | `-) `-; - )txt"}, - // Trailing const qualifier. 
- {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, TrailingConst) { + expectTreeDumpEqual( + R"cpp( struct X { int foo() const; } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-struct @@ -1324,12 +1393,15 @@ | | `-const | `-; `-} - )txt"}, - // Trailing return type in parameter lists. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, TrailingReturn) { + expectTreeDumpEqual( + R"cpp( auto foo() -> int; )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-auto @@ -1342,14 +1414,17 @@ | |--> | `-int `-; - )txt"}, - // Exception specification in parameter lists. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ExceptionSpecification) { + expectTreeDumpEqual( + R"cpp( int a() noexcept; int b() noexcept(true); int c() throw(); )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-int @@ -1384,15 +1459,18 @@ | |-( | `-) `-; - )txt"}, - // Declarators in parentheses. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, DeclaratorsInParentheses) { + expectTreeDumpEqual( + R"cpp( int (a); int *(b); int (*c)(int); int *(d)(int); )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-int @@ -1439,15 +1517,18 @@ | | `-int | `-) `-; - )txt"}, - // CV qualifiers. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ConstVolatileQualifiers) { + expectTreeDumpEqual( + R"cpp( const int west = -1; int const east = 1; const int const universal = 0; const int const *const *volatile b; )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-const @@ -1489,12 +1570,15 @@ | |-volatile | `-b `-; - )txt"}, - // Ranges of declarators with trailing return types. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, RangesOfDeclaratorsWithTrailingReturnTypes) { + expectTreeDumpEqual( + R"cpp( auto foo() -> auto(*)(int) -> double*; )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-auto @@ -1522,14 +1606,17 @@ | `-SimpleDeclarator | `-* `-; - )txt"}, - // Member pointers. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, MemberPointers) { + expectTreeDumpEqual( + R"cpp( struct X {}; int X::* a; const int X::* b; )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-struct @@ -1556,12 +1643,15 @@ | | `-* | `-b `-; - )txt"}, - // All-in-one tests. 
- {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ComplexDeclarator) { + expectTreeDumpEqual( + R"cpp( void x(char a, short (*b)(int)); )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -1589,11 +1679,15 @@ | | `-) | `-) `-; - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ComplexDeclarator2) { + expectTreeDumpEqual( + R"cpp( void x(char a, short (*b)(int), long (**c)(long long)); )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -1637,18 +1731,7 @@ | | `-) | `-) `-; - )txt"}, - }; - - for (const auto &T : Cases) { - SCOPED_TRACE(T.first); - - auto *Root = buildTree(T.first); - std::string Expected = llvm::StringRef(T.second).trim().str(); - std::string Actual = - std::string(llvm::StringRef(Root->dump(*Arena)).trim()); - EXPECT_EQ(Expected, Actual) << "the resulting dump is:\n" << Actual; - } + )txt"); } TEST_F(SyntaxTreeTest, Mutations) { diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -1108,6 +1108,85 @@ return CallGraphRoot; } + bool hasSimilarFunction(unsigned id) { return SimilarFunctions.count(id); } + + void addToSimilarFunctions(unsigned id, GlobalValue::GUID GUID) { + if (id == 0) + return; // invalid id. + if (DuplicateFunctions.count(GUID)) + return; + ValueInfo VI = getValueInfo(GUID); + if (!VI) + return; + // assert(VI.getSummaryList().size() == 1); + if (VI.getSummaryList().size() != 1) + return; + GlobalValueSummary *S = VI.getSummaryList()[0].get(); + if (!isa(S)) + return; + assert(isa(S) && "Not a function summary!"); + if (FunctionSimilarityHashes.count(GUID)) { + // Erase the GUID having multiple visits in the ModuleSummaryIndex. + FunctionSimilarityHashes.erase(GUID); + DuplicateFunctions.insert(GUID); + return; + } + + FunctionSimilarityHashes[GUID] = id; + } + + void populateReverseSimilarityHashMap() { + for (auto &p : FunctionSimilarityHashes) + SimilarFunctions[p.second].push_back(p.first); + } + + void removeSingleEntriesFromSimHashMaps() { + // Iterate over the hash to remove entries with no duplicates. 
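+    // A bucket that maps to a single GUID has no merge partner, so drop that
+    // GUID from FunctionSimilarityHashes and the bucket from SimilarFunctions.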
+ for (auto I = SimilarFunctions.begin(), E = SimilarFunctions.end(); + I != E;) { + auto Next = std::next(I); + assert(I->second.size() && "Empty Entry!"); + if (I->second.size() == 1) { + FunctionSimilarityHashes.erase(I->second[0]); + SimilarFunctions.erase(I); + } + I = Next; + } + } + + std::map> &getSimilarFunctions() { + return SimilarFunctions; + } + + const std::map> & + getSimilarFunctions() const { + return SimilarFunctions; + } + + unsigned getSimilarityHash(GlobalValue::GUID ID) const { + return FunctionSimilarityHashes.find(ID)->second; + } + + std::map &getSimilarFunctionsHash() { + return FunctionSimilarityHashes; + } + + const std::map &getSimilarFunctionsHash() const { + return FunctionSimilarityHashes; + } + + void addToHostSimilarFunction(GlobalValue::GUID ID) { + HostSimilarFunction.insert(ID); + } + + std::set &getHostSimilarFunction() { + return HostSimilarFunction; + } + + const std::set &getHostSimilarFunction() const { + return HostSimilarFunction; + } + bool withGlobalValueDeadStripping() const { return WithGlobalValueDeadStripping; } diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -292,6 +292,8 @@ void initializeMemorySSAWrapperPassPass(PassRegistry&); void initializeMemorySanitizerLegacyPassPass(PassRegistry&); void initializeMergeFunctionsLegacyPassPass(PassRegistry&); +void initializeMergeFunctionsPass(PassRegistry&); +void initializeMergeSimilarFunctionsPass(PassRegistry&); void initializeMergeICmpsLegacyPassPass(PassRegistry &); void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&); void initializeMetaRenamerPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -197,6 +197,7 @@ (void) llvm::createPostOrderFunctionAttrsLegacyPass(); (void) llvm::createReversePostOrderFunctionAttrsPass(); (void) llvm::createMergeFunctionsPass(); + (void) llvm::createMergeSimilarFunctionsPass(); (void) llvm::createMergeICmpsLegacyPass(); (void) llvm::createExpandMemCmpPass(); std::string buf; diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h --- a/llvm/include/llvm/Transforms/IPO.h +++ b/llvm/include/llvm/Transforms/IPO.h @@ -215,6 +215,13 @@ /// function(s). ModulePass *createHotColdSplittingPass(); +//===----------------------------------------------------------------------===// +/// createMergeSimilarFunctionsPass - This pass discovers similar functions and +/// merges them. +/// +ModulePass * +createMergeSimilarFunctionsPass(const ModuleSummaryIndex *S = nullptr); + //===----------------------------------------------------------------------===// /// createPartialInliningPass - This pass inlines parts of functions. /// diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h --- a/llvm/include/llvm/Transforms/Utils/Cloning.h +++ b/llvm/include/llvm/Transforms/Utils/Cloning.h @@ -126,6 +126,20 @@ Function *CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo = nullptr); +/// Used to control @fn CloneFunctionInto. +enum class CloneType { + InvalidCloneType, + /// Cloning will result in module level changes. + ModuleLevelChanges, + /// !ModuleLevelChanges, When no module level changes will be made to the + /// cloned function. 
+ NoModuleLevelChanges, + /// Cloning will be used for extracting functions by passes like function + /// merging, it does not require module level changes but debug info needs + /// special treatment like: DISubprogram is not cloned. + ExtractingFunctions, +}; + /// Clone OldFunc into NewFunc, transforming the old arguments into references /// to VMap values. Note that if NewFunc already has basic blocks, the ones /// cloned into it will be added to the end of the function. This function @@ -136,7 +150,7 @@ /// mappings. /// void CloneFunctionInto(Function *NewFunc, const Function *OldFunc, - ValueToValueMapTy &VMap, bool ModuleLevelChanges, + ValueToValueMapTy &VMap, CloneType CT, SmallVectorImpl &Returns, const char *NameSuffix = "", ClonedCodeInfo *CodeInfo = nullptr, diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -76,6 +76,89 @@ cl::value_desc("filename"), cl::desc("File to emit dot graph of new summary into.")); +cl::opt UseGlobalAliases( + "mergesimilarfunc-global-aliases", cl::Hidden, cl::init(false), + cl::desc("Enable writing alias by enabling global aliases")); + +cl::opt MergeMinInsts( + "mergesimilarfunc-min-insts", cl::Hidden, cl::init(4), + cl::desc("Min instructions required to even consider single block fns")); + +// Minimize the name pollution caused by the enum values. +namespace Opt { +cl::opt MergeLevel( + "mergesimilarfunc-level", cl::Hidden, cl::ZeroOrMore, + cl::desc("Level of function merging:"), cl::init(size), + cl::values(clEnumVal(none, "function merging disabled"), + clEnumVal(size, "only try to merge functions that are optimized " + "for size"), + clEnumVal(all, "attempt to merge all similar functions"))); +} + +namespace llvm { + +static const char *MERGED_SUFFIX = "__merged"; + +/// Returns the type id for a type to be hashed. We turn pointer types into +/// integers here because the actual compare logic below considers pointers and +/// integers of the same size as equal. +static Type::TypeID getTypeIDForHash(Type *Ty) { + if (Ty->isPointerTy()) + return Type::IntegerTyID; + return Ty->getTypeID(); +} + +bool isAliasCapable(const Function* G) { + return + UseGlobalAliases && G->hasGlobalUnnamedAddr() + && (G->hasExternalLinkage() || G->hasLocalLinkage() || G->hasWeakLinkage()); +} + +bool isComparisonCandidate(const Function *F) { + if (Opt::MergeLevel == Opt::size) { + // Only consider functions that are to be optimized for size. + // By default, that is all functions at -Os/-Oz and nothing at -O2. + bool Os = F->getAttributes(). + hasAttribute(AttributeList::FunctionIndex, Attribute::OptimizeForSize); + bool Oz = F->getAttributes(). 
+ hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); + if (!Os && !Oz) + return false; + } + + // Ignore declarations and tiny functions - no point in merging those + if (F->isDeclaration()) return false; + if (F->getName().endswith(MERGED_SUFFIX)) return false; + if (F->hasAvailableExternallyLinkage()) return false; + if (F->hasFnAttribute(Attribute::AlwaysInline)) return false; + if (F->size() == 1 && F->begin()->size() < MergeMinInsts) + return isAliasCapable(F); + + return true; +} + +unsigned profileFunction(const Function *F) { + FunctionType *FTy = F->getFunctionType(); + if (!isComparisonCandidate(F)) + return 0; + if (F->hasGC() || FTy->isVarArg() || !F->hasExactDefinition()) + return 0; + FoldingSetNodeID ID; + ID.AddInteger(F->size()); + ID.AddInteger(F->getCallingConv()); + // Add pure attribute, has side-effects attribute. + ID.AddBoolean(F->hasFnAttribute(Attribute::NoUnwind)); + ID.AddBoolean(F->hasFnAttribute(Attribute::NoReturn)); + //ID.AddBoolean(F->hasGC()); + //ID.AddBoolean(F->isInterposable()); + //ID.AddBoolean(FTy->isVarArg()); + ID.AddInteger(getTypeIDForHash(FTy->getReturnType())); + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) + ID.AddInteger(getTypeIDForHash(FTy->getParamType(i))); + return ID.ComputeHash(); +} +} + // Walk through the operands of a given User via worklist iteration and populate // the set of GlobalValue references encountered. Invoked either on an // Instruction or a GlobalVariable (which walks its initializer). @@ -467,8 +550,7 @@ F.hasFnAttribute(Attribute::NoRecurse), F.returnDoesNotAlias(), // FIXME: refactor this to use the same code that inliner is using. // Don't try to import functions with noinline attribute. - F.getAttributes().hasFnAttribute(Attribute::NoInline), - F.hasFnAttribute(Attribute::AlwaysInline)}; + F.getAttributes().hasFnAttribute(Attribute::NoInline)}; auto FuncSummary = std::make_unique( Flags, NumInsts, FunFlags, /*EntryCount=*/0, std::move(Refs), CallGraphEdges.takeVector(), TypeTests.takeVector(), diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -5785,11 +5785,9 @@ } const uint64_t Version = Record[0]; const bool IsOldProfileFormat = Version == 1; - if (Version < 1 || Version > ModuleSummaryIndex::BitcodeSummaryVersion) + if (Version < 1 || Version > 7) return error("Invalid summary version " + Twine(Version) + - ". Version should be in the range [1-" + - Twine(ModuleSummaryIndex::BitcodeSummaryVersion) + - "]."); + ". 
Version should be in the range [1-7]."); Record.clear(); // Keep around the last seen summary to be used when we see an optional @@ -5904,6 +5902,11 @@ std::move(PendingTypeCheckedLoadVCalls), std::move(PendingTypeTestAssumeConstVCalls), std::move(PendingTypeCheckedLoadConstVCalls)); + PendingTypeTests.clear(); + PendingTypeTestAssumeVCalls.clear(); + PendingTypeCheckedLoadVCalls.clear(); + PendingTypeTestAssumeConstVCalls.clear(); + PendingTypeCheckedLoadConstVCalls.clear(); auto VIAndOriginalGUID = getValueInfoFromValueId(ValueID); FS->setModulePath(getThisModule()->first()); FS->setOriginalName(VIAndOriginalGUID.second); @@ -6046,6 +6049,11 @@ std::move(PendingTypeCheckedLoadVCalls), std::move(PendingTypeTestAssumeConstVCalls), std::move(PendingTypeCheckedLoadConstVCalls)); + PendingTypeTests.clear(); + PendingTypeTestAssumeVCalls.clear(); + PendingTypeCheckedLoadVCalls.clear(); + PendingTypeTestAssumeConstVCalls.clear(); + PendingTypeCheckedLoadConstVCalls.clear(); LastSeenSummary = FS.get(); LastSeenGUID = VI.getGUID(); FS->setModulePath(ModuleIdMap[ModuleId]); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -3737,6 +3737,11 @@ NameVals.clear(); } +// Current version for the summary. +// This is bumped whenever we introduce changes in the way some record are +// interpreted, like flags for instance. +static const uint64_t INDEX_VERSION = 7; + /// Emit the per-module summary section alongside the rest of /// the module's bitcode. void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() { diff --git a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp --- a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -318,7 +318,7 @@ "modules."); SmallVector Returns; // Ignore returns cloned. - CloneFunctionInto(NewF, &OrigF, VMap, /*ModuleLevelChanges=*/true, Returns, + CloneFunctionInto(NewF, &OrigF, VMap, CloneType::ModuleLevelChanges, Returns, "", nullptr, nullptr, Materializer); OrigF.deleteBody(); } diff --git a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp --- a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp +++ b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp @@ -316,7 +316,7 @@ } } SmallVector Returns; - CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns); + CloneFunctionInto(NewF, F, VMap, CloneType::NoModuleLevelChanges, Returns); // Build new MDNode. SmallVector KernelMDArgs; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1141,8 +1141,8 @@ .addReg(FrameReg); } else { if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) { - Register ScaledReg = - RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0); + // Reuse ResultReg in intermediate step. 
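+        // Avoids scavenging an extra VGPR for the intermediate value.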
+ Register ScaledReg = ResultReg; BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -216,11 +216,16 @@ // VGPR registers foreach Index = 0-255 in { + // Set a cost value for vgprs other than the argument registers (v0-v31). + // The ratio of index/allocation_granularity is taken as the cost value. + // Considered the allocation granularity as 4 here. + let CostPerUse=!if(!gt(Index, 31), !srl(Index, 2), 0) in { def VGPR#Index : SIReg <"v"#Index, Index>, DwarfRegNum<[!add(Index, 2560)]> { let HWEncoding{8} = 1; } + } } // AccVGPR registers diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -667,7 +667,7 @@ auto savedLinkage = NewF->getLinkage(); NewF->setLinkage(llvm::GlobalValue::ExternalLinkage); - CloneFunctionInto(NewF, &OrigF, VMap, /*ModuleLevelChanges=*/true, Returns); + CloneFunctionInto(NewF, &OrigF, VMap, CloneType::ModuleLevelChanges, Returns); NewF->setLinkage(savedLinkage); NewF->setVisibility(savedVisibility); diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt --- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -27,6 +27,7 @@ LowerTypeTests.cpp MergeFunctions.cpp OpenMPOpt.cpp + MergeSimilarFunctions.cpp PartialInlining.cpp PassManagerBuilder.cpp PruneEH.cpp diff --git a/llvm/lib/Transforms/IPO/IPO.cpp b/llvm/lib/Transforms/IPO/IPO.cpp --- a/llvm/lib/Transforms/IPO/IPO.cpp +++ b/llvm/lib/Transforms/IPO/IPO.cpp @@ -45,6 +45,8 @@ initializeSingleLoopExtractorPass(Registry); initializeLowerTypeTestsPass(Registry); initializeMergeFunctionsLegacyPassPass(Registry); + initializeMergeFunctionsPass(Registry); + initializeMergeSimilarFunctionsPass(Registry); initializePartialInlinerLegacyPassPass(Registry); initializeAttributorLegacyPassPass(Registry); initializeAttributorCGSCCLegacyPassPass(Registry); diff --git a/llvm/lib/Transforms/IPO/MergeSimilarFunctions.cpp b/llvm/lib/Transforms/IPO/MergeSimilarFunctions.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/IPO/MergeSimilarFunctions.cpp @@ -0,0 +1,2197 @@ +//===- MergeSimilarFunctions.cpp - Merge similar functions ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass merges both equivalent and similar functions to reduce code size. +// +// For a more detailed explanation of the approach, see: +// Edler von Koch et al. "Exploiting Function Similarity for Code Size +// Reduction", LCTES 2014. 
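+//
+// Functions are bucketed by a structural hash; within a bucket, exact matches
+// are merged directly, while near matches are merged through a shared body
+// that switches on the differing instructions.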
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mergesimilarfunc" +#include "llvm/Transforms/IPO.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include +using namespace llvm; + +STATISTIC(NumFunctionsMerged, "Number of functions merged"); +STATISTIC(NumThunksWritten, "Number of thunks generated"); +STATISTIC(NumAliasesWritten, "Number of aliases generated"); +STATISTIC(NumDoubleWeak, "Number of new functions created"); +STATISTIC(NumMultiMerged, "Number of multi-merged functions"); + +STATISTIC(NumSimilarFunctionsMerged, "Number of similar functions merged"); + +static cl::opt MergeMinInsts( + "mergesimilarfunc-min-insts", cl::Hidden, cl::init(4), + cl::desc("Min instructions required to even consider single block fns")); + +static cl::opt MergeDifferingMinInsts( + "mergesimilarfunc-diff-min-insts", cl::Hidden, cl::init(15), + cl::desc("Min instructions required to try merging differing functions")); + +static cl::opt MergeMaxDiffering( + "mergesimilarfunc-max-diff", cl::Hidden, cl::init(8), + cl::desc("Maximum number of differing instructions allowed")); + +static cl::opt MergeMinSimilarity( + "mergesimilarfunc-min-similarity", cl::Hidden, cl::init(70), + cl::desc("Minimum percentage of similar instructions required")); + +static cl::opt OptPrintMerges("mergesimilarfunc-print-merges", cl::Hidden, + cl::init(false)); + +static cl::opt UseGlobalAliases( + "mergesimilarfunc-global-aliases", cl::Hidden, cl::init(false), + cl::desc("Enable writing alias by enabling global aliases")); + +void PrintMerges(const char *Desc, Function *Old, Function *New) { + if (OptPrintMerges) { + dbgs() << "=== [" << Desc << "] replacing " << Old->getName() << " with " + << New->getName() << "\n"; + } +} + +// Minimize the name pollution caused by the enum values. +namespace Opt { +enum MergeLevelEnum { none, size, all }; +static cl::opt MergeLevel( + "mergesimilarfunc-level", cl::Hidden, cl::ZeroOrMore, + cl::desc("Level of function merging:"), cl::init(size), + cl::values(clEnumVal(none, "function merging disabled"), + clEnumVal(size, "only try to merge functions that are optimized " + "for size"), + clEnumVal(all, "attempt to merge all similar functions"))); +} + +static const char *MERGED_SUFFIX = "__merged"; + +/// Returns the type id for a type to be hashed. We turn pointer types into +/// integers here because the actual compare logic below considers pointers and +/// integers of the same size as equal. 
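+/// For example, an i8* parameter and an intptr_t parameter contribute the
+/// same value to the hash, so such functions can land in the same bucket.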
+static Type::TypeID getTypeIDForHash(Type *Ty) { + if (Ty->isPointerTy()) + return Type::IntegerTyID; + return Ty->getTypeID(); +} + +/// Creates a hash-code for the function which is the same for any two +/// functions that will compare equal, without looking at the instructions +/// inside the function. +static unsigned profileFunction(const Function *F) { + FunctionType *FTy = F->getFunctionType(); + + FoldingSetNodeID ID; + ID.AddInteger(F->size()); + ID.AddInteger(F->getCallingConv()); + ID.AddBoolean(F->hasGC()); + ID.AddBoolean(F->isInterposable()); + ID.AddBoolean(FTy->isVarArg()); + ID.AddInteger(getTypeIDForHash(FTy->getReturnType())); + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) + ID.AddInteger(getTypeIDForHash(FTy->getParamType(i))); + return ID.ComputeHash(); +} + + +/// Replace Inst1 by a switch statement that executes Inst1 or one of Inst2s +/// depending on the value of SwitchVal. If a value in Inst2s is NULL, it +/// defaults to executing Inst1. Returns set of terminator instructions of newly +/// created switch blocks in Ret. +/// +/// For instance, the transformation may look as follows: +/// ...Head... +/// Inst1 with all of Insts2s without parents +/// ...Tail... +/// into +/// ...Head... +/// Switch +/// / | \ . +/// (default) (1) (2) +/// Inst1 Inst2s[0] Inst2s[1] +/// Ret[0] Ret[1] Ret[2] +/// \ | / +/// ...Tail... +/// +static void SplitBlockAndInsertSwitch( + Value *SwitchVal, Instruction *Inst1, + SmallVectorImpl &Inst2s, + SmallVectorImpl &Ret) { + // Split block + BasicBlock *Head = Inst1->getParent(); + BasicBlock *Tail = Head->splitBasicBlock(Inst1); + + // Create default block + LLVMContext &C = Head->getContext(); + BasicBlock *DefaultBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); + + // Insert switch instruction at end of Head + Instruction *HeadOldTerm = Head->getTerminator(); + SwitchInst *Switch = SwitchInst::Create(SwitchVal, DefaultBlock, + Inst2s.size()); + ReplaceInstWithInst(HeadOldTerm, Switch); + + // Move instructions into the blocks + if (Inst1->isTerminator()) { + Inst1->removeFromParent(); + DefaultBlock->getInstList().push_back(Inst1); + Ret.push_back(cast(Inst1)); + } else { + Instruction *DefaultTerm = BranchInst::Create(Tail, DefaultBlock); + Inst1->moveBefore(DefaultTerm); + Ret.push_back(DefaultTerm); + } + + for (unsigned InstPos = 0, InstNum = Inst2s.size(); InstPos < InstNum; + ++InstPos) { + Instruction *Inst2 = Inst2s[InstPos]; + if (!Inst2) { + Ret.push_back(NULL); + continue; + } + + BasicBlock *CaseBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); + + // Update the debug information of the merged instruction by marking it as + // 'inlined' at this location. If only Inst1 or Inst2 has debug + // information, we try to do something sensible that won't break the + // verifier. 
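+    // If both have locations, keep Inst2's line and column but mark it as
+    // inlined at Inst1's location; if only Inst1 has one, reuse it; if only
+    // Inst2 has one, drop it.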
+ if (Inst1->getDebugLoc()) { + if (Inst2->getDebugLoc()) { + const DebugLoc &I2Loc = Inst2->getDebugLoc(); + Inst2->setDebugLoc( + DebugLoc::get(I2Loc.getLine(), I2Loc.getCol(), I2Loc.getScope(), + /*InlinedAt*/ Inst1->getDebugLoc().getAsMDNode())); + } else { + Inst2->setDebugLoc(Inst1->getDebugLoc()); + } + } else if (Inst2->getDebugLoc()) { + Inst2->setDebugLoc(DebugLoc()); + } + + if (Inst2->isTerminator()) { + assert(Inst1->isTerminator() && + "Inst1 and Inst2 must both be terminators or non-terminators!"); + CaseBlock->getInstList().push_back(Inst2); + Ret.push_back(cast(Inst2)); + } else { + Instruction *CaseTerm = BranchInst::Create(Tail, CaseBlock); + Inst2->insertBefore(CaseTerm); + Ret.push_back(CaseTerm); + } + + Switch->addCase(ConstantInt::get(cast(SwitchVal->getType()), + InstPos+1), + CaseBlock); + } + + // If Inst1 (and Inst2s) are Terminator Inst's, Tail will be empty and can be + // deleted now. We also need to update PHI nodes to add the additional + // incoming blocks from the SwitchInst. + if (Inst1->isTerminator()) { + for (succ_iterator I = succ_begin(DefaultBlock), E = succ_end(DefaultBlock); + I != E; ++I) { + BasicBlock *Successor = *I; + PHINode *Phi; + + for (BasicBlock::iterator II = Successor->begin(); + (Phi = dyn_cast(II)); ++II) + for (unsigned ValId = 0, ValEnd = Phi->getNumIncomingValues(); + ValId != ValEnd; ++ValId) + if (Phi->getIncomingBlock(ValId) == Tail) { + Phi->setIncomingBlock(ValId, DefaultBlock); + SmallVectorImpl::iterator + SwitchI = Ret.begin(), SwitchE = Ret.end(); + for (++SwitchI; SwitchI != SwitchE; ++SwitchI) { + if (!*SwitchI) + continue; + Phi->addIncoming(Phi->getIncomingValue(ValId), + (*SwitchI)->getParent()); + } + } + } + + Tail->eraseFromParent(); + } +} + +/// Insert function NewF into module, placing it immediately after the +/// existing function PredF. If PredF does not exist, insert at the end. +static void insertFunctionAfter(Function *NewF, Function *PredF) { + Module *M = PredF->getParent(); + Module::FunctionListType &FList = M->getFunctionList(); + + for (Module::FunctionListType::iterator I = FList.begin(), E = FList.end(); + I != E; ++I) { + if (PredF == &*I) { + FList.insertAfter(I, NewF); + return; + } + } + + // Couldn't find PredF, insert at end + FList.push_back(NewF); +} + +/// Create a cast instruction if needed to cast V to type DstType. We treat +/// pointer and integer types of the same bitwidth as equivalent, so this can be +/// used to cast them to each other where needed. The function returns the Value +/// itself if no cast is needed, or a new CastInst instance inserted before +/// InsertBefore. The integer type equivalent to pointers must be passed as +/// IntPtrType (get it from DataLayout). This is guaranteed to generate no-op +/// casts, otherwise it will assert. +static Value *createCastIfNeeded(Value *V, Type *DstType, + Value *InstrOrBB, Type *IntPtrType, const DataLayout *DL) { + if (V->getType() == DstType) + return V; + + BasicBlock *InsertAtEnd = dyn_cast(InstrOrBB); + Instruction *InsertBefore = dyn_cast(InstrOrBB); + BasicBlock *InsertBB = InsertAtEnd ? 
InsertAtEnd : InsertBefore->getParent(); + + CastInst *Result; + Type *OrigType = V->getType(); + + if (OrigType->isStructTy()) { + assert(DstType->isStructTy()); + assert(OrigType->getStructNumElements() == DstType->getStructNumElements()); + + IRBuilder<> Builder(InsertBB); + if (InsertBefore) + Builder.SetInsertPoint(InsertBefore); + Value *Result = UndefValue::get(DstType); + for (unsigned int I = 0, E = OrigType->getStructNumElements(); I < E; ++I) { + Value *ExtractedValue + = Builder.CreateExtractValue(V, ArrayRef(I)); + Value *Element = createCastIfNeeded(ExtractedValue, + DstType->getStructElementType(I), + InstrOrBB, IntPtrType, DL); + Result = + Builder.CreateInsertValue(Result, Element, ArrayRef(I)); + } + return Result; + } + assert(!DstType->isStructTy()); + + if (OrigType->isPointerTy() + && (DstType->isIntegerTy() || DstType->isPointerTy())) { + if (InsertBefore) + Result = CastInst::CreatePointerCast(V, DstType, "", InsertBefore); + else + Result = CastInst::CreatePointerCast(V, DstType, "", InsertAtEnd); + } else if (OrigType->isIntegerTy() && DstType->isPointerTy() + && OrigType == IntPtrType) { + // Int -> Ptr + if (InsertBefore) { + Result = CastInst::Create(CastInst::IntToPtr, V, DstType, "", + InsertBefore); + } else { + Result = CastInst::Create(CastInst::IntToPtr, V, DstType, "", + InsertAtEnd); + } + } else { + llvm_unreachable("Can only cast int -> ptr or ptr -> (ptr or int)"); + } + + assert(cast(Result)->isNoopCast(*DL) && + "Cast is not a no-op cast. Potential loss of precision"); + + return Result; +} + +namespace { + +/// ComparableFunction - A struct that pairs together functions with a +/// DataLayout so that we can keep them together as elements in the DenseSet. +class ComparableFunction { +public: + ComparableFunction() : Func(0), IsNew(false) { } + + ComparableFunction(const ComparableFunction &that) + : Func(that.Func), IsNew(that.IsNew) { + } + + ComparableFunction(Function *Func) : Func(Func), IsNew(true) { } + + ~ComparableFunction() { } + + ComparableFunction &operator=(const ComparableFunction &that) { + Func = that.Func; + IsNew = that.IsNew; + return *this; + } + + Function *getFunc() const { return Func; } + bool isNew() const { return IsNew; } + + // Drops AssertingVH reference to the function. Outside of debug mode, this + // does nothing. + void release() { + assert(Func && + "Attempted to release function twice, or release empty/tombstone!"); + Func = NULL; + } + + void markCompared() { + IsNew = false; + } +private: + AssertingVH Func; + bool IsNew; +}; + +} + +namespace { + +/// FunctionComparator - Compares two functions to determine whether or not +/// they will generate machine code with the same behaviour. DataLayout is +/// used if available. The comparator always fails conservatively (erring on the +/// side of claiming that two functions are different). +class FunctionComparator { +public: + FunctionComparator(const DataLayout *DL, Function *F1, Function *F2) + : isDifferent(false), isNotMergeable(false), + BasicBlockCount(0), InstructionCount(0), DifferingInstructionsCount(0), + F1(F1), F2(F2), SimilarityMetric(0), DL(DL), ID(CurID++) {} + + ~FunctionComparator() {} + + /// Test whether the two functions have equivalent behaviour. Returns true if + /// they are equal or can be merged, false if not. 
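+  /// Populates DifferingInstructions and SelfRefInstructions as a side effect.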
+ bool compare(); + + /// Indicate whether the two functions are an exact match after comparison + bool isExactMatch(); + + /// Indicate whether the two functions candidates for merging after comparison + bool isMergeCandidate(); + + /// Get a similarity metric between the two functions. Higher means more + /// similar. + unsigned getSimilarityMetric() { + if (!SimilarityMetric) + SimilarityMetric = (unsigned)(((float)InstructionCount - + DifferingInstructionsCount)/InstructionCount*10000); + return SimilarityMetric; + } + + Function *getF1() { return F1; } + Function *getF2() { return F2; } + ValueToValueMapTy &getF1toF2Map() { return id_map; } + ValueToValueMapTy &getF2toF1Map() { return seen_values; } + const DataLayout *getDataLayout() { return DL; } + + /// Assign or look up previously assigned numbers for the two values, and + /// return whether the numbers are equal. Numbers are assigned in the order + /// visited. If NoSelfRef is set, F1 and F2 are not assigned to each other + /// (treated as 'equal'). + bool enumerate(const Value *V1, const Value *V2, bool NoSelfRef=false); + + /// Compare two Types, treating all pointer types as equal. + bool isEquivalentType(Type *Ty1, Type *Ty2) const; + + /// Instructions that differ between the two functions (F1's -> F2's inst). + MapVector DifferingInstructions; + + /// Instructions that reference F1/F2 itself (recursive calls etc.) + /// These may need special treatment when merging differing functions. + MapVector SelfRefInstructions; + + /// Return the unique ID for the object. + unsigned getID() { return ID; } + + bool isDifferent; + bool isNotMergeable; + + // Comparison statistics + unsigned BasicBlockCount; + unsigned InstructionCount; + unsigned DifferingInstructionsCount; + +private: + /// Test whether two basic blocks have equivalent behaviour. Returns true if + /// they are equal or can be merged, false if not. PHINodes are not compared + /// in this function, but added to the PHIsFound list for delayed processing. + bool compare(const BasicBlock *BB1, const BasicBlock *BB2, + std::list > *PHIsFound); + + /// Compare pairs of PHI nodes. Returns true if all pairs are equal or can + /// be merged, false if not. + bool comparePHIs( + const std::list > &PHIs); + + /// Compare two Instructions for equivalence, similar to + /// Instruction::isSameOperationAs but with modifications to the type + /// comparison. + bool isEquivalentOperation(const Instruction *I1, + const Instruction *I2) const; + + /// Compare two GEPs for equivalent pointer arithmetic. + bool isEquivalentGEP(const GEPOperator *GEP1, const GEPOperator *GEP2); + bool isEquivalentGEP(const GetElementPtrInst *GEP1, + const GetElementPtrInst *GEP2) { + return isEquivalentGEP(cast(GEP1), cast(GEP2)); + } + + // The two functions undergoing comparison. + Function *F1, *F2; + + unsigned SimilarityMetric; + + const DataLayout *DL; + + ValueToValueMapTy id_map; + ValueToValueMapTy seen_values; + + // Maintain a unique ID for each object. + static unsigned CurID; + unsigned ID; +}; + +} + +unsigned FunctionComparator::CurID = 0; + +// Any two pointers in the same address space are equivalent, intptr_t and +// pointers are equivalent. Otherwise, standard type equivalence rules apply. 
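+// For example, i32* and float* in the same address space compare equal, as do
+// i8* and the target's intptr_t type.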
+bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const { + if (Ty1 == Ty2) + return true; + if (Ty1->getTypeID() != Ty2->getTypeID()) { + LLVMContext &Ctx = Ty1->getContext(); + if (isa(Ty1) && Ty2 == DL->getIntPtrType(Ctx)) return true; + if (isa(Ty2) && Ty1 == DL->getIntPtrType(Ctx)) return true; + return false; + } + + switch (Ty1->getTypeID()) { + default: + llvm_unreachable("Unknown type!"); + // Fall through in Release mode. + case Type::IntegerTyID: + case Type::VectorTyID: + // Ty1 == Ty2 would have returned true earlier. + return false; + + case Type::VoidTyID: + case Type::FloatTyID: + case Type::DoubleTyID: + case Type::X86_FP80TyID: + case Type::FP128TyID: + case Type::PPC_FP128TyID: + case Type::LabelTyID: + case Type::MetadataTyID: + return true; + + case Type::PointerTyID: { + PointerType *PTy1 = cast(Ty1); + PointerType *PTy2 = cast(Ty2); + return PTy1->getAddressSpace() == PTy2->getAddressSpace(); + } + + case Type::StructTyID: { + StructType *STy1 = cast(Ty1); + StructType *STy2 = cast(Ty2); + if (STy1->getNumElements() != STy2->getNumElements()) + return false; + + if (STy1->isPacked() != STy2->isPacked()) + return false; + + for (unsigned i = 0, e = STy1->getNumElements(); i != e; ++i) { + if (!isEquivalentType(STy1->getElementType(i), STy2->getElementType(i))) + return false; + } + return true; + } + + case Type::FunctionTyID: { + FunctionType *FTy1 = cast(Ty1); + FunctionType *FTy2 = cast(Ty2); + if (FTy1->getNumParams() != FTy2->getNumParams() || + FTy1->isVarArg() != FTy2->isVarArg()) + return false; + + if (!isEquivalentType(FTy1->getReturnType(), FTy2->getReturnType())) + return false; + + for (unsigned i = 0, e = FTy1->getNumParams(); i != e; ++i) { + if (!isEquivalentType(FTy1->getParamType(i), FTy2->getParamType(i))) + return false; + } + return true; + } + + case Type::ArrayTyID: { + ArrayType *ATy1 = cast(Ty1); + ArrayType *ATy2 = cast(Ty2); + return ATy1->getNumElements() == ATy2->getNumElements() && + isEquivalentType(ATy1->getElementType(), ATy2->getElementType()); + } + } +} + +// Determine whether the two operations are the same except that pointer-to-A +// and pointer-to-B are equivalent. This should be kept in sync with +// Instruction::isSameOperationAs. +bool FunctionComparator::isEquivalentOperation(const Instruction *I1, + const Instruction *I2) const { + // Differences from Instruction::isSameOperationAs: + // * replace type comparison with calls to isEquivalentType. + // * we test for I->hasSameSubclassOptionalData (nuw/nsw/tail) at the top + // * because of the above, we don't test for the tail bit on calls later on + if (I1->getOpcode() != I2->getOpcode() || + I1->getNumOperands() != I2->getNumOperands() || + !isEquivalentType(I1->getType(), I2->getType()) || + !I1->hasSameSubclassOptionalData(I2)) + return false; + + // We have two instructions of identical opcode and #operands. Check to see + // if all operands are the same type + for (unsigned i = 0, e = I1->getNumOperands(); i != e; ++i) + if (!isEquivalentType(I1->getOperand(i)->getType(), + I2->getOperand(i)->getType())) + return false; + + // Check special state that is a part of some instructions. 
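+  // Loads, stores, allocas, compares, calls, etc. must also agree on their
+  // extra flags (volatility, alignment, ordering, predicates, attributes).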
+ if (const LoadInst *LI = dyn_cast(I1)) { + const LoadInst *LI2 = cast(I2); + return LI->isVolatile() == LI2->isVolatile() && + LI->getAlignment() == LI2->getAlignment() && + LI->getOrdering() == LI2->getOrdering() && + LI->getSyncScopeID() == LI2->getSyncScopeID() && + LI->getMetadata(LLVMContext::MD_range) + == LI2->getMetadata(LLVMContext::MD_range); + } + if (const StoreInst *SI = dyn_cast(I1)) + return SI->isVolatile() == cast(I2)->isVolatile() && + SI->getAlignment() == cast(I2)->getAlignment() && + SI->getOrdering() == cast(I2)->getOrdering() && + SI->getSyncScopeID() == cast(I2)->getSyncScopeID(); + if (const AllocaInst *AI = dyn_cast(I1)) { + if (AI->getArraySize() != cast(I2)->getArraySize() || + AI->getAlignment() != cast(I2)->getAlignment()) + return false; + + // If size is known, I2 can be seen as equivalent to I1 if it allocates + // the same or less memory. + if (DL->getTypeAllocSize(AI->getAllocatedType()) + < DL->getTypeAllocSize(cast(I2)->getAllocatedType())) + return false; + + return true; + } + if (const CmpInst *CI = dyn_cast(I1)) + return CI->getPredicate() == cast(I2)->getPredicate(); + if (const CallInst *CI = dyn_cast(I1)) + return CI->getCallingConv() == cast(I2)->getCallingConv() && + CI->getAttributes() == cast(I2)->getAttributes(); + if (const InvokeInst *CI = dyn_cast(I1)) + return CI->getCallingConv() == cast(I2)->getCallingConv() && + CI->getAttributes() == cast(I2)->getAttributes(); + if (const InsertValueInst *IVI = dyn_cast(I1)) + return IVI->getIndices() == cast(I2)->getIndices(); + if (const ExtractValueInst *EVI = dyn_cast(I1)) + return EVI->getIndices() == cast(I2)->getIndices(); + if (const FenceInst *FI = dyn_cast(I1)) + return FI->getOrdering() == cast(I2)->getOrdering() && + FI->getSyncScopeID() == cast(I2)->getSyncScopeID(); + if (const AtomicCmpXchgInst *CXI = dyn_cast(I1)) { + const AtomicCmpXchgInst *CXI2 = cast(I2); + return CXI->isVolatile() == CXI2->isVolatile() && + CXI->isWeak() == CXI2->isWeak() && + CXI->getSuccessOrdering() == CXI2->getSuccessOrdering() && + CXI->getFailureOrdering() == CXI2->getFailureOrdering() && + CXI->getSyncScopeID() == CXI2->getSyncScopeID(); + } + if (const AtomicRMWInst *RMWI = dyn_cast(I1)) + return RMWI->getOperation() == cast(I2)->getOperation() && + RMWI->isVolatile() == cast(I2)->isVolatile() && + RMWI->getOrdering() == cast(I2)->getOrdering() && + RMWI->getSyncScopeID() == cast(I2)->getSyncScopeID(); + + return true; +} + +// Determine whether two GEP operations perform the same underlying arithmetic. +bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1, + const GEPOperator *GEP2) { + // When we have target data, we can reduce the GEP down to the value in bytes + // added to the address. 
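+  // With all-constant indices, two GEPs are equivalent iff they add the same
+  // byte offset, even when the individual index lists differ.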
+ if (GEP1->hasAllConstantIndices() && GEP2->hasAllConstantIndices()) { + SmallVector Indices1(GEP1->idx_begin(), GEP1->idx_end()); + SmallVector Indices2(GEP2->idx_begin(), GEP2->idx_end()); + uint64_t Offset1 = DL->getIndexedOffsetInType(GEP1->getSourceElementType(), + Indices1); + uint64_t Offset2 = DL->getIndexedOffsetInType(GEP2->getSourceElementType(), + Indices2); + return Offset1 == Offset2; + } + + if (GEP1->getPointerOperand()->getType() != + GEP2->getPointerOperand()->getType()) + return false; + + if (GEP1->getNumOperands() != GEP2->getNumOperands()) + return false; + + for (unsigned i = 0, e = GEP1->getNumOperands(); i != e; ++i) { + if (!enumerate(GEP1->getOperand(i), GEP2->getOperand(i))) + return false; + } + + return true; +} + +// Compare two values used by the two functions under pair-wise comparison. If +// this is the first time the values are seen, they're added to the mapping so +// that we will detect mismatches on next use. +bool FunctionComparator::enumerate(const Value *V1, const Value *V2, + bool NoSelfRef/*=false*/) { + // Check for function @f1 referring to itself and function @f2 referring to + // itself. For compatibility with llvm's MergeFunctions, disallow referring to + // each other, or both referring to either of them. + if (!NoSelfRef && V1 == F1 && V2 == F2) + return true; + + // FIXME: This is very conservative for now, but keeping this for thinlto. + if (isa(V1) || isa(V2)) + return false; + if (const Constant *C1 = dyn_cast(V1)) { + if (V1 == V2) return true; + const Constant *C2 = dyn_cast(V2); + if (!C2) return false; + // TODO: constant expressions with GEP or references to F1 or F2. + if (C1->isNullValue() && C2->isNullValue() && + isEquivalentType(C1->getType(), C2->getType())) + return true; + // Try bitcasting C2 to C1's type. If the bitcast is legal and returns C1 + // then they must have equal bit patterns. Aggregate types cannot be + // bitcast. + if (C1->getType()->isAggregateType() || C2->getType()->isAggregateType()) + return false; + return C1->getType()->canLosslesslyBitCastTo(C2->getType()) && + C1 == ConstantExpr::getBitCast(const_cast(C2), C1->getType()); + } + + if (isa(V1) || isa(V2)) + return V1 == V2; + + // Check that V1 maps to V2. If we find a value that V1 maps to then we simply + // check whether it's equal to V2. When there is no mapping then we need to + // ensure that V2 isn't already equivalent to something else. For this + // purpose, we track the V2 values in a set. + + ValueToValueMapTy::iterator I = id_map.find(V1); + if (I != id_map.end()) + return V2 == I->second; + // FIXME: Const casts!!! + if (!seen_values.insert(std::make_pair(V2, const_cast(V1))).second) + return false; + id_map[V1] = const_cast(V2); + return true; +} + +/// Test whether two basic blocks have equivalent behaviour. Returns true if the +/// blocks can be merged, false if they cannot. Differing instructions are +/// recorded in DifferingInstructions. 
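+/// Debug intrinsic calls are skipped, and PHI node pairs are collected in
+/// PHIsFound for comparison after all blocks have been visited.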
+bool FunctionComparator::compare(
+    const BasicBlock *BB1, const BasicBlock *BB2,
+    std::list<std::pair<const PHINode *, const PHINode *>> *PHIsFound) {
+  BasicBlock::const_iterator F1I, F1E, F2I, F2E;
+
+  for (F1I = BB1->begin(), F1E = BB1->end(),
+       F2I = BB2->begin(), F2E = BB2->end();
+       F1I != F1E && F2I != F2E; ++F1I, ++F2I) {
+    // Skip debug information.
+    const CallInst *DbgCall;
+    while (F1I != F1E && (DbgCall = dyn_cast<CallInst>(F1I)) &&
+           DbgCall->getCalledFunction() &&
+           DbgCall->getCalledFunction()->hasName() &&
+           DbgCall->getCalledFunction()->getName().startswith("llvm.dbg."))
+      ++F1I;
+
+    while (F2I != F2E && (DbgCall = dyn_cast<CallInst>(F2I)) &&
+           DbgCall->getCalledFunction() &&
+           DbgCall->getCalledFunction()->hasName() &&
+           DbgCall->getCalledFunction()->getName().startswith("llvm.dbg."))
+      ++F2I;
+
+    if (F1I == F1E || F2I == F2E)
+      break;
+
+    // Ok, we're dealing with real instructions. Check a few cases that will
+    // prevent merging first.
+    const Instruction *F1In = &*F1I;
+    const Instruction *F2In = &*F2I;
+
+    // Cannot merge insts that differ in whether they have uses.
+    if (F1In->use_empty() != F2In->use_empty()) {
+      // TODO: Could implement merging for this case (would need to introduce a
+      // dummy value in the PHI node etc.)
+      return false;
+    }
+
+    // Cannot merge insts whose types are non-equivalent.
+    if (!isEquivalentType(F1In->getType(), F2In->getType())) {
+      return false;
+    }
+
+    // TODO: Currently cannot merge InvokeInsts with differing result types
+    //       that have uses. We cannot push up a bitcast into their block after
+    //       them because they are terminators. Would need to insert an
+    //       additional BB.
+    if (isa<InvokeInst>(F1In) && !F1In->use_empty() &&
+        F1In->getType() != F2In->getType())
+      return false;
+
+    if (!enumerate(F1In, F2In))
+      goto differing_instructions;
+
+    if (const GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(F1In)) {
+      const GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(F2In);
+      if (!GEP2)
+        goto differing_instructions;
+
+      if (!enumerate(GEP1->getPointerOperand(), GEP2->getPointerOperand()))
+        goto differing_instructions;
+
+      if (!isEquivalentGEP(GEP1, GEP2))
+        goto differing_instructions;
+    } else if (const PHINode *Phi1 = dyn_cast<PHINode>(F1In)) {
+      const PHINode *Phi2 = dyn_cast<PHINode>(F2In);
+      // We can't currently merge a PHI and non-PHI instruction.
+      if (!Phi2)
+        return false;
+
+      // We can't currently merge PHI nodes with different numbers of incoming
+      // values.
+      if (F1In->getNumOperands() != F2In->getNumOperands())
+        return false;
+
+      // We need to treat PHI nodes specially. Their incoming values may be in a
+      // different order even if they are equivalent. We can't compare them
+      // until we've seen the incoming blocks and know which values are
+      // equivalent. Therefore postpone PHINode comparison until the end.
+ PHIsFound->push_back(std::make_pair(Phi1, Phi2)); + } else { + if (!isEquivalentOperation(F1In, F2In)) + goto differing_instructions; + + bool IsCall = isa(F1In); + assert(F1In->getNumOperands() == F2In->getNumOperands()); + for (unsigned i = 0, e = F1In->getNumOperands(); i != e; ++i) { + Value *OpF1 = F1In->getOperand(i); + Value *OpF2 = F2In->getOperand(i); + + // Allow self-reference if this is a call instruction and the last + // operand which is the called function + bool AllowSelfRef = IsCall && (i + 1) == e; + + if (!enumerate(OpF1, OpF2, !AllowSelfRef)) + goto differing_instructions; + + if (!isEquivalentType(OpF1->getType(), OpF2->getType())) + goto differing_instructions; + + if ((OpF1 == F1 && OpF2 == F2) || (OpF1 == F2 && OpF2 == F1)) + SelfRefInstructions[F1In] = F2In; + } + } + + continue; + +differing_instructions: + // Cannot merge functions with differing landing pad instructions yet. They + // would need special treatment which involves updating the corresponding + // invoke instructions. + if (isa(F1In)) + return false; + if (isa(F1In)) + return false; + + DifferingInstructions[F1In] = F2In; + } + + // We cannot currently merge basic blocks with different instruction counts + return F1I == F1E && F2I == F2E; +} + +bool FunctionComparator::comparePHIs( + const std::list > &PHIs) { + if (PHIs.empty()) + return true; + + for (std::list >::const_iterator + I = PHIs.begin(), E = PHIs.end(); I != E; ++I) { + const PHINode *Phi1 = I->first, *Phi2 = I->second; + + for (unsigned ValId = 0, ValNum = Phi1->getNumIncomingValues(); + ValId < ValNum; ++ValId) { + Value *Phi1Val = Phi1->getIncomingValue(ValId); + + // Get corresponding Phi2Val + Value *BBinPhi2Val = getF1toF2Map()[Phi1->getIncomingBlock(ValId)]; + + if (!BBinPhi2Val) + return false; // Currently can't handle differing predecessor blocks + + BasicBlock *BBinPhi2 = cast(BBinPhi2Val); + Value *Phi2Val = Phi2->getIncomingValueForBlock(BBinPhi2); + + // Enumerate the values. If the PHI node references the function itself (a + // very rare case), we mark it as different (NoSelfRef). This is only + // necessary for outline merging, not equiv merging. TODO: Make equal + // merging possible with such PHI nodes. + if (!enumerate(Phi1Val, Phi2Val,/*NoSelfRef=*/true)) { + DifferingInstructions[Phi1] = Phi2; + break; + } + } + } + + return true; +} + +// Test whether the two functions have equivalent behaviour. +bool FunctionComparator::compare() { + // We need to recheck everything, but check the things that weren't included + // in the hash first. + if (F1->getAttributes() != F2->getAttributes()) + goto not_mergeable; + + if (F1->hasGC() != F2->hasGC()) + goto not_mergeable; + + if (F1->hasGC() && F1->getGC() != F2->getGC()) + goto not_mergeable; + + if (!F1->getSection().equals(F2->getSection())) + goto not_mergeable; + + if (F1->isVarArg() != F2->isVarArg()) + goto not_mergeable; + + if (F1->isInterposable() != F2->isInterposable()) + goto not_mergeable; + + if (F1->size() != F2->size()) + goto not_mergeable; + + // TODO: if it's internal and only used in direct calls, we could handle + // this case too. + if (F1->getCallingConv() != F2->getCallingConv()) + goto not_mergeable; + + if (!isEquivalentType(F1->getFunctionType(), F2->getFunctionType())) + goto not_mergeable; + + assert(F1->arg_size() == F2->arg_size() && + "Identically typed functions have different numbers of args!"); + + // Visit the arguments so that they get enumerated in the order they're + // passed in. 
+ for (Function::const_arg_iterator f1i = F1->arg_begin(), + f2i = F2->arg_begin(), f1e = F1->arg_end(); f1i != f1e; ++f1i, ++f2i) { + if (!enumerate(&*f1i, &*f2i)) + llvm_unreachable("Arguments repeat!"); + } + + // We do a CFG-ordered walk since the actual ordering of the blocks in the + // linked list is immaterial. Our walk starts at the entry block for both + // functions, then takes each block from each terminator in order. As an + // artifact, this also means that unreachable blocks are ignored. + { + SmallVector F1BBs, F2BBs; + SmallSet VisitedBBs; // in terms of F1. + std::list > PHIsFound; + + F1BBs.push_back(&F1->getEntryBlock()); + F2BBs.push_back(&F2->getEntryBlock()); + + VisitedBBs.insert(F1BBs[0]); + while (!F1BBs.empty()) { + const BasicBlock *F1BB = F1BBs.pop_back_val(); + const BasicBlock *F2BB = F2BBs.pop_back_val(); + + // Check for control flow divergence + if (!enumerate(F1BB, F2BB)) + goto not_mergeable; + + const Instruction *F1TI = F1BB->getTerminator(); + const Instruction *F2TI = F2BB->getTerminator(); + + // TODO: Implement merging of blocks with different numbers of + // instructions. + if (F1TI->getNumSuccessors() != F2TI->getNumSuccessors() || + F1BB->size() != F2BB->size()) + goto not_mergeable; + + // The actual instruction-by-instruction comparison + if (!compare(F1BB, F2BB, &PHIsFound)) + goto not_mergeable; + + // FIXME: Count this in compare(F1BB,F2BB) so it doesn't include debug + // instructions. + InstructionCount += std::max(F1BB->size(), F2BB->size()); + + assert(F1TI->getNumSuccessors() == F2TI->getNumSuccessors()); + for (unsigned i = 0, e = F1TI->getNumSuccessors(); i != e; ++i) { + if (!VisitedBBs.insert(F1TI->getSuccessor(i)).second) + continue; + + F1BBs.push_back(F1TI->getSuccessor(i)); + F2BBs.push_back(F2TI->getSuccessor(i)); + } + } + + BasicBlockCount = VisitedBBs.size(); + + // After we've seen all values and BBs, compare the PHI nodes + if (!comparePHIs(PHIsFound)) + goto not_mergeable; + } + + if (DifferingInstructions.size()) { + // Currently we can't merge vararg functions with differing instructions. + // TODO: Explore whether this is feasible; the difficult bit is the + // additional argument we need to add. + if (F1->isVarArg()) + goto not_mergeable; + + isDifferent = true; + DifferingInstructionsCount += DifferingInstructions.size(); + + LLVM_DEBUG(float Metric = ((float)InstructionCount - DifferingInstructionsCount) + / InstructionCount*100; + dbgs() << "Similar fns: " << F1->getName() << " and " << F2->getName() + << " bbs=" << BasicBlockCount << " insts=" << InstructionCount + << " failed=" << DifferingInstructionsCount << " metric=" + << format("%0.2f", Metric) + << '\n'); + } + + return true; + +not_mergeable: + // Fail: cannot merge the two functions + isNotMergeable = true; + return false; +} + +bool FunctionComparator::isExactMatch() { + return (!isNotMergeable && !isDifferent); +} + +bool FunctionComparator::isMergeCandidate() { + if (isNotMergeable) + return false; + + if (!isDifferent) + return true; + + // Heuristic when to attempt merging + if (InstructionCount > MergeDifferingMinInsts && + DifferingInstructionsCount <= MergeMaxDiffering && + getSimilarityMetric() > MergeMinSimilarity) + return true; + + // Tolerate higher difference with higher similarity. 
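+  // For example, a pair of functions with more than 100 instructions is still
+  // accepted with up to 60 differing instructions, provided the similarity
+  // metric stays above 90%.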
+ if (InstructionCount > 100 && + DifferingInstructionsCount <= 60 && + getSimilarityMetric() > 90 ) + return true; + + return false; +} + +namespace { + +struct FunctionComparatorOrdering { + bool operator () (FunctionComparator *LHS, FunctionComparator *RHS) const { + unsigned MetricLHS = LHS->getSimilarityMetric(), + MetricRHS = RHS->getSimilarityMetric(); + + // If the metric is the same, then default to the unique ID. We need + // to use a unique value instead of the object address to ensure + // deterministic ordering. + if (MetricLHS == MetricRHS) + return LHS->getID() > RHS->getID(); + return MetricLHS > MetricRHS; + } +}; + +class MergeRegistry { +public: + typedef MapVector > FnCompareMap; + typedef std::set + FnComparatorSet; + typedef std::map SimilarFnMap; + + ~MergeRegistry(); + + void clear(); + + /// Defer a function for consideration in the next round. + void defer(Function *F); + + /// Return true if we have deferred functions that can be enqueued. + bool haveDeferred() { return !Deferred.empty(); } + + /// Move all the deferred functions into buckets to consider them for merging. + /// Returns number of functions that have been added. + unsigned enqueue(); + + /// Add a candidate for merging + void insertCandidate(FunctionComparator *Comp); + + /// Remove a Function from the FnSet and queue it up for a second sweep of + /// analysis if Reanalyze is set. If it is a candidate for merging, remove it + /// from consideration. + void remove(Function *F, bool Reanalyze=true); + + /// Return the similarity metric of the most similar function to F that is + /// not listed in the Ignore set. + unsigned getMaxSimilarity(Function *F, const DenseSet &Ignore); + + /// The collection of buckets that contain functions that may be similar to + /// each other (same hash value). + FnCompareMap FunctionsToCompare; + + std::list FunctionsToMerge; + SimilarFnMap SimilarFunctions; + +private: + typedef std::vector FnDeferredQueue; + + /// A work queue of functions that may have been modified and should be + /// analyzed again. + FnDeferredQueue Deferred; +}; + +} // end anonymous namespace + +MergeRegistry::~MergeRegistry() { + this->clear(); +} + +void MergeRegistry::clear() { + Deferred.clear(); + SimilarFunctions.clear(); + for (std::list::iterator + I = FunctionsToMerge.begin(), E = FunctionsToMerge.end(); + I != E; ++I) { + FunctionComparator *FC = *I; + delete FC; + } + FunctionsToMerge.clear(); + FunctionsToCompare.clear(); +} + +static bool isAliasCapable(Function* G) { + return + UseGlobalAliases && G->hasGlobalUnnamedAddr() + && (G->hasExternalLinkage() || G->hasLocalLinkage() || G->hasWeakLinkage()); +} + +static bool isComparisonCandidate(Function *F) { + if (Opt::MergeLevel == Opt::size) { + // Only consider functions that are to be optimized for size. + // By default, that is all functions at -Os/-Oz and nothing at -O2. + bool Os = F->getAttributes(). + hasAttribute(AttributeList::FunctionIndex, Attribute::OptimizeForSize); + bool Oz = F->getAttributes(). 
+ hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); + if (!Os && !Oz) + return false; + } + + // Ignore declarations and tiny functions - no point in merging those + if (F->isDeclaration()) return false; + if (F->getName().endswith(MERGED_SUFFIX)) return false; + if (F->hasAvailableExternallyLinkage()) return false; + if (F->hasFnAttribute(Attribute::AlwaysInline)) return false; + if (F->size() == 1 && F->begin()->size() < MergeMinInsts) + return isAliasCapable(F); + + return true; +} + +void MergeRegistry::defer(Function *F) { + if (isComparisonCandidate(F)) + Deferred.push_back(F); +} + +// Move functions from Deferred into buckets. remove() may have been called +// multiple times for the same function, so eliminate duplicates using the +// set. We reverse them because MergeSimilarFunctions::insert inserts at the +// front of each bucket. +unsigned MergeRegistry::enqueue() { + DenseSet InsertedFuncs; + + for (std::vector::reverse_iterator + DefI = Deferred.rbegin(), DefE = Deferred.rend(); + DefI != DefE; ++DefI) { + Value *V = *DefI; + Function *F = dyn_cast_or_null(V); + if (!F) continue; + if (InsertedFuncs.find(F) != InsertedFuncs.end()) continue; + if (!isComparisonCandidate(F)) continue; + + unsigned Hash = profileFunction(F); + FunctionsToCompare[Hash].push_front(F); + + InsertedFuncs.insert(F); + } + + Deferred.clear(); + + return InsertedFuncs.size(); +} + +void MergeRegistry::insertCandidate(FunctionComparator *Comp) { + FunctionsToMerge.push_back(Comp); + SimilarFunctions[Comp->getF1()].insert(Comp); +} + +static void removeFromBucket(Function *F, + std::list &Bucket) { + for (std::list::iterator + I = Bucket.begin(), E = Bucket.end(); I != E; ++I) { + if (I->getFunc() == F) { + Bucket.erase(I); + return; + } + } +} + +void MergeRegistry::remove(Function *F, bool Reanalyze/*=true*/) { + // There is no need to remove a function that is not already + // in a bucket. + if (!isComparisonCandidate(F)) + return; + + unsigned Hash = profileFunction(F); + std::list &Bucket = FunctionsToCompare[Hash]; + + removeFromBucket(F, Bucket); + + if (Reanalyze) + Deferred.push_back(F); + + // Check whether we have any existing FunctionComparator objects for this fn. + // If yes, discard them because F has changed. Retry merging for those + // functions by adding them to Deferred. 
+ std::list::iterator I = FunctionsToMerge.begin(); + while (I != FunctionsToMerge.end()) { + FunctionComparator *Comp = *I; + if (Comp->getF1() == F) { + Function *OtherF = Comp->getF2(); + Deferred.push_back(OtherF); + removeFromBucket(OtherF, Bucket); + if (!SimilarFunctions[F].erase(Comp)) + llvm_unreachable("Inconsistent SimilarFunctions set"); + I = FunctionsToMerge.erase(I); + delete Comp; + } else if (Comp->getF2() == F) { + Function *OtherF = Comp->getF1(); + Deferred.push_back(OtherF); + removeFromBucket(OtherF, Bucket); + if (!SimilarFunctions[OtherF].erase(Comp)) + llvm_unreachable("Inconsistent SimilarFunctions set"); + I = FunctionsToMerge.erase(I); + delete Comp; + } else { + ++I; + } + } +} + +unsigned MergeRegistry::getMaxSimilarity(Function *F, + const DenseSet &Ignore) { + FnComparatorSet &Similar = SimilarFunctions[F]; + + for (FnComparatorSet::iterator I = Similar.begin(), E = Similar.end(); + I != E; ++I) { + FunctionComparator *Comp = *I; + if (Ignore.count(Comp->getF2())) + continue; + + return Comp->getSimilarityMetric(); + } + + return 0; +} + +namespace { + +class MergeSimilarFunctions : public ModulePass { +public: + static char ID; + MergeSimilarFunctions(const ModuleSummaryIndex *Summary = nullptr) + : ModulePass(ID) { + initializeMergeSimilarFunctionsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M); + +private: + /// Find the functions that use this Value and remove them from FnSet and + /// queue the functions. + void removeUsers(Value *V); + + /// Replace all direct calls of Old with calls of New. Will bitcast New if + /// necessary to make types match. + void replaceDirectCallers(Function *Old, Function *New); + + /// Process functions in the specified bucket, by either doing equiv merging + /// marking them for diff merging. Returns false if the bucket needs to be + /// re-scanned after an equiv merge. Sets Changed if the module was changed by + /// equiv merge. + bool mergeBucket(std::list &Fns, bool &Changed); + + /// Exhaustively compare all functions in each bucket and do equiv merging + /// where possible. Functions that have already been compared will not be + /// compared again. Returns true if the module was modified. + bool doExhaustiveCompareMerge(); + + /// Merge all the functions marked for diff merging. Returns true if the + /// module was modified. + bool doDiffMerge(); + + /// Merge two equivalent functions. Upon completion, G may be deleted, or may + /// be converted into a thunk. In either case, it should never be visited + /// again. + void mergeTwoFunctions(Function *F, Function *G); + + /// Merge a set of functions with differences. + void outlineAndMergeFunctions(SmallVectorImpl &Fns); + + /// Replace G with a thunk or an alias to F. Deletes G. + void writeThunkOrAlias(Function *F, Function *G); + + /// Replace G with a simple tail call to bitcast(F). Also replace direct uses + /// of G with bitcast(F). Deletes G. + void writeThunk(Function *F, Function *G); + + /// Replace G with a tail call to F with an additional argument. + /// + void writeThunkWithChoice(Function *NewF, Function *OldF, int Choice); + + /// Replace G with an alias to F. Deletes G. + void writeAlias(Function *F, Function *G); + + /// DataLayout for more accurate GEP comparisons. May be NULL. + const DataLayout *DL; + + /// Merge registry. Stores all the information about functions being + /// considered for merging as well as current candidates for merging. 
+ MergeRegistry Registry; + +}; + +} // end anonymous namespace + +char MergeSimilarFunctions::ID = 0; +INITIALIZE_PASS(MergeSimilarFunctions, "mergesimilarfunc", + "Merge Similar Functions", false, false) + +ModulePass * +llvm::createMergeSimilarFunctionsPass(const ModuleSummaryIndex *S) { + return new MergeSimilarFunctions(S); +} + +bool MergeSimilarFunctions::runOnModule(Module &M) { + if (Opt::MergeLevel == Opt::none) + return false; + + bool Changed = false; + + DL = &M.getDataLayout(); + + for (auto &I : M) + Registry.defer(&I); + + do { + unsigned InsertCount = Registry.enqueue(); + + LLVM_DEBUG(dbgs() << "size of module: " << M.size() << '\n'); + LLVM_DEBUG(dbgs() << "size of worklist: " << InsertCount << '\n'); + (void)InsertCount; + + Changed |= doExhaustiveCompareMerge(); + } while (Registry.haveDeferred()); + + Changed |= doDiffMerge(); + + Registry.clear(); + return Changed; +} + +// Replace direct callers of Old with New. +void MergeSimilarFunctions::replaceDirectCallers(Function *Old, Function *New) { + Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType()); + for (Value::use_iterator UI = Old->use_begin(), UE = Old->use_end(); + UI != UE;) { + Use *U = &*UI; + ++UI; + CallSite CS(U->getUser()); + if (CS && CS.isCallee(U)) { + Registry.remove(CS.getInstruction()->getParent()->getParent()); + U->set(BitcastNew); + } + } +} + +// Replace G with an alias to F if possible, or else a thunk to F. Deletes G. +void MergeSimilarFunctions::writeThunkOrAlias(Function *F, Function *G) { + if (isAliasCapable(G)) { + writeAlias(F, G); + return; + } + + writeThunk(F, G); +} + +static void writeThunkBody(Function *Thunk, Function *F, + ConstantInt *Choice, const DataLayout *DL) { + BasicBlock *BB = &Thunk->getEntryBlock(); + IRBuilder<> Builder(BB); + + SmallVector Args; + unsigned i = 0; + FunctionType *FFTy = F->getFunctionType(); + Type *IntPtrTy = DL->getIntPtrType(FFTy->getContext()); + for (auto &AI : Thunk->args()) { + Value *Cast = createCastIfNeeded(&AI, FFTy->getParamType(i), BB, IntPtrTy, DL); + Args.push_back(Cast); + ++i; + } + if (Choice) + Args.push_back(Choice); + + CallInst *CI = Builder.CreateCall(F, Args); + CI->setTailCall(); + CI->setCallingConv(F->getCallingConv()); + CI->setAttributes(F->getAttributes()); + CI->setIsNoInline(); + if (Thunk->getReturnType()->isVoidTy()) { + Builder.CreateRetVoid(); + } else { + Type *RetTy = Thunk->getReturnType(); + if (CI->getType()->isIntegerTy() && RetTy->isPointerTy()) + Builder.CreateRet(Builder.CreateIntToPtr(CI, RetTy)); + else if (CI->getType()->isPointerTy() && RetTy->isIntegerTy()) + Builder.CreateRet(Builder.CreatePtrToInt(CI, RetTy)); + else { + Value *Cast = createCastIfNeeded(CI, RetTy, BB, IntPtrTy, DL); + Builder.CreateRet(Cast); + } + } +} + +// Replace G with a simple tail call to bitcast(F). Also replace direct uses +// of G with bitcast(F). Deletes G. +void MergeSimilarFunctions::writeThunk(Function *F, Function *G) { + if (!G->isInterposable()) { + // Redirect direct callers of G to F. + replaceDirectCallers(G, F); + } + + // If G was internal then we may have replaced all uses of G with F. If so, + // stop here and delete G. There's no need for a thunk. + if (G->hasLocalLinkage() && G->use_empty()) { + LLVM_DEBUG(dbgs() << "All uses of " << G->getName() << " replaced by " + << F->getName() << ". 
Removing it.\n"); + G->eraseFromParent(); + return; + } + + Function *NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "", + G->getParent()); + BasicBlock::Create(F->getContext(), "", NewG); + + writeThunkBody(NewG, F, nullptr, DL); + + NewG->copyAttributesFrom(G); + NewG->takeName(G); + removeUsers(G); + G->replaceAllUsesWith(NewG); + G->eraseFromParent(); + + LLVM_DEBUG(dbgs() << "writeThunk: " << NewG->getName() << " calling " + << F->getName() << '\n'); + ++NumThunksWritten; +} + +void MergeSimilarFunctions::writeThunkWithChoice(Function *NewF, Function *OldF, + int Choice) { + // Deleting the body of a function sets its linkage to External. Save the old + // one here and restore it at the end. + GlobalValue::LinkageTypes OldFLinkage = OldF->getLinkage(); + + // Delete OldF's body + OldF->deleteBody(); + BasicBlock::Create(OldF->getContext(), "", OldF); + + // Insert single BB with tail call + IntegerType *Int32Ty = Type::getInt32Ty(OldF->getContext()); + ConstantInt *ChoiceConst = ConstantInt::get(Int32Ty, Choice); + writeThunkBody(OldF, NewF, ChoiceConst, DL); + OldF->setLinkage(OldFLinkage); +} + +// Replace G with an alias to F and delete G. +void MergeSimilarFunctions::writeAlias(Function *F, Function *G) { + + // Replace all current uses of G in constants with F. This handles virtual + // table and other references. Do this first so that we don't modify thge + // global alias we're about to create. + SmallVector Uses; + for (auto I = G->use_begin(), E = G->use_end(); I != E; ++I) { + Use *U = I.operator->(); + Constant *CV = dyn_cast(U->getUser()); + if (!CV) continue; + Uses.push_back(U); + } + for (auto I = Uses.begin(), E= Uses.end(); I != E; ++I) { + Use *U = *I; + U->set(F); + } + + PointerType *PTy = G->getType(); + auto *GA = GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), + G->getLinkage(), "", F); + F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); + GA->takeName(G); + GA->setVisibility(G->getVisibility()); + removeUsers(G); + G->replaceAllUsesWith(GA); + G->eraseFromParent(); + + LLVM_DEBUG(dbgs() << "writeAlias: " << GA->getName() << '\n'); + ++NumAliasesWritten; +} + +// Merge two equivalent functions. Upon completion, Function G is deleted. +void MergeSimilarFunctions::mergeTwoFunctions(Function *F, Function *G) { + if (F->isInterposable()) { + assert(G->isInterposable()); + + if (UseGlobalAliases) { + // Make them both thunks to the same internal function. + Function *H = Function::Create(F->getFunctionType(), F->getLinkage(), "", + F->getParent()); + H->copyAttributesFrom(F); + H->takeName(F); + removeUsers(F); + F->replaceAllUsesWith(H); + + unsigned MaxAlignment = std::max(G->getAlignment(), H->getAlignment()); + + writeAlias(F, G); + writeAlias(F, H); + + F->setAlignment(MaxAlignment); + F->setLinkage(GlobalValue::PrivateLinkage); + } else { + // We can't merge them. Instead, pick one and update all direct callers + // to call it and hope that we improve the instruction cache hit rate. 
+ replaceDirectCallers(G, F); + } + + ++NumDoubleWeak; + } else { + writeThunkOrAlias(F, G); + } + + ++NumFunctionsMerged; +} + +static Value *getLastArg(Function *F) { + auto it = F->arg_begin(); + std::advance(it, F->arg_size()-1); + return it; +} + +static void insertCondAndRemapInstructions( + Instruction *F1InstInNewF, const std::vector &F2Insts, + Function *NewF, ValueToValueMapTy &F1toNewF, + const SmallVectorImpl &Comps, + Type *IntPtrTy, const DataLayout *DL) { + assert(F2Insts.size() == Comps.size() && + "Mis-match between F2Insts & Comps!"); + + SmallVector F2InstsInNewF; + for (unsigned FnI = 0, FnE = F2Insts.size(); FnI != FnE; ++FnI) { + const Instruction *F2Inst = F2Insts[FnI]; + if (!F2Inst) { + F2InstsInNewF.push_back(NULL); + continue; + } + + Instruction *F2InstInNewF = F2Inst->clone(); + + // Remap F2Inst: F2 values -> F1 values + RemapInstruction(F2InstInNewF, Comps[FnI]->getF2toF1Map(), + RF_NoModuleLevelChanges); + // Remap F2Inst: F1 values -> NewF values + RemapInstruction(F2InstInNewF, F1toNewF, RF_NoModuleLevelChanges); + + F2InstsInNewF.push_back(F2InstInNewF); + } + + SmallVector Terminators; + SplitBlockAndInsertSwitch(getLastArg(NewF), F1InstInNewF, + F2InstsInNewF, Terminators); + + assert(Terminators.size() == F2InstsInNewF.size() + 1 && + "Not enough terminators returned"); + + // F2InstsInNewF are now hooked up to the correct values in NewF. However, + // some of their operands may be pointers/integers so they could potentially + // have the wrong type in NewF (since we treat all pointers and integers of + // same size as equal). Insert casts if needed. + for (unsigned FnI = 0, FnE = F2InstsInNewF.size(); FnI != FnE; ++FnI) { + Instruction *F2InstInNewF = F2InstsInNewF[FnI]; + if (!F2InstInNewF) + continue; + const Instruction *F2Inst = F2Insts[FnI]; + + for (unsigned OpId=0; OpId < F2InstInNewF->getNumOperands(); ++OpId) { + Value *F2NewFOperand = F2InstInNewF->getOperand(OpId); + Value *F2OrigOperand = F2Inst->getOperand(OpId); + + if (F2NewFOperand->getType() != F2OrigOperand->getType()) { + Value *Cast = createCastIfNeeded(F2NewFOperand, + F2OrigOperand->getType(), + F2InstInNewF, + IntPtrTy, DL); + F2InstInNewF->setOperand(OpId, Cast); + } + } + } + + if (ReturnInst *F1Ret = dyn_cast(F1InstInNewF)) { + // If we're handling differing return instructions, we need to ensure that + // they all return the same type. Since we treat pointer types as equal, we + // may need to insert a bitcast. + for (Instruction *F2Inst : F2InstsInNewF) { + if (!F2Inst) + continue; + + // F2Inst must also be a return instruction due to control flow + // isomorphism. + ReturnInst *F2Ret = cast(F2Inst); + + if (F2Ret->getReturnValue()->getType() != + F1Ret->getReturnValue()->getType()) + F2Ret->setOperand(0, + createCastIfNeeded(F2Ret->getReturnValue(), + F1Ret->getReturnValue()->getType(), + F2Ret, IntPtrTy, DL)); + } + } else if (!F1InstInNewF->use_empty()) { + // If the instructions have uses, we need to insert a PHI node. 
+ // + // We treat all pointer types as equal, so we may need to insert + // a bitcast to ensure that all incoming values of the PHI node have the + // same type + Type *F1IType = F1InstInNewF->getType(); + BasicBlock *TailBB = Terminators[0]->getSuccessor(0); + PHINode *Phi = + PHINode::Create(F1IType, F2InstsInNewF.size(), "", &TailBB->front()); + F1InstInNewF->replaceAllUsesWith(Phi); + + Phi->addIncoming(F1InstInNewF, F1InstInNewF->getParent()); + for (unsigned FnI = 0, FnE = F2InstsInNewF.size(); FnI != FnE; ++FnI) { + Instruction *F2InstInNewF = F2InstsInNewF[FnI]; + if (!F2InstInNewF) + continue; + + if (F2InstInNewF->getType() != F1IType) { + assert(!F2InstInNewF->isTerminator() && + "Cannot cast result of terminator instruction"); + + F2InstInNewF = cast( + createCastIfNeeded(F2InstInNewF, + F1IType, + Terminators[FnI+1], + IntPtrTy, DL)); + } + + Phi->addIncoming(F2InstInNewF, F2InstInNewF->getParent()); + } + } +} + +static void mergePHINode(const SmallVectorImpl &Fns, + Function *NewF, + ValueToValueMapTy &VMap, /* F1->FNew */ + const PHINode *F1PhiInst, + std::vector F2PhiInsts) { + PHINode *F1PhiInNewF = dyn_cast(VMap[F1PhiInst]); + assert(F1PhiInNewF && "Cannot find F1Inst in NewF!"); + + // The incoming blocks in any of the F2PhiInsts may be in a different order. + // If this is the case, we have to reorder them. F2PhiInsts is intentionally a + // copy, so we can modify it + SmallVector GCInsts; // so we can delete them later. + for (unsigned FnI = 0, FnE = F2PhiInsts.size(); FnI != FnE; ++FnI) { + const PHINode *F2PhiInst = dyn_cast_or_null(F2PhiInsts[FnI]); + if (!F2PhiInst) + continue; + + for (unsigned I = 0, E = F1PhiInNewF->getNumIncomingValues(); I < E; ++I) { + if (!Fns[FnI]->enumerate(F1PhiInst->getIncomingBlock(I), + F2PhiInst->getIncomingBlock(I))) { + // Non-equivalent blocks in the same position - need to reorder PhiInst + PHINode *ReorderedF2PhiInst = PHINode::Create(F2PhiInst->getType(), E); + + for (unsigned II = 0; II < E; ++II) { + Value *BBVal = + Fns[FnI]->getF1toF2Map()[F1PhiInst->getIncomingBlock(II)]; + BasicBlock *BB = cast(BBVal); + Value *Val = F2PhiInst->getIncomingValueForBlock(BB); + ReorderedF2PhiInst->addIncoming(Val, BB); + } + + F2PhiInsts[FnI] = ReorderedF2PhiInst; + GCInsts.push_back(ReorderedF2PhiInst); + break; + } + } + } + + // Now merge the PHI nodes. + for (unsigned i = 0; i < F1PhiInNewF->getNumIncomingValues(); ++i) { + Value *F1InValNewF = F1PhiInNewF->getIncomingValue(i), + *F1InVal = F1PhiInst->getIncomingValue(i); + BasicBlock *F1NewFInBlock = F1PhiInNewF->getIncomingBlock(i); + // If this is a repeat occurrence of the same incoming BasicBlock, we + // will have already dealt with it in a previous iteration. + if (F1PhiInNewF->getBasicBlockIndex(F1PhiInNewF->getIncomingBlock(i)) != + (int)i) + continue; + + Value *NewIncoming = F1InValNewF; + + Instruction *InsertPt = F1NewFInBlock->getTerminator(); + + // Build up a chain of cmps and selects that pick the correct incoming + // value. 
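+      // Illustrative shape of the resulting chain for two merged-in functions
+      // (%__merge_arg is the selector argument appended to NewF):
+      //   %is.f2 = icmp eq i32 %__merge_arg, 1
+      //   %sel.1 = select i1 %is.f2, <F2 incoming value>, <F1 incoming value>
+      //   %is.f3 = icmp eq i32 %__merge_arg, 2
+      //   %sel.2 = select i1 %is.f3, <F3 incoming value>, %sel.1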
+ for (unsigned FnI = 0, FnE = F2PhiInsts.size(); FnI != FnE; ++FnI) { + if (!F2PhiInsts[FnI]) + continue; + const PHINode *F2PhiInst = cast(F2PhiInsts[FnI]); + Value *F2InVal = F2PhiInst->getIncomingValue(i); + + // If we know these are equivalent, there's no further work to do + if (Fns[FnI]->enumerate(F1InVal, F2InVal,/*NoSelfRef=*/true) && + Fns[FnI]->enumerate(F1PhiInst->getIncomingBlock(i), + F2PhiInst->getIncomingBlock(i))) + continue; + + assert(Fns[FnI]->enumerate(F1PhiInst->getIncomingBlock(i), + F2PhiInst->getIncomingBlock(i)) && + "Non-equivalent incoming BBs in PHI."); + + // We have different incoming values from the same block + // Translate F2's incoming value to NewF if needed + Value *F2InValNewF = F2InVal; + if (!isa(F2InVal)) { + Value *V = Fns[FnI]->getF2toF1Map()[F2InVal]; // F2->F1 + F2InValNewF = VMap[V]; // F1->NewF + assert(V && F2InValNewF && "Cannot map F2InVal to NewF"); + } + + // Cast F2InValNewF to the correct type if needed + LLVMContext &Ctx = F1InValNewF->getType()->getContext(); + const DataLayout *FTD = Fns[FnI]->getDataLayout(); + Type *IntPtrTy = FTD ? FTD->getIntPtrType(Ctx) : NULL; + F2InValNewF = createCastIfNeeded(F2InValNewF, F1InValNewF->getType(), + InsertPt, IntPtrTy, FTD); + + // Create compare & select + Value *ChoiceArg = getLastArg(NewF); + Value *SelectBit = new ICmpInst(InsertPt, + ICmpInst::ICMP_EQ, + getLastArg(NewF), + ConstantInt::get(ChoiceArg->getType(), + FnI+1)); + + // SelectBit true -> F2InValNewF, SelectBit false -> existing NewIncoming. + NewIncoming = SelectInst::Create(SelectBit, F2InValNewF, NewIncoming, "", + InsertPt); + } + + if (NewIncoming == F1InValNewF) + continue; // no change for this incoming value + + // Replace all occurrences of this incoming value/block by the new + // ones (phi nodes can have repeated arguments) + for (unsigned j=i; j < F1PhiInNewF->getNumIncomingValues(); ++j) { + if (F1PhiInNewF->getIncomingBlock(j) == F1NewFInBlock) { + F1PhiInNewF->setIncomingValue(j, NewIncoming); + } + } + } + + // Garbage-collect the reordered PHI nodes we temporarily created. + for (SmallVectorImpl::iterator I = GCInsts.begin(), + E = GCInsts.end(); I != E; ++I) + delete *I; +} + +static bool rewriteRecursiveCall( + const CallInst *F1I, const CallInst *F2I, CallInst *NewFI, + const Function *F1, const Function *F2, Function *NewF) { + if (!(F1I->getCalledFunction() == F1 && F2I->getCalledFunction() == F2) && + !(F1I->getCalledFunction() == F2 && F2I->getCalledFunction() == F1)) + return false; // not a recursive/mutually recursive call + + // Replace NewFI by recursive call to NewF with additional choice argument + SmallVector Args; + for (unsigned AI = 0, End = NewFI->getNumArgOperands(); AI < End; ++AI) { + Value *Arg = NewFI->getArgOperand(AI); + + // Check if F1 or F2 is one of the arguments (veeery unusual case, don't + // handle it for now). + if (Arg == F1 || Arg == F2) + return false; + + Args.push_back(Arg); + } + + if (F1I->getCalledFunction() == F1 && F2I->getCalledFunction() == F2) { + Args.push_back(getLastArg(NewF)); + } else { + // Need to invert the choice argument + Value *ChoiceArg = getLastArg(NewF); + Constant *One = ConstantInt::get(ChoiceArg->getType(), 1); + Args.push_back(BinaryOperator::Create(Instruction::Xor, ChoiceArg, One, "", + NewFI)); + } + + CallInst *CI = CallInst::Create(NewF, Args); + CI->setCallingConv(NewF->getCallingConv()); + + ReplaceInstWithInst(NewFI, CI); + + return true; +} + +/// Clone F1 into a new function with F1's name + MERGE_SUFFIX. 
Adds an +/// additional i32 argument to the function. +static Function *cloneAndAddArgument(Function *F1, ValueToValueMapTy &VMap) { + LLVMContext &Context = F1->getContext(); + + std::vector ArgTypes; + for (const auto &Arg : F1->args()) + ArgTypes.push_back(Arg.getType()); + ArgTypes.push_back(Type::getInt32Ty(Context)); + + FunctionType *FTy = FunctionType::get(F1->getFunctionType()->getReturnType(), + ArgTypes, + F1->getFunctionType()->isVarArg()); + Function *NewF = Function::Create(FTy, F1->getLinkage(), + F1->getName()+MERGED_SUFFIX); + + insertFunctionAfter(NewF, F1); + + if (F1->hasSection()) + NewF->setSection(F1->getSection()); + + if (F1->getFunctionType()->isVarArg()) + NewF->setCallingConv(CallingConv::C); + else + NewF->setCallingConv(CallingConv::Fast); + + Function::arg_iterator DestI = NewF->arg_begin(); + for (auto &Arg : F1->args()) { + Argument *DestIn = &*DestI; + DestIn->setName(Arg.getName()); // Copy the name over... + VMap[&Arg] = DestIn; // Add mapping to VMap + ++DestI; + } + + // Name the selector argument + (*DestI).setName("__merge_arg"); + + SmallVector Returns; + CloneFunctionInto(NewF, F1, VMap, CloneType::ExtractingFunctions, Returns); + // Set linkage to set visibility to default. + NewF->setLinkage(GlobalValue::InternalLinkage); + + return NewF; +} + +typedef MapVector > + CombinedDiffMap; + +void MergeSimilarFunctions::outlineAndMergeFunctions( + SmallVectorImpl &Fns) { + assert(!Fns.empty() && "Cannot merge empty set of functions"); + + // All comparator instances in Fns share the same F1 + Function *F1 = Fns.front()->getF1(); + + // Clone F1 into new function with an additional i32 argument + ValueToValueMapTy VMap; // maps F1 values -> NewF values + Function *NewF = cloneAndAddArgument(F1, VMap); + + // Combine all the DifferingInstructions maps in Fns into one single map of + // lists to aid the merging process. + // + // Map F1 instruction -> list of F2 instructions indexed by position in Fns. + CombinedDiffMap AllDifferingInstructions; + for (unsigned I = 0, E = Fns.size(); I != E; ++I) { + FunctionComparator *Comp = Fns[I]; + for (MapVector::iterator + DiffI = Comp->DifferingInstructions.begin(), + DiffE = Comp->DifferingInstructions.end(); + DiffI != DiffE; ++DiffI) { + AllDifferingInstructions[DiffI->first].resize(Fns.size()); + AllDifferingInstructions[DiffI->first][I] = DiffI->second; + } + } + + // Merge differing PHI nodes. We need to handle these first because they could + // be affected later on when we split basic blocks, thus making them + // impossible to merge. + for (CombinedDiffMap::const_iterator I = AllDifferingInstructions.begin(), + E = AllDifferingInstructions.end(); + I != E; ++I) { + const PHINode *F1PhiInst = dyn_cast(I->first); + if (!F1PhiInst) + continue; + + const std::vector &F2PhiInsts = I->second; + + mergePHINode(Fns, NewF, VMap, F1PhiInst, F2PhiInsts); + } + + // Merge recursive calls + // + // TODO: We currently only support this optimization for pairs of functions. + // If more than two functions are merged, we mark the recursive calls as + // DifferingInstructions which causes switch statements to be inserted and + // recursive calls going through thunks. It wouldn't be too hard to implement + // self-recursive calls for multi-merges. *Mutually* recursive calls with + // multi-merges are a little trickier - that's why we leave it for now. 
+ if (Fns.size() == 1) { + FunctionComparator *Comp = Fns.front(); + for (MapVector::const_iterator + I = Comp->SelfRefInstructions.begin(), + E = Comp->SelfRefInstructions.end(); + I != E; ++I) { + const Instruction *F1I = I->first; + if (Comp->DifferingInstructions.count(F1I)) + continue; // Differing in other ways too, so deal with it later. + + // Attempt recursive call rewriting + if (isa(F1I)) { + const CallInst *F1Call = cast(F1I); + const CallInst *F2Call = dyn_cast(I->second); + CallInst *NewFCall = dyn_cast(VMap[F1I]); + + if (F1Call && F2Call && NewFCall && + rewriteRecursiveCall(F1Call, F2Call, NewFCall, + Comp->getF1(), Comp->getF2(), NewF)) + continue; + } + + // Can't rewrite it. Mark as differing and insert conditional later + Comp->DifferingInstructions[F1I] = I->second; + } + } else { + for (unsigned I = 0, E = Fns.size(); I != E; ++I) { + FunctionComparator *Comp = Fns[I]; + for (MapVector::const_iterator + II = Comp->SelfRefInstructions.begin(), + EE = Comp->SelfRefInstructions.end(); + II != EE; ++II) { + const Instruction *F1I = II->first; + if (Comp->DifferingInstructions.count(F1I)) + continue; // Differing in other ways too, so deal with it later. + + AllDifferingInstructions[F1I].resize(Fns.size()); + AllDifferingInstructions[F1I][I] = II->second; + } + } + } + + // For each differing instruction, splice basic block, and insert conditional + LLVMContext &Context = NewF->getContext(); + Type *IntPtrType = DL->getIntPtrType(Context); + for (CombinedDiffMap::const_iterator I = AllDifferingInstructions.begin(), + E = AllDifferingInstructions.end(); + I != E; ++I) { + const Instruction *F1Inst = I->first; + const std::vector &F2Insts = I->second; + + assert(VMap.find(F1Inst) != VMap.end() && + "Cannot find differing inst!"); + Instruction *F1InstInNewF = cast(VMap[F1Inst]); + + if (isa(F1InstInNewF)) + continue; // we already handled these above + + insertCondAndRemapInstructions(F1InstInNewF, F2Insts, + NewF, VMap, Fns, IntPtrType, DL); + } + + // Replace functions with thunks + PrintMerges("FNSM", F1, NewF); + writeThunkWithChoice(NewF, F1, 0); + for (unsigned FnI = 0, FnE = Fns.size(); FnI != FnE; ++FnI) { + Function *F2 = Fns[FnI]->getF2(); + PrintMerges("FNSM", F2, NewF); + writeThunkWithChoice(NewF, F2, FnI + 1); + } + NumSimilarFunctionsMerged += Fns.size() + 1; +} + +// For each instruction used by the value, remove() the function that contains +// the instruction. This should happen right before a call to RAUW. 
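+// Constant users (e.g. vtable initializers referencing the function) are
+// walked transitively so that functions reached through them are re-queued as
+// well.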
+void MergeSimilarFunctions::removeUsers(Value *V) { + std::vector Worklist; + Worklist.push_back(V); + while (!Worklist.empty()) { + Value *V = Worklist.back(); + Worklist.pop_back(); + + for (User *U : V->users()) { + if (Instruction *I = dyn_cast(U)) { + Registry.remove(I->getParent()->getParent()); + } else if (isa(U)) { + // do nothing + } else if (Constant *C = dyn_cast(U)) { + for (User *UU : C->users()) + Worklist.push_back(UU); + } + } + } +} + +bool MergeSimilarFunctions::mergeBucket(std::list &Fns, + bool &Changed) { + for (std::list::iterator FnI = Fns.begin(), + FnE = Fns.end(); FnI != FnE; ++FnI) { + if (!FnI->isNew()) + continue; + + if (!FnI->getFunc()) + continue; + + SmallVector DiffMergeCandidates; + + std::list::iterator Fn2I = FnI; + for (++Fn2I; Fn2I != FnE; ++Fn2I) { + if (!Fn2I->getFunc()) + continue; + + assert(FnI->getFunc() != Fn2I->getFunc() && + "Duplicate function in list!"); + + FunctionComparator *Comp = new FunctionComparator(DL, FnI->getFunc(), + Fn2I->getFunc()); + + if (!Comp->compare() || !Comp->isMergeCandidate()) { + delete Comp; + continue; + } + + // Never thunk a strong function to a weak function. + assert(!FnI->getFunc()->isInterposable() || + Fn2I->getFunc()->isInterposable()); + + if (Comp->isExactMatch()) { + // Equiv merge the two functions. Throw away any diff merge + // candidate we might have found so far. + delete Comp; + + LLVM_DEBUG(dbgs() << "- Equiv merge " << FnI->getFunc()->getName() + << " == " << Fn2I->getFunc()->getName() << '\n'); + + PrintMerges("FNEQ", FnI->getFunc(), Fn2I->getFunc()); + + Function *DeleteF = Fn2I->getFunc(); + Registry.remove(DeleteF, /*reanalyze=*/false); + + mergeTwoFunctions(FnI->getFunc(), DeleteF); + + Changed = true; + + // mergeTwoFunctions may have removed functions from this bucket and + // invalidated the iterators. Rescan the whole bucket, continuing + // from the current function (previous ones will have been + // markCompared()) + for (SmallVector::iterator + I = DiffMergeCandidates.begin(), E = DiffMergeCandidates.end(); + I != E; ++I) + delete *I; + + return false; + } else { + DiffMergeCandidates.push_back(Comp); + } + } + + if (!DiffMergeCandidates.empty()) { + // Add to our list of candidates for diff merging + for (SmallVector::iterator + I = DiffMergeCandidates.begin(), E = DiffMergeCandidates.end(); + I != E; ++I) { + Registry.insertCandidate(*I); + } + } + + FnI->markCompared(); + } + + return true; +} + +bool MergeSimilarFunctions::doExhaustiveCompareMerge() { + bool Changed = false; + + // Process buckets with strong functions first. + for (MergeRegistry::FnCompareMap::iterator + BucketsI = Registry.FunctionsToCompare.begin(), + BucketsE = Registry.FunctionsToCompare.end(); + BucketsI != BucketsE; ++BucketsI) { + std::list &Fns = BucketsI->second; + if (Fns.size() < 2 || Fns.front().getFunc()->isInterposable()) + continue; + + LLVM_DEBUG(dbgs() << "Processing strong bucket " << BucketsI->first << " with " + << Fns.size() << " functions\n"); + // Repeatedly scan this bucket, until we find no more functions to equiv + // merge. + while (!mergeBucket(Fns, Changed) && Fns.size() > 1) { + LLVM_DEBUG(dbgs() << "Rescanning bucket.\n"); + } + } + + // Process buckets with weak functions. 
+ for (MergeRegistry::FnCompareMap::iterator + BucketsI = Registry.FunctionsToCompare.begin(), + BucketsE = Registry.FunctionsToCompare.end(); + BucketsI != BucketsE; ++BucketsI) { + std::list &Fns = BucketsI->second; + if (Fns.size() < 2 || !Fns.front().getFunc()->isInterposable()) + continue; + + LLVM_DEBUG(dbgs() << "Processing weak bucket " << BucketsI->first << " with " + << Fns.size() << " functions\n"); + // Repeatedly scan this bucket, until we find no more functions to equiv + // merge. + while (!mergeBucket(Fns, Changed) && Fns.size() > 1) { + LLVM_DEBUG(dbgs() << "Rescanning bucket.\n"); + } + } + + return Changed; +} + +static bool orderComparatorsByMetric(FunctionComparator *Cmp1, + FunctionComparator *Cmp2) { + return (Cmp1->getSimilarityMetric() > Cmp2->getSimilarityMetric()); +} + +bool MergeSimilarFunctions::doDiffMerge() { + if (Registry.FunctionsToMerge.empty()) + return false; + + bool Changed = false; + DenseSet MergedFns; // Functions that have already been merged + Registry.FunctionsToMerge.sort(orderComparatorsByMetric); + + for (std::list::iterator + I = Registry.FunctionsToMerge.begin(), + E = Registry.FunctionsToMerge.end(); + I != E; ++I) { + FunctionComparator *Comp = *I; + Function *F1 = Comp->getF1(); + // Ignore it if we've already merged this fn + if (MergedFns.count(F1) || MergedFns.count(Comp->getF2())) + continue; + + assert(Registry.SimilarFunctions.count(F1) && + "Comparator doesn't exist in SimilarFunctions map"); + + // Look at all functions F that F1 could be merged with. Merge with each F, + // unless there is another function F' that is more similar to F than F1. + MergeRegistry::FnComparatorSet &SimilarFns = Registry.SimilarFunctions[F1]; + SmallVector CurrentMerge; + + for (MergeRegistry::FnComparatorSet::iterator + CandidateI = SimilarFns.begin(), CandidateE = SimilarFns.end(); + CandidateI != CandidateE; ++CandidateI) { + FunctionComparator *Comp2 = *CandidateI; + assert(Comp2->getF1() == F1 && "Inconsistency in SimilarFunctions"); + Function *F2 = Comp2->getF2(); + + // Ignore it if we've already merged this fn + if (MergedFns.count(F2)) + continue; + + // Check whether there is a better merge candidate for F2 + if (Registry.getMaxSimilarity(F2, MergedFns) > + Comp2->getSimilarityMetric()) + continue; + + // Ok, we actually want to merge with F2 + CurrentMerge.push_back(Comp2); + MergedFns.insert(F2); + } + + if (CurrentMerge.empty()) + continue; + + MergedFns.insert(F1); + + NumMultiMerged += CurrentMerge.size(); + + LLVM_DEBUG(dbgs() << "- Multi merge of " << F1->getName() << " with " + << CurrentMerge.size() << " functions.\n"); + + Changed = true; + outlineAndMergeFunctions(CurrentMerge); + } + + return Changed; +} diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -153,6 +153,10 @@ EnableMatrix("enable-matrix", cl::init(false), cl::Hidden, cl::desc("Enable lowering of the matrix intrinsics")); +static cl::opt EnableMergeSimilarFunctions( + "enable-merge-sim-functions", cl::init(false), cl::Hidden, + cl::desc("Enable the Function merging pass (default = on)")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -609,6 +613,11 @@ MPM.add(createOpenMPOptLegacyPass()); MPM.add(createPostOrderFunctionAttrsLegacyPass()); + if (EnableMergeSimilarFunctions) { + auto *Summary = (ImportSummary ? 
ImportSummary : ExportSummary); + MPM.add(createMergeSimilarFunctionsPass(Summary)); + } + if (OptLevel > 2) MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args @@ -820,6 +829,9 @@ if (MergeFunctions) MPM.add(createMergeFunctionsPass()); + if (EnableMergeSimilarFunctions) + MPM.add(createMergeSimilarFunctionsPass()); + // LoopSink pass sinks instructions hoisted by LICM, which serves as a // canonicalization pass that enables other optimizations. As a result, // LoopSink pass needs to be a very late IR pass to avoid undoing LICM @@ -1047,6 +1059,8 @@ // currently it damages debug info. if (MergeFunctions) PM.add(createMergeFunctionsPass()); + if (EnableMergeSimilarFunctions) + PM.add(createMergeSimilarFunctionsPass()); } void PassManagerBuilder::populateThinLTOPassManager( diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4566,6 +4566,10 @@ if (I) return eraseInstFromFunction(*I); } + if (!Call.use_empty() && !Call.isMustTailCall()) + if (Value *ReturnedArg = Call.getReturnedArgOperand()) + return replaceInstUsesWith(Call, ReturnedArg); + if (isAllocLikeFn(&Call, &TLI)) return visitAllocSite(Call); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2760,6 +2760,12 @@ return nullptr; } +static bool isMustTailCall(Value *V) { + if (auto *CI = dyn_cast(V)) + return CI->isMustTailCall(); + return false; +} + Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) { if (RI.getNumOperands() == 0) // ret void return nullptr; @@ -2769,6 +2775,10 @@ if (!VTy->isIntegerTy() || isa(ResultOp)) return nullptr; + // Don't replace result of musttail calls. + if (isMustTailCall(ResultOp)) + return nullptr; + // There might be assume intrinsics dominating this return that completely // determine the value. If so, constant fold it. KnownBits Known = computeKnownBits(ResultOp, 0, &RI); @@ -3484,7 +3494,8 @@ // In general, it is possible for computeKnownBits to determine all bits in // a value even when the operands are not all constants. 
Type *Ty = I->getType(); - if (ExpensiveCombines && !I->use_empty() && Ty->isIntOrIntVectorTy()) { + if (ExpensiveCombines && !I->use_empty() && Ty->isIntOrIntVectorTy() && + !isMustTailCall(I)) { KnownBits Known = computeKnownBits(I, /*Depth*/0, I); if (Known.isConstant()) { Constant *C = ConstantInt::get(Ty, Known.getConstant()); diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -42,7 +43,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/InstIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -1487,6 +1487,9 @@ SmallPtrSet InvisibleToCaller; // Keep track of blocks with throwing instructions not modeled in MemorySSA. SmallPtrSet ThrowingBlocks; + // Post-order numbers for each basic block. Used to figure out if memory + // accesses are executed before another access. + DenseMap PostOrderNumbers; /// Keep track of instructions (partly) overlapping with killing MemoryDefs per /// basic block. @@ -1502,23 +1505,28 @@ DSEState State(F, AA, MSSA, DT, PDT, TLI); // Collect blocks with throwing instructions not modeled in MemorySSA and // alloc-like objects. - for (Instruction &I : instructions(F)) { - if (I.mayThrow() && !MSSA.getMemoryAccess(&I)) - State.ThrowingBlocks.insert(I.getParent()); - - auto *MD = dyn_cast_or_null(MSSA.getMemoryAccess(&I)); - if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit && - hasAnalyzableMemoryWrite(&I, TLI) && isRemovable(&I)) - State.MemDefs.push_back(MD); - - // Track alloca and alloca-like objects. Here we care about objects not - // visible to the caller during function execution. Alloca objects are - // invalid in the caller, for alloca-like objects we ensure that they are - // not captured throughout the function. - if (isa(&I) || - (isAllocLikeFn(&I, &TLI) && !PointerMayBeCaptured(&I, false, true))) - State.InvisibleToCaller.insert(&I); + unsigned PO = 0; + for (BasicBlock *BB : post_order(&F)) { + State.PostOrderNumbers[BB] = PO++; + for (Instruction &I : *BB) { + if (I.mayThrow() && !MSSA.getMemoryAccess(&I)) + State.ThrowingBlocks.insert(I.getParent()); + + auto *MD = dyn_cast_or_null(MSSA.getMemoryAccess(&I)); + if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit && + hasAnalyzableMemoryWrite(&I, TLI) && isRemovable(&I)) + State.MemDefs.push_back(MD); + + // Track alloca and alloca-like objects. Here we care about objects not + // visible to the caller during function execution. Alloca objects are + // invalid in the caller, for alloca-like objects we ensure that they + // are not captured throughout the function. + if (isa(&I) || + (isAllocLikeFn(&I, &TLI) && !PointerMayBeCaptured(&I, false, true))) + State.InvisibleToCaller.insert(&I); + } } + // Treat byval or inalloca arguments the same as Allocas, stores to them are // dead at the end of the function. for (Argument &AI : F.args()) @@ -1593,16 +1601,13 @@ // Find a MemoryDef writing to \p DefLoc and dominating \p Current, with no // read access in between or return None otherwise. 
The returned value may not // (completely) overwrite \p DefLoc. Currently we bail out when we encounter - // any of the following - // * An aliasing MemoryUse (read). - // * A MemoryPHI. + // an aliasing MemoryUse (read). Optional getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *Current, MemoryLocation DefLoc, bool DefVisibleToCaller, int &ScanLimit) const { - MemoryDef *DomDef; - MemoryAccess *StartDef = Current; + MemoryAccess *DomAccess; bool StepAgain; LLVM_DEBUG(dbgs() << " trying to get dominating access for " << *Current << "\n"); @@ -1613,37 +1618,44 @@ if (MSSA.isLiveOnEntryDef(Current)) return None; - MemoryUseOrDef *CurrentUD = dyn_cast(Current); - if (!CurrentUD) - return None; - + if (isa(Current)) { + DomAccess = Current; + break; + } + MemoryUseOrDef *CurrentUD = cast(Current); // Look for access that clobber DefLoc. - MemoryAccess *DomAccess = - MSSA.getSkipSelfWalker()->getClobberingMemoryAccess( - CurrentUD->getDefiningAccess(), DefLoc); - DomDef = dyn_cast(DomAccess); - if (!DomDef || MSSA.isLiveOnEntryDef(DomDef)) + DomAccess = MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(CurrentUD, + DefLoc); + if (MSSA.isLiveOnEntryDef(DomAccess)) return None; + if (isa(DomAccess)) + break; + // Check if we can skip DomDef for DSE. We also require the KillingDef // execute whenever DomDef executes and use post-dominance to ensure that. - if (canSkipDef(DomDef, DefVisibleToCaller) || + + MemoryDef *DomDef = dyn_cast(DomAccess); + if ((DomDef && canSkipDef(DomDef, DefVisibleToCaller)) || !PDT.dominates(KillingDef->getBlock(), DomDef->getBlock())) { StepAgain = true; - Current = DomDef; + Current = DomDef->getDefiningAccess(); } } while (StepAgain); - LLVM_DEBUG(dbgs() << " Checking for reads of " << *DomDef << " (" - << *DomDef->getMemoryInst() << ")\n"); + LLVM_DEBUG({ + dbgs() << " Checking for reads of " << *DomAccess; + if (isa(DomAccess)) + dbgs() << " (" << *cast(DomAccess)->getMemoryInst() << ")\n"; + }); SmallSetVector WorkList; auto PushMemUses = [&WorkList](MemoryAccess *Acc) { for (Use &U : Acc->uses()) WorkList.insert(cast(U.getUser())); }; - PushMemUses(DomDef); + PushMemUses(DomAccess); // Check if DomDef may be read. for (unsigned I = 0; I < WorkList.size(); I++) { @@ -1655,10 +1667,9 @@ return None; } - // Bail out on MemoryPhis for now. if (isa(UseAccess)) { - LLVM_DEBUG(dbgs() << " ... hit MemoryPhi\n"); - return None; + PushMemUses(UseAccess); + continue; } Instruction *UseInst = cast(UseAccess)->getMemoryInst(); @@ -1676,7 +1687,11 @@ return None; } - if (StartDef == UseAccess) + // For the KillingDef we only have to check if it reads the memory + // location. + // TODO: It would probably be better to check for self-reads before + // calling the function. + if (KillingDef == UseAccess) continue; // Check all uses for MemoryDefs, except for defs completely overwriting @@ -1695,8 +1710,8 @@ } } - // No aliasing MemoryUses of DomDef found, DomDef is potentially dead. - return {DomDef}; + // No aliasing MemoryUses of DomAccess found, DomAccess is potentially dead. 
+ return {DomAccess}; } // Delete dead memory defs @@ -1788,10 +1803,10 @@ DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI); // For each store: for (unsigned I = 0; I < State.MemDefs.size(); I++) { - MemoryDef *Current = State.MemDefs[I]; - if (State.SkipStores.count(Current)) + MemoryDef *KillingDef = State.MemDefs[I]; + if (State.SkipStores.count(KillingDef)) continue; - Instruction *SI = cast(Current)->getMemoryInst(); + Instruction *SI = KillingDef->getMemoryInst(); auto MaybeSILoc = State.getLocForWriteEx(SI); if (!MaybeSILoc) { LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for " @@ -1808,22 +1823,54 @@ !PointerMayBeCapturedBefore(DefObj, false, true, SI, &DT)))) DefVisibleToCaller = false; - LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " << *SI - << "\n"); + MemoryAccess *Current = KillingDef; + LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " + << *KillingDef << " (" << *SI << ")\n"); int ScanLimit = MemorySSAScanLimit; - MemoryDef *StartDef = Current; - // Walk MemorySSA upward to find MemoryDefs that might be killed by SI. - while (Optional Next = State.getDomMemoryDef( - StartDef, Current, SILoc, DefVisibleToCaller, ScanLimit)) { + // Worklist of MemoryAccesses that may be killed by KillingDef. + SetVector ToCheck; + ToCheck.insert(KillingDef->getDefiningAccess()); + + // Check if MemoryAccesses in the worklist are killed by KillingDef. + for (unsigned I = 0; I < ToCheck.size(); I++) { + Current = ToCheck[I]; + if (State.SkipStores.count(Current)) + continue; + + Optional Next = State.getDomMemoryDef( + KillingDef, Current, SILoc, DefVisibleToCaller, ScanLimit); + + if (!Next) { + LLVM_DEBUG(dbgs() << " finished walk\n"); + continue; + } + MemoryAccess *DomAccess = *Next; LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DomAccess << "\n"); + if (isa(DomAccess)) { + for (Value *V : cast(DomAccess)->incoming_values()) { + MemoryAccess *IncomingAccess = cast(V); + BasicBlock *IncomingBlock = IncomingAccess->getBlock(); + BasicBlock *PhiBlock = DomAccess->getBlock(); + + // We only consider incoming MemoryAccesses that come before the + // MemoryPhi. Otherwise we could discover candidates that do not + // strictly dominate our starting def. 
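+            // Note: blocks are numbered during a post-order walk, so a block
+            // that precedes the phi block in reverse post-order received a
+            // larger number; hence the '>' comparison below.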
+ if (State.PostOrderNumbers[IncomingBlock] > + State.PostOrderNumbers[PhiBlock]) + ToCheck.insert(IncomingAccess); + } + continue; + } MemoryDef *NextDef = dyn_cast(DomAccess); Instruction *NI = NextDef->getMemoryInst(); LLVM_DEBUG(dbgs() << " def " << *NI << "\n"); - if (!hasAnalyzableMemoryWrite(NI, TLI)) - break; + if (!hasAnalyzableMemoryWrite(NI, TLI)) { + LLVM_DEBUG(dbgs() << " skip, cannot analyze def\n"); + continue; + } if (!isRemovable(NI)) { LLVM_DEBUG(dbgs() << " skip, cannot remove def\n"); @@ -1834,14 +1881,14 @@ // Check for anything that looks like it will be a barrier to further // removal if (State.isDSEBarrier(SI, SILoc, SILocUnd, NI, NILoc)) { - LLVM_DEBUG(dbgs() << " stop, barrier\n"); - break; + LLVM_DEBUG(dbgs() << " skip, barrier\n"); + continue; } // Before we try to remove anything, check for any extra throwing // instructions that block us from DSEing if (State.mayThrowBetween(SI, NI, SILocUnd)) { - LLVM_DEBUG(dbgs() << " stop, may throw!\n"); + LLVM_DEBUG(dbgs() << " skip, may throw!\n"); break; } @@ -1857,14 +1904,14 @@ OverwriteResult OR = isOverwrite(SILoc, NILoc, DL, TLI, DepWriteOffset, InstWriteOffset, NI, IOL, AA, &F); + ToCheck.insert(NextDef->getDefiningAccess()); if (OR == OW_Complete) { LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI << "\n KILLER: " << *SI << '\n'); State.deleteDeadInstruction(NI); ++NumFastStores; MadeChange = true; - } else - Current = NextDef; + } } } diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -165,66 +165,85 @@ } }; - /// Wrapper class representing a matrix as a set of column vectors. - /// All column vectors must have the same vector type. - class ColumnMatrixTy { - SmallVector Columns; + /// Wrapper class representing a matrix as a set of vectors, either in row or + /// column major layout. All vectors must have the same vector type. 
+ class MatrixTy { + SmallVector Vectors; OpInfoTy OpInfo; + bool IsColumnMajor = true; + public: - ColumnMatrixTy() : Columns() {} - ColumnMatrixTy(ArrayRef Cols) - : Columns(Cols.begin(), Cols.end()) {} + MatrixTy() : Vectors() {} + MatrixTy(ArrayRef Vectors) + : Vectors(Vectors.begin(), Vectors.end()) {} + + Value *getVector(unsigned i) const { return Vectors[i]; } + Value *getColumn(unsigned i) const { + assert(isColumnMajor() && "only supported for column-major matrixes"); + return Vectors[i]; + } - Value *getColumn(unsigned i) const { return Columns[i]; } + void setColumn(unsigned i, Value *V) { Vectors[i] = V; } - void setColumn(unsigned i, Value *V) { Columns[i] = V; } + Type *getElementType() { return getVectorTy()->getElementType(); } - Type *getElementType() { - return cast(Columns[0]->getType())->getElementType(); + unsigned getNumColumns() const { + if (isColumnMajor()) + return Vectors.size(); + else { + assert(Vectors.size() > 0 && "Cannot call getNumRows without columns"); + return cast(Vectors[0]->getType())->getNumElements(); + } } - - unsigned getNumColumns() const { return Columns.size(); } unsigned getNumRows() const { - assert(Columns.size() > 0 && "Cannot call getNumRows without columns"); - return cast(Columns[0]->getType())->getNumElements(); + if (isColumnMajor()) { + assert(Vectors.size() > 0 && "Cannot call getNumRows without columns"); + return cast(Vectors[0]->getType())->getNumElements(); + } else + return Vectors.size(); } - const SmallVectorImpl &getColumnVectors() const { return Columns; } + const SmallVectorImpl &getColumnVectors() const { return Vectors; } - SmallVectorImpl &getColumnVectors() { return Columns; } + SmallVectorImpl &getColumnVectors() { return Vectors; } - void addColumn(Value *V) { Columns.push_back(V); } + void addColumn(Value *V) { Vectors.push_back(V); } VectorType *getColumnTy() { - return cast(Columns[0]->getType()); + assert(isColumnMajor() && "only supported for column-major matrixes"); + return getVectorTy(); + } + + VectorType *getVectorTy() { + return cast(Vectors[0]->getType()); } iterator_range::iterator> columns() { - return make_range(Columns.begin(), Columns.end()); + return make_range(Vectors.begin(), Vectors.end()); } /// Embed the columns of the matrix into a flat vector by concatenating /// them. Value *embedInVector(IRBuilder<> &Builder) const { - return Columns.size() == 1 ? Columns[0] - : concatenateVectors(Builder, Columns); + return Vectors.size() == 1 ? Vectors[0] + : concatenateVectors(Builder, Vectors); } - ColumnMatrixTy &addNumLoads(unsigned N) { + MatrixTy &addNumLoads(unsigned N) { OpInfo.NumLoads += N; return *this; } void setNumLoads(unsigned N) { OpInfo.NumLoads = N; } - ColumnMatrixTy &addNumStores(unsigned N) { + MatrixTy &addNumStores(unsigned N) { OpInfo.NumStores += N; return *this; } - ColumnMatrixTy &addNumComputeOps(unsigned N) { + MatrixTy &addNumComputeOps(unsigned N) { OpInfo.NumComputeOps += N; return *this; } @@ -234,6 +253,8 @@ unsigned getNumComputeOps() const { return OpInfo.NumComputeOps; } const OpInfoTy &getOpInfo() const { return OpInfo; } + + bool isColumnMajor() const { return IsColumnMajor; } }; struct ShapeInfo { @@ -274,7 +295,7 @@ SmallVector ToRemove; /// Map from instructions to their produced column matrix. - MapVector Inst2ColumnMatrix; + MapVector Inst2ColumnMatrix; public: LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI, @@ -300,8 +321,8 @@ /// If we lowered \p MatrixVal, just return the cache result column matrix. 
/// Otherwie split the flat vector \p MatrixVal containing a matrix with /// shape \p SI into column vectors. - ColumnMatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI, - IRBuilder<> &Builder) { + MatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI, + IRBuilder<> &Builder) { VectorType *VType = dyn_cast(MatrixVal->getType()); assert(VType && "MatrixVal must be a vector type"); assert(VType->getNumElements() == SI.NumRows * SI.NumColumns && @@ -313,7 +334,7 @@ // vector and split it later. auto Found = Inst2ColumnMatrix.find(MatrixVal); if (Found != Inst2ColumnMatrix.end()) { - ColumnMatrixTy &M = Found->second; + MatrixTy &M = Found->second; // Return the found matrix, if its shape matches the requested shape // information if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns()) @@ -640,11 +661,11 @@ /// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between /// columns. - ColumnMatrixTy loadMatrix(Type *Ty, Value *Ptr, Value *Stride, - ShapeInfo Shape, IRBuilder<> &Builder) { + MatrixTy loadMatrix(Type *Ty, Value *Ptr, Value *Stride, ShapeInfo Shape, + IRBuilder<> &Builder) { auto VType = cast(Ty); Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); - ColumnMatrixTy Result; + MatrixTy Result; // Distance between start of one column and the start of the next for (unsigned C = 0, E = Shape.NumColumns; C < E; ++C) { Value *GEP = @@ -659,9 +680,9 @@ /// Loads a sub-matrix with shape \p ResultShape from a \p R x \p C matrix, /// starting at \p MatrixPtr[I][J]. - ColumnMatrixTy loadMatrix(Value *MatrixPtr, ShapeInfo MatrixShape, unsigned I, - unsigned J, ShapeInfo ResultShape, Type *EltTy, - IRBuilder<> &Builder) { + MatrixTy loadMatrix(Value *MatrixPtr, ShapeInfo MatrixShape, unsigned I, + unsigned J, ShapeInfo ResultShape, Type *EltTy, + IRBuilder<> &Builder) { Value *Offset = Builder.CreateAdd( Builder.CreateMul(Builder.getInt32(J), @@ -703,7 +724,7 @@ /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p /// MatrixPtr[I][J]. - void storeMatrix(const ColumnMatrixTy &StoreVal, Value *MatrixPtr, + void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr, ShapeInfo MatrixShape, unsigned I, unsigned J, Type *EltTy, IRBuilder<> &Builder) { Value *Offset = Builder.CreateAdd( @@ -727,8 +748,8 @@ /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between /// columns. - ColumnMatrixTy storeMatrix(Type *Ty, ColumnMatrixTy StoreVal, Value *Ptr, - Value *Stride, IRBuilder<> &Builder) { + MatrixTy storeMatrix(Type *Ty, MatrixTy StoreVal, Value *Ptr, Value *Stride, + IRBuilder<> &Builder) { auto VType = cast(Ty); Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); for (auto C : enumerate(StoreVal.columns())) { @@ -737,8 +758,8 @@ VType->getElementType(), Builder); createColumnStore(C.value(), GEP, VType->getElementType(), Builder); } - return ColumnMatrixTy().addNumStores(getNumOps(StoreVal.getColumnTy()) * - StoreVal.getNumColumns()); + return MatrixTy().addNumStores(getNumOps(StoreVal.getColumnTy()) * + StoreVal.getNumColumns()); } /// Lower a store instruction with shape information. @@ -764,7 +785,7 @@ /// Extract a column vector of \p NumElts starting at index (\p I, \p J) from /// the matrix \p LM represented as a vector of column vectors. 
- Value *extractVector(const ColumnMatrixTy &LM, unsigned I, unsigned J, + Value *extractVector(const MatrixTy &LM, unsigned I, unsigned J, unsigned NumElts, IRBuilder<> &Builder) { Value *Col = LM.getColumn(J); Value *Undef = UndefValue::get(Col->getType()); @@ -836,7 +857,7 @@ /// cached value when they are lowered. For other users, \p Matrix is /// flattened and the uses are updated to use it. Also marks \p Inst for /// deletion. - void finalizeLowering(Instruction *Inst, ColumnMatrixTy Matrix, + void finalizeLowering(Instruction *Inst, MatrixTy Matrix, IRBuilder<> &Builder) { Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix)); @@ -854,9 +875,8 @@ /// Compute Res += A * B for tile-sized matrices with left-associating /// addition. - void emitChainedMatrixMultiply(ColumnMatrixTy &Result, - const ColumnMatrixTy &A, - const ColumnMatrixTy &B, bool AllowContraction, + void emitChainedMatrixMultiply(MatrixTy &Result, const MatrixTy &A, + const MatrixTy &B, bool AllowContraction, IRBuilder<> &Builder, bool isTiled) { const unsigned VF = std::max( TTI.getRegisterBitWidth(true) / @@ -902,17 +922,15 @@ ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3)); ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4)); - const ColumnMatrixTy &Lhs = - getMatrix(MatMul->getArgOperand(0), LShape, Builder); - const ColumnMatrixTy &Rhs = - getMatrix(MatMul->getArgOperand(1), RShape, Builder); + const MatrixTy &Lhs = getMatrix(MatMul->getArgOperand(0), LShape, Builder); + const MatrixTy &Rhs = getMatrix(MatMul->getArgOperand(1), RShape, Builder); const unsigned R = LShape.NumRows; const unsigned C = RShape.NumColumns; assert(LShape.NumColumns == RShape.NumRows); // Initialize the output - ColumnMatrixTy Result; + MatrixTy Result; for (unsigned J = 0; J < C; ++J) Result.addColumn(UndefValue::get(VectorType::get(EltType, R))); @@ -924,12 +942,12 @@ /// Lowers llvm.matrix.transpose. void LowerTranspose(CallInst *Inst) { - ColumnMatrixTy Result; + MatrixTy Result; IRBuilder<> Builder(Inst); Value *InputVal = Inst->getArgOperand(0); VectorType *VectorTy = cast(InputVal->getType()); ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2)); - ColumnMatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder); + MatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder); for (unsigned Row = 0; Row < ArgShape.NumRows; ++Row) { // Build a single column vector for this row. First initialize it. @@ -989,11 +1007,11 @@ IRBuilder<> Builder(Inst); ShapeInfo &Shape = I->second; - ColumnMatrixTy LoweredLhs = getMatrix(Lhs, Shape, Builder); - ColumnMatrixTy LoweredRhs = getMatrix(Rhs, Shape, Builder); + MatrixTy LoweredLhs = getMatrix(Lhs, Shape, Builder); + MatrixTy LoweredRhs = getMatrix(Rhs, Shape, Builder); // Add each column and store the result back into the opmapping - ColumnMatrixTy Result; + MatrixTy Result; auto BuildColumnOp = [&Builder, Inst](Value *LHS, Value *RHS) { switch (Inst->getOpcode()) { case Instruction::Add: @@ -1035,7 +1053,7 @@ /// Mapping from instructions to column matrixes. It is used to identify /// matrix instructions. - const MapVector &Inst2ColumnMatrix; + const MapVector &Inst2ColumnMatrix; /// Mapping from values to the leaves of all expressions that the value is /// part of. 
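With these hunks the wrapper can hold the matrix either as column vectors or as row vectors and derives its shape from the layout flag instead of assuming columns. A minimal standalone analogue of that layout switch, using a toy Matrix class over std::vector rather than the pass's MatrixTy and IR values:

// Illustrative sketch only: the stored vectors are either the columns
// (column-major) or the rows (row-major) of the matrix, and the shape
// queries flip accordingly, mirroring getNumColumns/getNumRows above.
#include <cassert>
#include <cstdio>
#include <vector>

class Matrix {
  std::vector<std::vector<float>> Vectors;
  bool IsColumnMajor = true;

public:
  Matrix(std::vector<std::vector<float>> Vs, bool ColumnMajor)
      : Vectors(std::move(Vs)), IsColumnMajor(ColumnMajor) {}

  bool isColumnMajor() const { return IsColumnMajor; }

  unsigned getNumColumns() const {
    if (isColumnMajor())
      return Vectors.size();
    assert(!Vectors.empty() && "cannot get the shape of an empty matrix");
    return Vectors[0].size();
  }

  unsigned getNumRows() const {
    if (isColumnMajor()) {
      assert(!Vectors.empty() && "cannot get the shape of an empty matrix");
      return Vectors[0].size();
    }
    return Vectors.size();
  }
};

int main() {
  std::vector<std::vector<float>> Vs = {{1, 2, 3}, {4, 5, 6}};
  Matrix ColMajor(Vs, /*ColumnMajor=*/true);  // interpreted as a 3 x 2 matrix
  Matrix RowMajor(Vs, /*ColumnMajor=*/false); // interpreted as a 2 x 3 matrix
  std::printf("%u x %u vs %u x %u\n", ColMajor.getNumRows(),
              ColMajor.getNumColumns(), RowMajor.getNumRows(),
              RowMajor.getNumColumns());
}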
@@ -1052,7 +1070,7 @@ SmallPtrSet ReusedExprs; ExprLinearizer(const DataLayout &DL, - const MapVector &Inst2ColumnMatrix, + const MapVector &Inst2ColumnMatrix, const DenseMap> &Shared, const SmallSetVector &ExprsInSubprogram, Value *Leaf) @@ -1296,12 +1314,12 @@ /// that multiple leaves can share sub-expressions. Shared subexpressions /// are explicitly marked as shared(). struct RemarkGenerator { - const MapVector &Inst2ColumnMatrix; + const MapVector &Inst2ColumnMatrix; OptimizationRemarkEmitter &ORE; Function &Func; const DataLayout &DL; - RemarkGenerator(const MapVector &Inst2ColumnMatrix, + RemarkGenerator(const MapVector &Inst2ColumnMatrix, OptimizationRemarkEmitter &ORE, Function &Func) : Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), Func(Func), DL(Func.getParent()->getDataLayout()) {} diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -80,10 +80,8 @@ // Clone OldFunc into NewFunc, transforming the old arguments into references to // VMap values. -// void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, - ValueToValueMapTy &VMap, - bool ModuleLevelChanges, + ValueToValueMapTy &VMap, CloneType CT, SmallVectorImpl &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo, ValueMapTypeRemapper *TypeMapper, @@ -101,12 +99,12 @@ NewFunc->copyAttributesFrom(OldFunc); NewFunc->setAttributes(NewAttrs); + RemapFlags RF = + (CT == CloneType::ModuleLevelChanges) ? RF_None : RF_NoModuleLevelChanges; // Fix up the personality function that got copied over. if (OldFunc->hasPersonalityFn()) - NewFunc->setPersonalityFn( - MapValue(OldFunc->getPersonalityFn(), VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer)); + NewFunc->setPersonalityFn(MapValue(OldFunc->getPersonalityFn(), VMap, RF, + TypeMapper, Materializer)); SmallVector NewArgAttrs(NewFunc->arg_size()); AttributeList OldAttrs = OldFunc->getAttributes(); @@ -123,11 +121,11 @@ AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(), OldAttrs.getRetAttributes(), NewArgAttrs)); - bool MustCloneSP = - OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent(); + bool MustCloneSP = CT != CloneType::ExtractingFunctions && + OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent(); DISubprogram *SP = OldFunc->getSubprogram(); if (SP) { - assert(!MustCloneSP || ModuleLevelChanges); + assert(!MustCloneSP || CT == CloneType::ModuleLevelChanges); // Add mappings for some DebugInfo nodes that we don't want duplicated // even if they're distinct. auto &MD = VMap.MD(); @@ -144,10 +142,7 @@ OldFunc->getAllMetadata(MDs); for (auto MD : MDs) { NewFunc->addMetadata( - MD.first, - *MapMetadata(MD.second, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer)); + MD.first, *MapMetadata(MD.second, VMap, RF, TypeMapper, Materializer)); } // When we remap instructions, we want to avoid duplicating inlined @@ -167,7 +162,7 @@ // Create a new basic block and copy instructions into it! BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo, - ModuleLevelChanges ? &DIFinder : nullptr); + CT == CloneType::ModuleLevelChanges ? &DIFinder : nullptr); // Add basic block mapping. VMap[&BB] = CBB; @@ -207,9 +202,7 @@ BB != BE; ++BB) // Loop over all instructions, fixing each one as we find it... 
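The CloneFunctionInto change below swaps the ModuleLevelChanges boolean for a CloneType argument and derives the remap flags once, up front, instead of re-evaluating the boolean at every call site. A hedged sketch of that pattern; only the CloneType enumerators are taken from the patch, while the RemapFlag values and remapFlagsFor are stand-ins for illustration:

// Illustrative sketch only: centralise the flag selection on the enum.
#include <cstdio>

enum RemapFlag { RF_None = 0, RF_NoModuleLevelChanges = 1 }; // stand-ins

enum class CloneType {
  InvalidCloneType,
  ModuleLevelChanges,
  ExtractingFunctions,
};

RemapFlag remapFlagsFor(CloneType CT) {
  return CT == CloneType::ModuleLevelChanges ? RF_None
                                             : RF_NoModuleLevelChanges;
}

int main() {
  for (CloneType CT : {CloneType::InvalidCloneType,
                       CloneType::ModuleLevelChanges,
                       CloneType::ExtractingFunctions})
    std::printf("flags = %d\n", remapFlagsFor(CT));
}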
for (Instruction &II : *BB) - RemapInstruction(&II, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer); + RemapInstruction(&II, VMap, RF, TypeMapper, Materializer); // Register all DICompileUnits of the old parent module in the new parent module auto* OldModule = OldFunc->getParent(); @@ -262,8 +255,9 @@ } SmallVector Returns; // Ignore returns cloned. - CloneFunctionInto(NewF, F, VMap, F->getSubprogram() != nullptr, Returns, "", - CodeInfo); + CloneType CT = F->getSubprogram() != nullptr ? CloneType::ModuleLevelChanges + : CloneType::InvalidCloneType; + CloneFunctionInto(NewF, F, VMap, CT, Returns, "", CodeInfo); return NewF; } diff --git a/llvm/lib/Transforms/Utils/CloneModule.cpp b/llvm/lib/Transforms/Utils/CloneModule.cpp --- a/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -161,7 +161,7 @@ } SmallVector Returns; // Ignore returns cloned. - CloneFunctionInto(F, &I, VMap, /*ModuleLevelChanges=*/true, Returns); + CloneFunctionInto(F, &I, VMap, CloneType::ModuleLevelChanges, Returns); if (I.hasPersonalityFn()) F->setPersonalityFn(MapValue(I.getPersonalityFn(), VMap)); diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -24,15 +24,15 @@ ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-DAG: v_lshr_b32_e64 v0, s32, 6 -; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI-NOT: v_mov ; CI: ds_write_b32 v0, v0 +; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI-NEXT: v_add_i32_e{{32|64}} v0, {{s\[[0-9]+:[0-9]+\]|vcc}}, 4, [[SCALED]] ; CI-NEXT: ds_write_b32 v0, v0 ; GFX9: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-NEXT: ds_write_b32 v0, v0 ; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9-DAG: ds_write_b32 v0, v0 ; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] ; GFX9-NEXT: ds_write_b32 v0, v0 define void @func_mov_fi_i32_offset() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir @@ -115,9 +115,9 @@ ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK: $sgpr28 = S_MOV_B32 8192 - ; CHECK: $vgpr2, dead $sgpr28_sgpr29 = V_ADD_I32_e64 killed $sgpr28, killed $vgpr3, 0, implicit $exec + ; CHECK: $vgpr2, dead $sgpr28_sgpr29 = V_ADD_I32_e64 killed $sgpr28, killed $vgpr2, 0, implicit $exec ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, 
implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 @@ -154,9 +154,9 @@ ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31 - ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK: $vcc_lo = S_MOV_B32 8192 - ; CHECK: $vgpr2, dead $vcc = V_ADD_I32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec + ; CHECK: $vgpr2, dead $vcc = V_ADD_I32_e64 killed $vcc_lo, killed $vgpr2, 0, implicit $exec ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -28,8 +28,8 @@ ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; CHECK: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec + ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, 
implicit $exec + ; CHECK: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr2, implicit $exec ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -0,0 +1,60 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GFX9 %s + +# Test case where spilling a VGPR to an emergency slot is needed during frame index elimination. + +--- +name: pei_scavenge_vgpr_spill +tracksRegLiveness: true + +stack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 8192 } + - { id: 1, type: default, offset: 0, size: 4, alignment: 8192 } + +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + frameOffsetReg: $sgpr33 + stackPtrOffsetReg: $sgpr32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 + + ; GFX8-LABEL: name: pei_scavenge_vgpr_spill + ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 + ; GFX8: $vgpr2 = V_WRITELANE_B32_vi $sgpr33, 0, undef $vgpr2 + ; GFX8: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc + ; GFX8: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; GFX8: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc + ; GFX8: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc + ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5) + ; GFX8: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; GFX8: $vcc_lo = S_MOV_B32 8192 + ; GFX8: $vgpr3, dead $vcc = V_ADD_I32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec + ; GFX8: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec + ; GFX8: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc + ; GFX8: $sgpr33 = V_READLANE_B32_vi $vgpr2, 0 + ; GFX8: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc + ; GFX8: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) + ; GFX8: S_ENDPGM 0, csr_amdgpu_allvgprs + ; GFX9-LABEL: name: pei_scavenge_vgpr_spill + ; GFX9: 
liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 + ; GFX9: $vgpr2 = V_WRITELANE_B32_vi $sgpr33, 0, undef $vgpr2 + ; GFX9: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc + ; GFX9: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; GFX9: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc + ; GFX9: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc + ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5) + ; GFX9: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; GFX9: $vgpr3 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec + ; GFX9: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec + ; GFX9: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc + ; GFX9: $sgpr33 = V_READLANE_B32_vi $vgpr2, 0 + ; GFX9: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc + ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) + ; GFX9: S_ENDPGM 0, csr_amdgpu_allvgprs + $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec + S_ENDPGM 0, csr_amdgpu_allvgprs +... 
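The new pei-scavenge-vgpr-spill.mir test covers the case where frame index elimination needs a scratch VGPR but none is free, so the scavenger spills one to an emergency stack slot around the use and reloads it afterwards (the BUFFER_STORE_DWORD_OFFSET/BUFFER_LOAD_DWORD_OFFSET pair in the checks). A very rough conceptual model of that spill-and-restore fallback, not the PrologEpilogInserter or AMDGPU implementation:

// Illustrative sketch only: "no free register -> spill a victim to the
// emergency slot, use it as scratch, reload the victim afterwards".
#include <cstdio>
#include <vector>

struct ToyScavenger {
  std::vector<int> FreeRegs; // registers known to be unused
  int EmergencySlot = 0;     // reserved stack slot (holds one value)

  int scavenge(int VictimReg, int VictimValue, bool &Spilled) {
    if (!FreeRegs.empty()) {
      Spilled = false;
      int R = FreeRegs.back();
      FreeRegs.pop_back();
      return R;
    }
    Spilled = true;
    EmergencySlot = VictimValue; // models the buffer_store to the slot
    return VictimReg;
  }

  int reload() const { return EmergencySlot; } // models the buffer_load
};

int main() {
  ToyScavenger S; // no free registers: forces the spill path
  bool Spilled = false;
  int Scratch = S.scavenge(/*VictimReg=*/3, /*VictimValue=*/42, Spilled);
  std::printf("scratch v%d, spilled=%d\n", Scratch, Spilled);
  if (Spilled)
    std::printf("restored value %d\n", S.reload());
}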
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -246,7 +246,7 @@ ; GFX908-DAG v_accvgpr_read_b32 ; GCN: NumVgprs: 256 -; GFX900: ScratchSize: 644 +; GFX900: ScratchSize: 708 ; GFX908-FIXME: ScratchSize: 0 ; GCN: VGPRBlocks: 63 ; GCN: NumVGPRsForWavesPerEU: 256 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -24,7 +24,7 @@ ; OFFREG is offset system SGPR ; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill ; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload -; GCN: NumVgprs: 256 +; GCN: NumVgprs: 255 ; GCN: ScratchSize: 1536 define amdgpu_vs void @main([9 x <4 x i32>] addrspace(4)* inreg %arg, [17 x <4 x i32>] addrspace(4)* inreg %arg1, [17 x <4 x i32>] addrspace(4)* inreg %arg2, [34 x <8 x i32>] addrspace(4)* inreg %arg3, [16 x <4 x i32>] addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -0,0 +1,1491 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s + +define arm_aapcs_vfpcc void @test_fadd(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fadd: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vadd.f16 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fadd fast <8 x half> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, 
%entry + ret void +} + +define arm_aapcs_vfpcc void @test_fadd_r(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fadd_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vadd.f16 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB1_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fadd fast <8 x half> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmul(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fmul: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vmul.f16 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB2_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; 
preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmul_r(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fmul_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vmul.f16 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB3_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fsub(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fsub: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vsub.f16 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB4_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fsub fast <8 x half> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + 
+for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fsub_r(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fsub_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vsub.f16 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB5_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fsub fast <8 x half> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define arm_aapcs_vfpcc void @test_fmas(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmas: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfma.f16 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: bne .LBB6_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = getelementptr inbounds half, half* %B, i32 %index + %4 = bitcast half* %3 to <8 x half>* + %wide.load12 = load <8 x half>, <8 x half>* %4, align 4 + %5 
= fmul fast <8 x half> %wide.load12, %wide.load + %6 = fadd fast <8 x half> %5, %broadcast.splat14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmas_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmas_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB7_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfma.f16 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: bne .LBB7_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = getelementptr inbounds half, half* %B, i32 %index + %4 = bitcast half* %3 to <8 x half>* + %wide.load12 = load <8 x half>, <8 x half>* %4, align 4 + %5 = fmul fast <8 x half> %wide.load12, %wide.load + %6 = fadd fast <8 x half> %broadcast.splat14, %5 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fma(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fma: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfma.f16 q2, q1, q0 +; CHECK-NEXT: vstrb.8 q2, [r3], #16 +; CHECK-NEXT: bne .LBB8_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label 
%for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %wide.load, %broadcast.splat13 + %4 = getelementptr inbounds half, half* %B, i32 %index + %5 = bitcast half* %4 to <8 x half>* + %wide.load14 = load <8 x half>, <8 x half>* %5, align 4 + %6 = fadd fast <8 x half> %3, %wide.load14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fma_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fma_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB9_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfma.f16 q2, q0, q1 +; CHECK-NEXT: vstrb.8 q2, [r3], #16 +; CHECK-NEXT: bne .LBB9_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %broadcast.splat13, %wide.load + %4 = getelementptr inbounds half, half* %B, i32 %index + %5 = bitcast half* %4 to <8 x half>* + %wide.load14 = load <8 x half>, <8 x half>* %5, align 4 + %6 = fadd fast <8 x half> %3, %wide.load14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define arm_aapcs_vfpcc void @test_fmss(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmss: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: 
bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: vneg.f16 q0, q0 +; CHECK-NEXT: .LBB10_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfma.f16 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: bne .LBB10_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = getelementptr inbounds half, half* %B, i32 %index + %4 = bitcast half* %3 to <8 x half>* + %wide.load12 = load <8 x half>, <8 x half>* %4, align 4 + %5 = fmul fast <8 x half> %wide.load12, %wide.load + %6 = fsub fast <8 x half> %5, %broadcast.splat14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmss_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmss_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB11_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfms.f16 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: bne .LBB11_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = getelementptr inbounds half, half* %B, i32 %index + %4 = bitcast half* %3 to <8 x half>* + %wide.load12 = load <8 x half>, <8 x half>* %4, align 4 + %5 
= fmul fast <8 x half> %wide.load12, %wide.load + %6 = fsub fast <8 x half> %broadcast.splat14, %5 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fms(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fms: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB12_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vneg.f16 q1, q1 +; CHECK-NEXT: vfma.f16 q1, q2, q0 +; CHECK-NEXT: vstrb.8 q1, [r3], #16 +; CHECK-NEXT: bne .LBB12_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %wide.load, %broadcast.splat13 + %4 = getelementptr inbounds half, half* %B, i32 %index + %5 = bitcast half* %4 to <8 x half>* + %wide.load14 = load <8 x half>, <8 x half>* %5, align 4 + %6 = fsub fast <8 x half> %3, %wide.load14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fms_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fms_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB13_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vneg.f16 q1, q1 +; CHECK-NEXT: vfma.f16 q1, q0, q2 +; CHECK-NEXT: vstrb.8 q1, [r3], #16 +; CHECK-NEXT: bne .LBB13_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, 
label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %broadcast.splat13, %wide.load + %4 = getelementptr inbounds half, half* %B, i32 %index + %5 = bitcast half* %4 to <8 x half>* + %wide.load14 = load <8 x half>, <8 x half>* %5, align 4 + %6 = fsub fast <8 x half> %3, %wide.load14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define dso_local void @test_nested(half* noalias nocapture %pInT1, half* noalias nocapture readonly %pOutT1, half* noalias nocapture readonly %pPRT_in, half* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l, half *%ina) local_unnamed_addr #0 { +; CHECK-LABEL: test_nested: +; CHECK: @ %bb.0: @ %for.body.us.preheader +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: ldrd lr, r12, [sp, #20] +; CHECK-NEXT: lsl.w r3, r12, #1 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB14_1: @ %for.body.us +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB14_2 Depth 2 +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: mov r5, r12 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vdup.16 q0, r4 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: .LBB14_2: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: adds r6, r0, r4 +; CHECK-NEXT: adds r7, r2, r4 +; CHECK-NEXT: vldrw.u32 q1, [r7] +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: adds r4, #16 +; CHECK-NEXT: subs r5, #8 +; CHECK-NEXT: vfms.f16 q2, q1, q0 +; CHECK-NEXT: vstrw.32 q2, [r6] +; CHECK-NEXT: bne .LBB14_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us +; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1 +; CHECK-NEXT: add r0, r3 +; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: adds r1, #2 +; CHECK-NEXT: le lr, .LBB14_1 +; CHECK-NEXT: @ %bb.4: @ %for.end14 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +for.body.us.preheader: + %in = load half, half* %ina + %cmp = icmp sgt i32 %numRows, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp1 = icmp sgt i32 %numCols, 0 + tail call void @llvm.assume(i1 %cmp1) + %rem = and i32 %numCols, 7 + %cmp2 = icmp eq i32 %rem, 0 + tail call void @llvm.assume(i1 %cmp2) + %cmp3 = icmp slt i32 %l, %numCols + tail call void @llvm.assume(i1 %cmp3) + br label %for.body.us + +for.body.us: ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader + %pInT1.addr.038.us = phi half* [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ] + %i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ] + %pOutT1.addr.036.us = phi half* [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ] + %pPRT_in.addr.035.us = phi half* [ %scevgep, 
%for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ] + %scevgep = getelementptr half, half* %pPRT_in.addr.035.us, i32 %numCols + %0 = load half, half* %pOutT1.addr.036.us, align 4 + %broadcast.splatinsert47 = insertelement <8 x half> undef, half %0, i32 0 + %broadcast.splat48 = shufflevector <8 x half> %broadcast.splatinsert47, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %for.body.us + %index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ] + %next.gep = getelementptr half, half* %pInT1.addr.038.us, i32 %index + %next.gep45 = getelementptr half, half* %pPRT_in.addr.035.us, i32 %index + %1 = bitcast half* %next.gep to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %1, align 4 + %2 = bitcast half* %next.gep45 to <8 x half>* + %wide.load46 = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %wide.load46, %broadcast.splat48 + %4 = fsub fast <8 x half> %wide.load, %3 + store <8 x half> %4, <8 x half>* %1, align 4 + %index.next = add i32 %index, 8 + %5 = icmp eq i32 %index.next, %numCols + br i1 %5, label %for.cond6.for.end_crit_edge.us, label %vector.body + +for.cond6.for.end_crit_edge.us: ; preds = %vector.body + %incdec.ptr.us = getelementptr inbounds half, half* %pOutT1.addr.036.us, i32 1 + %scevgep40 = getelementptr half, half* %pInT1.addr.038.us, i32 %numCols + %inc13.us = add nuw nsw i32 %i.037.us, 1 + %exitcond41 = icmp eq i32 %inc13.us, %numRows + br i1 %exitcond41, label %for.end14, label %for.body.us + +for.end14: ; preds = %for.cond6.for.end_crit_edge.us + ret void +} + +%struct.arm_fir_instance_f32 = type { i16, half*, half* } +define void @arm_fir_f32_1_4_mve(%struct.arm_fir_instance_f32* nocapture readonly %S, half* nocapture readonly %pSrc, half* %pDst, i32 %blockSize) { +; CHECK-LABEL: arm_fir_f32_1_4_mve: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: ldrh.w r9, [r0] +; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: sub.w r7, r9, #1 +; CHECK-NEXT: cmp r7, #3 +; CHECK-NEXT: bhi .LBB15_6 +; CHECK-NEXT: @ %bb.1: @ %if.then +; CHECK-NEXT: ldr r6, [r0, #8] +; CHECK-NEXT: vldr.16 s0, [r6] +; CHECK-NEXT: vmov lr, s0 +; CHECK-NEXT: vldr.16 s0, [r6, #2] +; CHECK-NEXT: vdup.16 q3, lr +; CHECK-NEXT: lsr.w lr, r3, #2 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vldr.16 s0, [r6, #4] +; CHECK-NEXT: vdup.16 q2, r5 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vldr.16 s0, [r6, #6] +; CHECK-NEXT: vdup.16 q1, r4 +; CHECK-NEXT: add.w r4, r12, r7, lsl #1 +; CHECK-NEXT: vmov r6, s0 +; CHECK-NEXT: vdup.16 q0, r6 +; CHECK-NEXT: wls lr, lr, .LBB15_5 +; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph +; CHECK-NEXT: bic r10, r3, #3 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: add.w r8, r2, r10, lsl #1 +; CHECK-NEXT: .LBB15_3: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r5, r1, r6 +; CHECK-NEXT: vldrw.u32 q4, [r5] +; CHECK-NEXT: adds r5, r4, r6 +; CHECK-NEXT: vstrw.32 q4, [r5] +; CHECK-NEXT: add.w r5, r12, r6 +; CHECK-NEXT: vldrw.u32 q4, [r5] +; CHECK-NEXT: adds r7, r5, #2 +; CHECK-NEXT: vldrw.u32 q5, [r7] +; CHECK-NEXT: vmul.f16 q4, q4, q3 +; CHECK-NEXT: vfma.f16 q4, q5, q2 +; CHECK-NEXT: vldrw.u32 q5, [r5, #4] +; CHECK-NEXT: adds r5, #6 +; CHECK-NEXT: vfma.f16 q4, q5, q1 +; CHECK-NEXT: vldrw.u32 q5, [r5] +; CHECK-NEXT: adds r5, r2, r6 +; CHECK-NEXT: adds r6, #8 +; 
CHECK-NEXT: vfma.f16 q4, q5, q0 +; CHECK-NEXT: vstrw.32 q4, [r5] +; CHECK-NEXT: le lr, .LBB15_3 +; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit +; CHECK-NEXT: add r4, r6 +; CHECK-NEXT: add.w r12, r12, r10, lsl #1 +; CHECK-NEXT: add.w r1, r1, r10, lsl #1 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: .LBB15_5: @ %while.end +; CHECK-NEXT: and r7, r3, #3 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vctp.16 r7 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q4, [r4] +; CHECK-NEXT: vldrw.u32 q4, [r12] +; CHECK-NEXT: add.w r1, r12, #2 +; CHECK-NEXT: vmul.f16 q3, q4, q3 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: add.w r1, r12, #6 +; CHECK-NEXT: vfma.f16 q3, q4, q2 +; CHECK-NEXT: vldrw.u32 q2, [r12, #4] +; CHECK-NEXT: vfma.f16 q3, q2, q1 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vfma.f16 q3, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q3, [r2] +; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: .LBB15_6: @ %if.end +; CHECK-NEXT: add.w r0, r12, r3, lsl #1 +; CHECK-NEXT: lsr.w lr, r9, #2 +; CHECK-NEXT: wls lr, lr, .LBB15_10 +; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader +; CHECK-NEXT: bic r2, r9, #3 +; CHECK-NEXT: adds r1, r2, r3 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: add.w r1, r12, r1, lsl #1 +; CHECK-NEXT: .LBB15_8: @ %while.body51 +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0], #8 +; CHECK-NEXT: vstrb.8 q0, [r3], #8 +; CHECK-NEXT: le lr, .LBB15_8 +; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit +; CHECK-NEXT: add.w r12, r12, r2, lsl #1 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: .LBB15_10: @ %while.end55 +; CHECK-NEXT: ands r1, r9, #3 +; CHECK-NEXT: beq .LBB15_12 +; CHECK-NEXT: @ %bb.11: @ %if.then59 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vctp.16 r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q0, [r12] +; CHECK-NEXT: .LBB15_12: @ %if.end61 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +entry: + %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 + %0 = load half*, half** %pState1, align 4 + %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2 + %1 = load half*, half** %pCoeffs2, align 4 + %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0 + %2 = load i16, i16* %numTaps3, align 4 + %conv = zext i16 %2 to i32 + %sub = add nsw i32 %conv, -1 + %cmp = icmp ult i32 %sub, 4 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds half, half* %0, i32 %sub + %incdec.ptr = getelementptr inbounds half, half* %1, i32 1 + %3 = load half, half* %1, align 4 + %incdec.ptr6 = getelementptr inbounds half, half* %1, i32 2 + %4 = load half, half* %incdec.ptr, align 4 + %incdec.ptr7 = getelementptr inbounds half, half* %1, i32 3 + %5 = load half, half* %incdec.ptr6, align 4 + %6 = load half, half* %incdec.ptr7, align 4 + %shr = lshr i32 %blockSize, 2 + %cmp9146 = icmp eq i32 %shr, 0 + %.pre161 = insertelement <8 x half> undef, half %3, i32 0 + %.pre162 = shufflevector <8 x half> %.pre161, <8 x half> undef, <8 x i32> zeroinitializer + %.pre163 = insertelement <8 x half> undef, half %4, i32 0 + %.pre164 = shufflevector <8 x half> %.pre163, <8 x half> undef, <8 x i32> zeroinitializer + %.pre165 = insertelement <8 x half> undef, half %5, i32 0 + %.pre166 = shufflevector <8 x half> %.pre165, <8 x half> undef, <8 x i32> zeroinitializer + %.pre167 = insertelement <8 x half> undef, half 
%6, i32 0 + %.pre168 = shufflevector <8 x half> %.pre167, <8 x half> undef, <8 x i32> zeroinitializer + br i1 %cmp9146, label %while.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %if.then + %7 = and i32 %blockSize, -4 + %scevgep158 = getelementptr half, half* %pDst, i32 %7 + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %pStateCur.0151 = phi half* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %pSamples.0150 = phi half* [ %0, %while.body.lr.ph ], [ %add.ptr24, %while.body ] + %pOutput.0149 = phi half* [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ] + %pTempSrc.0148 = phi half* [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ] + %blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ] + %8 = bitcast half* %pTempSrc.0148 to <8 x half>* + %9 = load <8 x half>, <8 x half>* %8, align 4 + %10 = bitcast half* %pStateCur.0151 to <8 x half>* + store <8 x half> %9, <8 x half>* %10, align 4 + %add.ptr = getelementptr inbounds half, half* %pStateCur.0151, i32 4 + %add.ptr11 = getelementptr inbounds half, half* %pTempSrc.0148, i32 4 + %11 = bitcast half* %pSamples.0150 to <8 x half>* + %12 = load <8 x half>, <8 x half>* %11, align 4 + %13 = fmul fast <8 x half> %12, %.pre162 + %arrayidx12 = getelementptr inbounds half, half* %pSamples.0150, i32 1 + %14 = bitcast half* %arrayidx12 to <8 x half>* + %15 = load <8 x half>, <8 x half>* %14, align 4 + %mul = fmul fast <8 x half> %15, %.pre164 + %add = fadd fast <8 x half> %mul, %13 + %arrayidx13 = getelementptr inbounds half, half* %pSamples.0150, i32 2 + %16 = bitcast half* %arrayidx13 to <8 x half>* + %17 = load <8 x half>, <8 x half>* %16, align 4 + %mul16 = fmul fast <8 x half> %17, %.pre166 + %add17 = fadd fast <8 x half> %add, %mul16 + %arrayidx18 = getelementptr inbounds half, half* %pSamples.0150, i32 3 + %18 = bitcast half* %arrayidx18 to <8 x half>* + %19 = load <8 x half>, <8 x half>* %18, align 4 + %mul21 = fmul fast <8 x half> %19, %.pre168 + %add22 = fadd fast <8 x half> %add17, %mul21 + %20 = bitcast half* %pOutput.0149 to <8 x half>* + store <8 x half> %add22, <8 x half>* %20, align 4 + %add.ptr23 = getelementptr inbounds half, half* %pOutput.0149, i32 4 + %add.ptr24 = getelementptr inbounds half, half* %pSamples.0150, i32 4 + %dec = add nsw i32 %blkCnt.0147, -1 + %cmp9 = icmp eq i32 %dec, 0 + br i1 %cmp9, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + %scevgep157 = getelementptr half, half* %pSrc, i32 %7 + %scevgep159 = getelementptr half, half* %0, i32 %7 + br label %while.end + +while.end: ; preds = %if.then, %while.end.loopexit + %pTempSrc.0.lcssa = phi half* [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ] + %pOutput.0.lcssa = phi half* [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ] + %pSamples.0.lcssa = phi half* [ %scevgep159, %while.end.loopexit ], [ %0, %if.then ] + %pStateCur.0.lcssa = phi half* [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ] + %and = and i32 %blockSize, 3 + %21 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and) + %22 = bitcast half* %pTempSrc.0.lcssa to <8 x half>* + %23 = load <8 x half>, <8 x half>* %22, align 4 + %24 = bitcast half* %pStateCur.0.lcssa to <8 x half>* + tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %23, <8 x half>* %24, i32 4, <8 x i1> %21) + %25 = bitcast half* %pSamples.0.lcssa to <8 x half>* + %26 = load <8 x half>, <8 x half>* %25, align 4 + %27 = fmul fast <8 x half> %26, %.pre162 + %arrayidx29 = 
getelementptr inbounds half, half* %pSamples.0.lcssa, i32 1 + %28 = bitcast half* %arrayidx29 to <8 x half>* + %29 = load <8 x half>, <8 x half>* %28, align 4 + %mul32 = fmul fast <8 x half> %29, %.pre164 + %add33 = fadd fast <8 x half> %mul32, %27 + %arrayidx34 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 2 + %30 = bitcast half* %arrayidx34 to <8 x half>* + %31 = load <8 x half>, <8 x half>* %30, align 4 + %mul37 = fmul fast <8 x half> %31, %.pre166 + %add38 = fadd fast <8 x half> %add33, %mul37 + %arrayidx39 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 3 + %32 = bitcast half* %arrayidx39 to <8 x half>* + %33 = load <8 x half>, <8 x half>* %32, align 4 + %mul42 = fmul fast <8 x half> %33, %.pre168 + %add43 = fadd fast <8 x half> %add38, %mul42 + %34 = bitcast half* %pOutput.0.lcssa to <8 x half>* + tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %add43, <8 x half>* %34, i32 4, <8 x i1> %21) + %.pre = load half*, half** %pState1, align 4 + br label %if.end + +if.end: ; preds = %while.end, %entry + %35 = phi half* [ %.pre, %while.end ], [ %0, %entry ] + %arrayidx45 = getelementptr inbounds half, half* %35, i32 %blockSize + %shr47 = lshr i32 %conv, 2 + %cmp49141 = icmp eq i32 %shr47, 0 + br i1 %cmp49141, label %while.end55, label %while.body51.preheader + +while.body51.preheader: ; preds = %if.end + %36 = and i32 %conv, 65532 + %37 = add i32 %36, %blockSize + %scevgep = getelementptr half, half* %35, i32 %37 + br label %while.body51 + +while.body51: ; preds = %while.body51.preheader, %while.body51 + %pTempSrc.1144 = phi half* [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ] + %pTempDest.0143 = phi half* [ %add.ptr53, %while.body51 ], [ %35, %while.body51.preheader ] + %blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ] + %38 = bitcast half* %pTempSrc.1144 to <8 x half>* + %39 = load <8 x half>, <8 x half>* %38, align 4 + %40 = bitcast half* %pTempDest.0143 to <8 x half>* + store <8 x half> %39, <8 x half>* %40, align 4 + %add.ptr52 = getelementptr inbounds half, half* %pTempSrc.1144, i32 4 + %add.ptr53 = getelementptr inbounds half, half* %pTempDest.0143, i32 4 + %dec54 = add nsw i32 %blkCnt.1142, -1 + %cmp49 = icmp eq i32 %dec54, 0 + br i1 %cmp49, label %while.end55.loopexit, label %while.body51 + +while.end55.loopexit: ; preds = %while.body51 + %scevgep156 = getelementptr half, half* %35, i32 %36 + br label %while.end55 + +while.end55: ; preds = %while.end55.loopexit, %if.end + %pTempDest.0.lcssa = phi half* [ %35, %if.end ], [ %scevgep156, %while.end55.loopexit ] + %pTempSrc.1.lcssa = phi half* [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ] + %and56 = and i32 %conv, 3 + %cmp57 = icmp eq i32 %and56, 0 + br i1 %cmp57, label %if.end61, label %if.then59 + +if.then59: ; preds = %while.end55 + %41 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and56) + %42 = bitcast half* %pTempSrc.1.lcssa to <8 x half>* + %43 = load <8 x half>, <8 x half>* %42, align 4 + %44 = bitcast half* %pTempDest.0.lcssa to <8 x half>* + tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %43, <8 x half>* %44, i32 4, <8 x i1> %41) + br label %if.end61 + +if.end61: ; preds = %while.end55, %if.then59 + ret void +} + + +define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* nocapture readonly %pSrc, half* nocapture %pDst, i32 %blockSize) { +; CHECK-LABEL: fir: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, 
r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: cmp r3, #8 +; CHECK-NEXT: blo.w .LBB16_12 +; CHECK-NEXT: @ %bb.1: @ %if.then +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: cmp.w r7, r3, lsr #2 +; CHECK-NEXT: beq.w .LBB16_12 +; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph +; CHECK-NEXT: ldrh.w r11, [r0] +; CHECK-NEXT: mov.w r8, #1 +; CHECK-NEXT: ldrd r5, r12, [r0, #4] +; CHECK-NEXT: lsrs r3, r3, #2 +; CHECK-NEXT: sub.w r0, r11, #8 +; CHECK-NEXT: and r10, r0, #7 +; CHECK-NEXT: add.w r7, r0, r0, lsr #29 +; CHECK-NEXT: add.w r0, r10, #1 +; CHECK-NEXT: asrs r6, r7, #3 +; CHECK-NEXT: cmp r6, #1 +; CHECK-NEXT: it gt +; CHECK-NEXT: asrgt.w r8, r7, #3 +; CHECK-NEXT: add.w r7, r5, r11, lsl #1 +; CHECK-NEXT: subs r4, r7, #2 +; CHECK-NEXT: rsb.w r7, r11, #0 +; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: add.w r7, r12, #16 +; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: b .LBB16_4 +; CHECK-NEXT: .LBB16_3: @ %while.end +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: subs r3, #1 +; CHECK-NEXT: vstrb.8 q0, [r2], #8 +; CHECK-NEXT: add.w r0, r9, r0, lsl #1 +; CHECK-NEXT: add.w r5, r0, #8 +; CHECK-NEXT: beq.w .LBB16_12 +; CHECK-NEXT: .LBB16_4: @ %while.body +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB16_6 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 +; CHECK-NEXT: vldrw.u32 q0, [r1], #8 +; CHECK-NEXT: vldr.16 s7, [r12] +; CHECK-NEXT: vldr.16 s4, [r12, #14] +; CHECK-NEXT: vldr.16 s6, [r12, #12] +; CHECK-NEXT: vldr.16 s8, [r12, #10] +; CHECK-NEXT: vldr.16 s10, [r12, #8] +; CHECK-NEXT: vldr.16 s12, [r12, #6] +; CHECK-NEXT: vldr.16 s14, [r12, #4] +; CHECK-NEXT: vldr.16 s5, [r12, #2] +; CHECK-NEXT: vstrb.8 q0, [r4], #8 +; CHECK-NEXT: vldrw.u32 q0, [r5] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: adds r6, r5, #2 +; CHECK-NEXT: add.w r9, r5, #16 +; CHECK-NEXT: vmul.f16 q0, q0, r0 +; CHECK-NEXT: vldrw.u32 q4, [r6] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: adds r6, r5, #6 +; CHECK-NEXT: vfma.f16 q0, q4, r0 +; CHECK-NEXT: vldrw.u32 q4, [r5, #4] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: cmp.w r11, #16 +; CHECK-NEXT: vfma.f16 q0, q4, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vldrw.u32 q3, [r6] +; CHECK-NEXT: add.w r6, r5, #10 +; CHECK-NEXT: vfma.f16 q0, q3, r0 +; CHECK-NEXT: vldrw.u32 q3, [r5, #8] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vfma.f16 q0, q3, r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: add.w r6, r5, #14 +; CHECK-NEXT: vfma.f16 q0, q2, r0 +; CHECK-NEXT: vldrw.u32 q2, [r5, #12] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vfma.f16 q0, q2, r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r6] +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: blo .LBB16_8 +; CHECK-NEXT: @ %bb.5: @ %for.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: dls lr, r8 +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: .LBB16_6: @ %for.body +; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldr.16 s4, [r6] +; CHECK-NEXT: add.w r5, r9, #2 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r9] +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldr.16 s4, [r6, #2] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: add.w r5, r9, #6 +; CHECK-NEXT: 
vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldr.16 s4, [r6, #4] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r9, #4] +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldr.16 s4, [r6, #6] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: add.w r5, r9, #10 +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldr.16 s4, [r6, #8] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r9, #8] +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldr.16 s4, [r6, #10] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: add.w r5, r9, #14 +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldr.16 s4, [r6, #12] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r9, #12] +; CHECK-NEXT: add.w r9, r9, #16 +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldr.16 s4, [r6, #14] +; CHECK-NEXT: adds r6, #16 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: le lr, .LBB16_6 +; CHECK-NEXT: @ %bb.7: @ %for.end +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: cmp.w r10, #0 +; CHECK-NEXT: bne .LBB16_9 +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: cmp.w r10, #0 +; CHECK-NEXT: beq.w .LBB16_3 +; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r5, r9 +; CHECK-NEXT: .LBB16_10: @ %while.body76 +; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldr.16 s4, [r6] +; CHECK-NEXT: subs r0, #1 +; CHECK-NEXT: adds r6, #2 +; CHECK-NEXT: cmp r0, #1 +; CHECK-NEXT: vmov r7, s4 +; CHECK-NEXT: vldrh.u16 q1, [r5], #2 +; CHECK-NEXT: vfma.f16 q0, q1, r7 +; CHECK-NEXT: bgt .LBB16_10 +; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: add.w r9, r9, r10, lsl #1 +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_12: @ %if.end +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +entry: + %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 + %0 = load half*, half** %pState1, align 4 + %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2 + %1 = load half*, half** %pCoeffs2, align 4 + %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0 + %2 = load i16, i16* %numTaps3, align 4 + %conv = zext i16 %2 to i32 + %cmp = icmp ugt i32 %blockSize, 7 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %shr = lshr i32 %blockSize, 2 + %cmp5217 = icmp eq i32 %shr, 0 + br i1 %cmp5217, label %if.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %if.then + %sub = add nsw i32 %conv, -1 + %arrayidx = getelementptr inbounds half, half* %0, i32 %sub + %incdec.ptr = getelementptr inbounds half, half* %1, i32 1 + %incdec.ptr7 = getelementptr inbounds half, half* %1, i32 2 + %incdec.ptr8 = getelementptr inbounds half, half* %1, i32 3 + %incdec.ptr9 = getelementptr inbounds half, half* %1, i32 4 + %incdec.ptr10 = getelementptr inbounds half, half* %1, i32 5 + %incdec.ptr11 = getelementptr inbounds half, half* %1, i32 6 + %incdec.ptr12 = getelementptr inbounds half, half* %1, i32 7 + %sub37 = add nsw i32 %conv, -8 + 
%div = sdiv i32 %sub37, 8 + %pCoeffsCur.0199 = getelementptr inbounds half, half* %1, i32 8 + %cmp38201 = icmp ugt i16 %2, 15 + %and = and i32 %sub37, 7 + %cmp74210 = icmp eq i32 %and, 0 + %idx.neg = sub nsw i32 0, %conv + %3 = icmp sgt i32 %div, 1 + %smax = select i1 %3, i32 %div, i32 1 + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.end + %blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ] + %pStateCur.0221 = phi half* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ] + %pSamples.0220 = phi half* [ %0, %while.body.lr.ph ], [ %add.ptr83, %while.end ] + %pTempSrc.0219 = phi half* [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ] + %pOutput.0218 = phi half* [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ] + %4 = load half, half* %1, align 4 + %5 = load half, half* %incdec.ptr, align 4 + %6 = load half, half* %incdec.ptr7, align 4 + %7 = load half, half* %incdec.ptr8, align 4 + %8 = load half, half* %incdec.ptr9, align 4 + %9 = load half, half* %incdec.ptr10, align 4 + %10 = load half, half* %incdec.ptr11, align 4 + %11 = load half, half* %incdec.ptr12, align 4 + %12 = bitcast half* %pTempSrc.0219 to <8 x half>* + %13 = load <8 x half>, <8 x half>* %12, align 4 + %14 = bitcast half* %pStateCur.0221 to <8 x half>* + store <8 x half> %13, <8 x half>* %14, align 4 + %add.ptr = getelementptr inbounds half, half* %pStateCur.0221, i32 4 + %add.ptr14 = getelementptr inbounds half, half* %pTempSrc.0219, i32 4 + %15 = bitcast half* %pSamples.0220 to <8 x half>* + %16 = load <8 x half>, <8 x half>* %15, align 4 + %.splatinsert = insertelement <8 x half> undef, half %4, i32 0 + %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + %17 = fmul fast <8 x half> %16, %.splat + %arrayidx15 = getelementptr inbounds half, half* %pSamples.0220, i32 1 + %18 = bitcast half* %arrayidx15 to <8 x half>* + %19 = load <8 x half>, <8 x half>* %18, align 4 + %.splatinsert16 = insertelement <8 x half> undef, half %5, i32 0 + %.splat17 = shufflevector <8 x half> %.splatinsert16, <8 x half> undef, <8 x i32> zeroinitializer + %20 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %19, <8 x half> %.splat17, <8 x half> %17) + %arrayidx18 = getelementptr inbounds half, half* %pSamples.0220, i32 2 + %21 = bitcast half* %arrayidx18 to <8 x half>* + %22 = load <8 x half>, <8 x half>* %21, align 4 + %.splatinsert19 = insertelement <8 x half> undef, half %6, i32 0 + %.splat20 = shufflevector <8 x half> %.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer + %23 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %22, <8 x half> %.splat20, <8 x half> %20) + %arrayidx21 = getelementptr inbounds half, half* %pSamples.0220, i32 3 + %24 = bitcast half* %arrayidx21 to <8 x half>* + %25 = load <8 x half>, <8 x half>* %24, align 4 + %.splatinsert22 = insertelement <8 x half> undef, half %7, i32 0 + %.splat23 = shufflevector <8 x half> %.splatinsert22, <8 x half> undef, <8 x i32> zeroinitializer + %26 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %25, <8 x half> %.splat23, <8 x half> %23) + %arrayidx24 = getelementptr inbounds half, half* %pSamples.0220, i32 4 + %27 = bitcast half* %arrayidx24 to <8 x half>* + %28 = load <8 x half>, <8 x half>* %27, align 4 + %.splatinsert25 = insertelement <8 x half> undef, half %8, i32 0 + %.splat26 = shufflevector <8 x half> %.splatinsert25, <8 x half> undef, <8 x i32> zeroinitializer + %29 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %28, <8 x half> 
%.splat26, <8 x half> %26) + %arrayidx27 = getelementptr inbounds half, half* %pSamples.0220, i32 5 + %30 = bitcast half* %arrayidx27 to <8 x half>* + %31 = load <8 x half>, <8 x half>* %30, align 4 + %.splatinsert28 = insertelement <8 x half> undef, half %9, i32 0 + %.splat29 = shufflevector <8 x half> %.splatinsert28, <8 x half> undef, <8 x i32> zeroinitializer + %32 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %31, <8 x half> %.splat29, <8 x half> %29) + %arrayidx30 = getelementptr inbounds half, half* %pSamples.0220, i32 6 + %33 = bitcast half* %arrayidx30 to <8 x half>* + %34 = load <8 x half>, <8 x half>* %33, align 4 + %.splatinsert31 = insertelement <8 x half> undef, half %10, i32 0 + %.splat32 = shufflevector <8 x half> %.splatinsert31, <8 x half> undef, <8 x i32> zeroinitializer + %35 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %34, <8 x half> %.splat32, <8 x half> %32) + %arrayidx33 = getelementptr inbounds half, half* %pSamples.0220, i32 7 + %36 = bitcast half* %arrayidx33 to <8 x half>* + %37 = load <8 x half>, <8 x half>* %36, align 4 + %.splatinsert34 = insertelement <8 x half> undef, half %11, i32 0 + %.splat35 = shufflevector <8 x half> %.splatinsert34, <8 x half> undef, <8 x i32> zeroinitializer + %38 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %37, <8 x half> %.splat35, <8 x half> %35) + %pSamples.1200 = getelementptr inbounds half, half* %pSamples.0220, i32 8 + br i1 %cmp38201, label %for.body, label %for.end + +for.body: ; preds = %while.body, %for.body + %pSamples.1207 = phi half* [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ] + %pCoeffsCur.0206 = phi half* [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ] + %.pn205 = phi half* [ %pCoeffsCur.0206, %for.body ], [ %1, %while.body ] + %i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ] + %vecAcc0.0203 = phi <8 x half> [ %70, %for.body ], [ %38, %while.body ] + %pSamples.0.pn202 = phi half* [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ] + %incdec.ptr40 = getelementptr inbounds half, half* %.pn205, i32 9 + %39 = load half, half* %pCoeffsCur.0206, align 4 + %incdec.ptr41 = getelementptr inbounds half, half* %.pn205, i32 10 + %40 = load half, half* %incdec.ptr40, align 4 + %incdec.ptr42 = getelementptr inbounds half, half* %.pn205, i32 11 + %41 = load half, half* %incdec.ptr41, align 4 + %incdec.ptr43 = getelementptr inbounds half, half* %.pn205, i32 12 + %42 = load half, half* %incdec.ptr42, align 4 + %incdec.ptr44 = getelementptr inbounds half, half* %.pn205, i32 13 + %43 = load half, half* %incdec.ptr43, align 4 + %incdec.ptr45 = getelementptr inbounds half, half* %.pn205, i32 14 + %44 = load half, half* %incdec.ptr44, align 4 + %incdec.ptr46 = getelementptr inbounds half, half* %.pn205, i32 15 + %45 = load half, half* %incdec.ptr45, align 4 + %46 = load half, half* %incdec.ptr46, align 4 + %47 = bitcast half* %pSamples.1207 to <8 x half>* + %48 = load <8 x half>, <8 x half>* %47, align 4 + %.splatinsert48 = insertelement <8 x half> undef, half %39, i32 0 + %.splat49 = shufflevector <8 x half> %.splatinsert48, <8 x half> undef, <8 x i32> zeroinitializer + %49 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %48, <8 x half> %.splat49, <8 x half> %vecAcc0.0203) + %arrayidx50 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 9 + %50 = bitcast half* %arrayidx50 to <8 x half>* + %51 = load <8 x half>, <8 x half>* %50, align 4 + %.splatinsert51 = insertelement <8 x half> undef, half %40, i32 0 + %.splat52 = shufflevector <8 x half> 
%.splatinsert51, <8 x half> undef, <8 x i32> zeroinitializer + %52 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %51, <8 x half> %.splat52, <8 x half> %49) + %arrayidx53 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 10 + %53 = bitcast half* %arrayidx53 to <8 x half>* + %54 = load <8 x half>, <8 x half>* %53, align 4 + %.splatinsert54 = insertelement <8 x half> undef, half %41, i32 0 + %.splat55 = shufflevector <8 x half> %.splatinsert54, <8 x half> undef, <8 x i32> zeroinitializer + %55 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %54, <8 x half> %.splat55, <8 x half> %52) + %arrayidx56 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 11 + %56 = bitcast half* %arrayidx56 to <8 x half>* + %57 = load <8 x half>, <8 x half>* %56, align 4 + %.splatinsert57 = insertelement <8 x half> undef, half %42, i32 0 + %.splat58 = shufflevector <8 x half> %.splatinsert57, <8 x half> undef, <8 x i32> zeroinitializer + %58 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %57, <8 x half> %.splat58, <8 x half> %55) + %arrayidx59 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 12 + %59 = bitcast half* %arrayidx59 to <8 x half>* + %60 = load <8 x half>, <8 x half>* %59, align 4 + %.splatinsert60 = insertelement <8 x half> undef, half %43, i32 0 + %.splat61 = shufflevector <8 x half> %.splatinsert60, <8 x half> undef, <8 x i32> zeroinitializer + %61 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %60, <8 x half> %.splat61, <8 x half> %58) + %arrayidx62 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 13 + %62 = bitcast half* %arrayidx62 to <8 x half>* + %63 = load <8 x half>, <8 x half>* %62, align 4 + %.splatinsert63 = insertelement <8 x half> undef, half %44, i32 0 + %.splat64 = shufflevector <8 x half> %.splatinsert63, <8 x half> undef, <8 x i32> zeroinitializer + %64 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %63, <8 x half> %.splat64, <8 x half> %61) + %arrayidx65 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 14 + %65 = bitcast half* %arrayidx65 to <8 x half>* + %66 = load <8 x half>, <8 x half>* %65, align 4 + %.splatinsert66 = insertelement <8 x half> undef, half %45, i32 0 + %.splat67 = shufflevector <8 x half> %.splatinsert66, <8 x half> undef, <8 x i32> zeroinitializer + %67 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %66, <8 x half> %.splat67, <8 x half> %64) + %arrayidx68 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 15 + %68 = bitcast half* %arrayidx68 to <8 x half>* + %69 = load <8 x half>, <8 x half>* %68, align 4 + %.splatinsert69 = insertelement <8 x half> undef, half %46, i32 0 + %.splat70 = shufflevector <8 x half> %.splatinsert69, <8 x half> undef, <8 x i32> zeroinitializer + %70 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %69, <8 x half> %.splat70, <8 x half> %67) + %inc = add nuw nsw i32 %i.0204, 1 + %pCoeffsCur.0 = getelementptr inbounds half, half* %pCoeffsCur.0206, i32 8 + %pSamples.1 = getelementptr inbounds half, half* %pSamples.1207, i32 8 + %exitcond = icmp eq i32 %inc, %smax + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %while.body + %vecAcc0.0.lcssa = phi <8 x half> [ %38, %while.body ], [ %70, %for.body ] + %pCoeffsCur.0.lcssa = phi half* [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ] + %pSamples.1.lcssa = phi half* [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ] + br i1 %cmp74210, label %while.end, label %while.body76 + +while.body76: ; preds = %for.end, %while.body76 + 
%pCoeffsCur.1214 = phi half* [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ] + %vecAcc0.1213 = phi <8 x half> [ %74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ] + %numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ] + %pSamples.2211 = phi half* [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ] + %incdec.ptr77 = getelementptr inbounds half, half* %pCoeffsCur.1214, i32 1 + %71 = load half, half* %pCoeffsCur.1214, align 4 + %72 = bitcast half* %pSamples.2211 to <8 x half>* + %73 = load <8 x half>, <8 x half>* %72, align 4 + %.splatinsert78 = insertelement <8 x half> undef, half %71, i32 0 + %.splat79 = shufflevector <8 x half> %.splatinsert78, <8 x half> undef, <8 x i32> zeroinitializer + %74 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %73, <8 x half> %.splat79, <8 x half> %vecAcc0.1213) + %incdec.ptr80 = getelementptr inbounds half, half* %pSamples.2211, i32 1 + %dec = add nsw i32 %numCnt.0212, -1 + %cmp74 = icmp sgt i32 %numCnt.0212, 1 + br i1 %cmp74, label %while.body76, label %while.end.loopexit + +while.end.loopexit: ; preds = %while.body76 + %scevgep = getelementptr half, half* %pSamples.1.lcssa, i32 %and + br label %while.end + +while.end: ; preds = %while.end.loopexit, %for.end + %pSamples.2.lcssa = phi half* [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ] + %vecAcc0.1.lcssa = phi <8 x half> [ %vecAcc0.0.lcssa, %for.end ], [ %74, %while.end.loopexit ] + %75 = bitcast half* %pOutput.0218 to <8 x half>* + store <8 x half> %vecAcc0.1.lcssa, <8 x half>* %75, align 4 + %add.ptr81 = getelementptr inbounds half, half* %pOutput.0218, i32 4 + %add.ptr82 = getelementptr inbounds half, half* %pSamples.2.lcssa, i32 4 + %add.ptr83 = getelementptr inbounds half, half* %add.ptr82, i32 %idx.neg + %dec84 = add nsw i32 %blkCnt.0222, -1 + %cmp5 = icmp eq i32 %dec84, 0 + br i1 %cmp5, label %if.end, label %while.body + +if.end: ; preds = %while.end, %if.then, %entry + ret void +} + +declare void @llvm.assume(i1) +declare <8 x i1> @llvm.arm.mve.vctp16(i32) +declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>) +declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32 immarg, <8 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -0,0 +1,1456 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s + +define arm_aapcs_vfpcc void @test_fadd(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fadd: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.f32 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 
0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fadd fast <4 x float> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fadd_r(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fadd_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.f32 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB1_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fadd fast <4 x float> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmul(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fmul: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmul.f32 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB2_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> 
%broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmul_r(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fmul_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmul.f32 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB3_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fsub(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fsub: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vsub.f32 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB4_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> 
zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fsub fast <4 x float> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fsub_r(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fsub_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vsub.f32 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB5_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fsub fast <4 x float> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define arm_aapcs_vfpcc void @test_fmas(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmas: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfma.f32 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: bne .LBB6_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat14 = shufflevector <4 x float> 
%broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = getelementptr inbounds float, float* %B, i32 %index + %4 = bitcast float* %3 to <4 x float>* + %wide.load12 = load <4 x float>, <4 x float>* %4, align 4 + %5 = fmul fast <4 x float> %wide.load12, %wide.load + %6 = fadd fast <4 x float> %5, %broadcast.splat14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmas_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmas_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB7_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfma.f32 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: bne .LBB7_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = getelementptr inbounds float, float* %B, i32 %index + %4 = bitcast float* %3 to <4 x float>* + %wide.load12 = load <4 x float>, <4 x float>* %4, align 4 + %5 = fmul fast <4 x float> %wide.load12, %wide.load + %6 = fadd fast <4 x float> %broadcast.splat14, %5 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fma(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fma: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], 
#16 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfma.f32 q2, q1, q0 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: bne .LBB8_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %wide.load, %broadcast.splat13 + %4 = getelementptr inbounds float, float* %B, i32 %index + %5 = bitcast float* %4 to <4 x float>* + %wide.load14 = load <4 x float>, <4 x float>* %5, align 4 + %6 = fadd fast <4 x float> %3, %wide.load14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fma_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fma_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB9_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfma.f32 q2, q0, q1 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: bne .LBB9_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %broadcast.splat13, %wide.load + %4 = getelementptr inbounds float, float* %B, i32 %index + %5 = bitcast float* %4 to <4 x float>* + %wide.load14 = load <4 x float>, <4 x float>* %5, align 4 + %6 = fadd fast <4 x float> %3, %wide.load14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + 
+define arm_aapcs_vfpcc void @test_fmss(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmss: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: .LBB10_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfma.f32 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: bne .LBB10_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = getelementptr inbounds float, float* %B, i32 %index + %4 = bitcast float* %3 to <4 x float>* + %wide.load12 = load <4 x float>, <4 x float>* %4, align 4 + %5 = fmul fast <4 x float> %wide.load12, %wide.load + %6 = fsub fast <4 x float> %5, %broadcast.splat14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmss_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmss_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB11_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfms.f32 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: bne .LBB11_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = 
getelementptr inbounds float, float* %B, i32 %index + %4 = bitcast float* %3 to <4 x float>* + %wide.load12 = load <4 x float>, <4 x float>* %4, align 4 + %5 = fmul fast <4 x float> %wide.load12, %wide.load + %6 = fsub fast <4 x float> %broadcast.splat14, %5 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fms(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fms: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB12_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vneg.f32 q1, q1 +; CHECK-NEXT: vfma.f32 q1, q2, q0 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB12_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %wide.load, %broadcast.splat13 + %4 = getelementptr inbounds float, float* %B, i32 %index + %5 = bitcast float* %4 to <4 x float>* + %wide.load14 = load <4 x float>, <4 x float>* %5, align 4 + %6 = fsub fast <4 x float> %3, %wide.load14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fms_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fms_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB13_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vneg.f32 q1, q1 +; CHECK-NEXT: vfma.f32 q1, q0, q2 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB13_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label 
%vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %broadcast.splat13, %wide.load + %4 = getelementptr inbounds float, float* %B, i32 %index + %5 = bitcast float* %4 to <4 x float>* + %wide.load14 = load <4 x float>, <4 x float>* %5, align 4 + %6 = fsub fast <4 x float> %3, %wide.load14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define dso_local void @test_nested(float* noalias nocapture %pInT1, float* noalias nocapture readonly %pOutT1, float* noalias nocapture readonly %pPRT_in, float* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l, float %in) local_unnamed_addr #0 { +; CHECK-LABEL: test_nested: +; CHECK: @ %bb.0: @ %for.body.us.preheader +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: ldrd lr, r12, [sp, #20] +; CHECK-NEXT: lsl.w r3, r12, #2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB14_1: @ %for.body.us +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB14_2 Depth 2 +; CHECK-NEXT: vldr s0, [r1] +; CHECK-NEXT: mov r5, r12 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vdup.32 q0, r4 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: .LBB14_2: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: adds r6, r0, r4 +; CHECK-NEXT: adds r7, r2, r4 +; CHECK-NEXT: vldrw.u32 q1, [r7] +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: adds r4, #16 +; CHECK-NEXT: subs r5, #4 +; CHECK-NEXT: vfms.f32 q2, q1, q0 +; CHECK-NEXT: vstrw.32 q2, [r6] +; CHECK-NEXT: bne .LBB14_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us +; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1 +; CHECK-NEXT: add r0, r3 +; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: le lr, .LBB14_1 +; CHECK-NEXT: @ %bb.4: @ %for.end14 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +for.body.us.preheader: + %cmp = icmp sgt i32 %numRows, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp1 = icmp sgt i32 %numCols, 0 + tail call void @llvm.assume(i1 %cmp1) + %rem = and i32 %numCols, 7 + %cmp2 = icmp eq i32 %rem, 0 + tail call void @llvm.assume(i1 %cmp2) + %cmp3 = icmp slt i32 %l, %numCols + tail call void @llvm.assume(i1 %cmp3) + br label %for.body.us + +for.body.us: ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader + %pInT1.addr.038.us = phi float* [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ] + %i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ] + %pOutT1.addr.036.us = phi float* [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ] + %pPRT_in.addr.035.us = phi float* [ %scevgep, 
%for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ] + %scevgep = getelementptr float, float* %pPRT_in.addr.035.us, i32 %numCols + %0 = load float, float* %pOutT1.addr.036.us, align 4 + %broadcast.splatinsert47 = insertelement <4 x float> undef, float %0, i32 0 + %broadcast.splat48 = shufflevector <4 x float> %broadcast.splatinsert47, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %for.body.us + %index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ] + %next.gep = getelementptr float, float* %pInT1.addr.038.us, i32 %index + %next.gep45 = getelementptr float, float* %pPRT_in.addr.035.us, i32 %index + %1 = bitcast float* %next.gep to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %1, align 4 + %2 = bitcast float* %next.gep45 to <4 x float>* + %wide.load46 = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %wide.load46, %broadcast.splat48 + %4 = fsub fast <4 x float> %wide.load, %3 + store <4 x float> %4, <4 x float>* %1, align 4 + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, %numCols + br i1 %5, label %for.cond6.for.end_crit_edge.us, label %vector.body + +for.cond6.for.end_crit_edge.us: ; preds = %vector.body + %incdec.ptr.us = getelementptr inbounds float, float* %pOutT1.addr.036.us, i32 1 + %scevgep40 = getelementptr float, float* %pInT1.addr.038.us, i32 %numCols + %inc13.us = add nuw nsw i32 %i.037.us, 1 + %exitcond41 = icmp eq i32 %inc13.us, %numRows + br i1 %exitcond41, label %for.end14, label %for.body.us + +for.end14: ; preds = %for.cond6.for.end_crit_edge.us + ret void +} + +%struct.arm_fir_instance_f32 = type { i16, float*, float* } +define void @arm_fir_f32_1_4_mve(%struct.arm_fir_instance_f32* nocapture readonly %S, float* nocapture readonly %pSrc, float* %pDst, i32 %blockSize) { +; CHECK-LABEL: arm_fir_f32_1_4_mve: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: ldrh.w r10, [r0] +; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: sub.w r7, r10, #1 +; CHECK-NEXT: cmp r7, #3 +; CHECK-NEXT: bhi .LBB15_6 +; CHECK-NEXT: @ %bb.1: @ %if.then +; CHECK-NEXT: ldr r6, [r0, #8] +; CHECK-NEXT: add.w r4, r12, r7, lsl #2 +; CHECK-NEXT: lsr.w lr, r3, #2 +; CHECK-NEXT: vldr s0, [r6, #12] +; CHECK-NEXT: vldr s4, [r6, #8] +; CHECK-NEXT: vmov r7, s0 +; CHECK-NEXT: vldr s8, [r6, #4] +; CHECK-NEXT: vdup.32 q0, r7 +; CHECK-NEXT: vmov r7, s4 +; CHECK-NEXT: vldr s12, [r6] +; CHECK-NEXT: vdup.32 q1, r7 +; CHECK-NEXT: vmov r7, s8 +; CHECK-NEXT: vdup.32 q2, r7 +; CHECK-NEXT: vmov r7, s12 +; CHECK-NEXT: vdup.32 q3, r7 +; CHECK-NEXT: wls lr, lr, .LBB15_5 +; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph +; CHECK-NEXT: bic r9, r3, #3 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: add.w r8, r2, r9, lsl #2 +; CHECK-NEXT: .LBB15_3: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r5, r1, r6 +; CHECK-NEXT: adds r7, r2, r6 +; CHECK-NEXT: vldrw.u32 q4, [r5] +; CHECK-NEXT: adds r5, r4, r6 +; CHECK-NEXT: vstrw.32 q4, [r5] +; CHECK-NEXT: add.w r5, r12, r6 +; CHECK-NEXT: vldrw.u32 q4, [r5] +; CHECK-NEXT: vldrw.u32 q5, [r5, #4] +; CHECK-NEXT: vldrw.u32 q6, [r5, #12] +; CHECK-NEXT: adds r6, #16 +; CHECK-NEXT: vmul.f32 q4, q4, q3 +; CHECK-NEXT: vfma.f32 q4, q5, q2 +; CHECK-NEXT: vldrw.u32 q5, [r5, #8] +; CHECK-NEXT: vfma.f32 q4, q5, q1 +; CHECK-NEXT: 
vfma.f32 q4, q6, q0 +; CHECK-NEXT: vstrw.32 q4, [r7] +; CHECK-NEXT: le lr, .LBB15_3 +; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit +; CHECK-NEXT: add r4, r6 +; CHECK-NEXT: add.w r12, r12, r9, lsl #2 +; CHECK-NEXT: add.w r1, r1, r9, lsl #2 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: .LBB15_5: @ %while.end +; CHECK-NEXT: and r7, r3, #3 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vctp.32 r7 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q4, [r4] +; CHECK-NEXT: vldrw.u32 q4, [r12] +; CHECK-NEXT: vmul.f32 q3, q4, q3 +; CHECK-NEXT: vldrw.u32 q4, [r12, #4] +; CHECK-NEXT: vfma.f32 q3, q4, q2 +; CHECK-NEXT: vldrw.u32 q2, [r12, #8] +; CHECK-NEXT: vfma.f32 q3, q2, q1 +; CHECK-NEXT: vldrw.u32 q1, [r12, #12] +; CHECK-NEXT: vfma.f32 q3, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q3, [r2] +; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: .LBB15_6: @ %if.end +; CHECK-NEXT: add.w r0, r12, r3, lsl #2 +; CHECK-NEXT: lsr.w lr, r10, #2 +; CHECK-NEXT: wls lr, lr, .LBB15_10 +; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader +; CHECK-NEXT: bic r2, r10, #3 +; CHECK-NEXT: adds r1, r2, r3 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: add.w r1, r12, r1, lsl #2 +; CHECK-NEXT: .LBB15_8: @ %while.body51 +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vstrb.8 q0, [r3], #16 +; CHECK-NEXT: le lr, .LBB15_8 +; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit +; CHECK-NEXT: add.w r12, r12, r2, lsl #2 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: .LBB15_10: @ %while.end55 +; CHECK-NEXT: ands r1, r10, #3 +; CHECK-NEXT: beq .LBB15_12 +; CHECK-NEXT: @ %bb.11: @ %if.then59 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vctp.32 r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q0, [r12] +; CHECK-NEXT: .LBB15_12: @ %if.end61 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +entry: + %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 + %0 = load float*, float** %pState1, align 4 + %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2 + %1 = load float*, float** %pCoeffs2, align 4 + %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0 + %2 = load i16, i16* %numTaps3, align 4 + %conv = zext i16 %2 to i32 + %sub = add nsw i32 %conv, -1 + %cmp = icmp ult i32 %sub, 4 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds float, float* %0, i32 %sub + %incdec.ptr = getelementptr inbounds float, float* %1, i32 1 + %3 = load float, float* %1, align 4 + %incdec.ptr6 = getelementptr inbounds float, float* %1, i32 2 + %4 = load float, float* %incdec.ptr, align 4 + %incdec.ptr7 = getelementptr inbounds float, float* %1, i32 3 + %5 = load float, float* %incdec.ptr6, align 4 + %6 = load float, float* %incdec.ptr7, align 4 + %shr = lshr i32 %blockSize, 2 + %cmp9146 = icmp eq i32 %shr, 0 + %.pre161 = insertelement <4 x float> undef, float %3, i32 0 + %.pre162 = shufflevector <4 x float> %.pre161, <4 x float> undef, <4 x i32> zeroinitializer + %.pre163 = insertelement <4 x float> undef, float %4, i32 0 + %.pre164 = shufflevector <4 x float> %.pre163, <4 x float> undef, <4 x i32> zeroinitializer + %.pre165 = insertelement <4 x float> undef, float %5, i32 0 + %.pre166 = shufflevector <4 x float> %.pre165, <4 x float> undef, <4 x i32> zeroinitializer + %.pre167 = insertelement <4 x float> undef, float %6, i32 0 + %.pre168 
= shufflevector <4 x float> %.pre167, <4 x float> undef, <4 x i32> zeroinitializer + br i1 %cmp9146, label %while.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %if.then + %7 = and i32 %blockSize, -4 + %scevgep158 = getelementptr float, float* %pDst, i32 %7 + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %pStateCur.0151 = phi float* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %pSamples.0150 = phi float* [ %0, %while.body.lr.ph ], [ %add.ptr24, %while.body ] + %pOutput.0149 = phi float* [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ] + %pTempSrc.0148 = phi float* [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ] + %blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ] + %8 = bitcast float* %pTempSrc.0148 to <4 x float>* + %9 = load <4 x float>, <4 x float>* %8, align 4 + %10 = bitcast float* %pStateCur.0151 to <4 x float>* + store <4 x float> %9, <4 x float>* %10, align 4 + %add.ptr = getelementptr inbounds float, float* %pStateCur.0151, i32 4 + %add.ptr11 = getelementptr inbounds float, float* %pTempSrc.0148, i32 4 + %11 = bitcast float* %pSamples.0150 to <4 x float>* + %12 = load <4 x float>, <4 x float>* %11, align 4 + %13 = fmul fast <4 x float> %12, %.pre162 + %arrayidx12 = getelementptr inbounds float, float* %pSamples.0150, i32 1 + %14 = bitcast float* %arrayidx12 to <4 x float>* + %15 = load <4 x float>, <4 x float>* %14, align 4 + %mul = fmul fast <4 x float> %15, %.pre164 + %add = fadd fast <4 x float> %mul, %13 + %arrayidx13 = getelementptr inbounds float, float* %pSamples.0150, i32 2 + %16 = bitcast float* %arrayidx13 to <4 x float>* + %17 = load <4 x float>, <4 x float>* %16, align 4 + %mul16 = fmul fast <4 x float> %17, %.pre166 + %add17 = fadd fast <4 x float> %add, %mul16 + %arrayidx18 = getelementptr inbounds float, float* %pSamples.0150, i32 3 + %18 = bitcast float* %arrayidx18 to <4 x float>* + %19 = load <4 x float>, <4 x float>* %18, align 4 + %mul21 = fmul fast <4 x float> %19, %.pre168 + %add22 = fadd fast <4 x float> %add17, %mul21 + %20 = bitcast float* %pOutput.0149 to <4 x float>* + store <4 x float> %add22, <4 x float>* %20, align 4 + %add.ptr23 = getelementptr inbounds float, float* %pOutput.0149, i32 4 + %add.ptr24 = getelementptr inbounds float, float* %pSamples.0150, i32 4 + %dec = add nsw i32 %blkCnt.0147, -1 + %cmp9 = icmp eq i32 %dec, 0 + br i1 %cmp9, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + %scevgep157 = getelementptr float, float* %pSrc, i32 %7 + %scevgep159 = getelementptr float, float* %0, i32 %7 + br label %while.end + +while.end: ; preds = %if.then, %while.end.loopexit + %pTempSrc.0.lcssa = phi float* [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ] + %pOutput.0.lcssa = phi float* [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ] + %pSamples.0.lcssa = phi float* [ %scevgep159, %while.end.loopexit ], [ %0, %if.then ] + %pStateCur.0.lcssa = phi float* [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ] + %and = and i32 %blockSize, 3 + %21 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %and) + %22 = bitcast float* %pTempSrc.0.lcssa to <4 x float>* + %23 = load <4 x float>, <4 x float>* %22, align 4 + %24 = bitcast float* %pStateCur.0.lcssa to <4 x float>* + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %23, <4 x float>* %24, i32 4, <4 x i1> %21) + %25 = bitcast float* %pSamples.0.lcssa to <4 x float>* + %26 = load <4 x float>, <4 x float>* %25, align 4 + %27 = fmul 
fast <4 x float> %26, %.pre162 + %arrayidx29 = getelementptr inbounds float, float* %pSamples.0.lcssa, i32 1 + %28 = bitcast float* %arrayidx29 to <4 x float>* + %29 = load <4 x float>, <4 x float>* %28, align 4 + %mul32 = fmul fast <4 x float> %29, %.pre164 + %add33 = fadd fast <4 x float> %mul32, %27 + %arrayidx34 = getelementptr inbounds float, float* %pSamples.0.lcssa, i32 2 + %30 = bitcast float* %arrayidx34 to <4 x float>* + %31 = load <4 x float>, <4 x float>* %30, align 4 + %mul37 = fmul fast <4 x float> %31, %.pre166 + %add38 = fadd fast <4 x float> %add33, %mul37 + %arrayidx39 = getelementptr inbounds float, float* %pSamples.0.lcssa, i32 3 + %32 = bitcast float* %arrayidx39 to <4 x float>* + %33 = load <4 x float>, <4 x float>* %32, align 4 + %mul42 = fmul fast <4 x float> %33, %.pre168 + %add43 = fadd fast <4 x float> %add38, %mul42 + %34 = bitcast float* %pOutput.0.lcssa to <4 x float>* + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %add43, <4 x float>* %34, i32 4, <4 x i1> %21) + %.pre = load float*, float** %pState1, align 4 + br label %if.end + +if.end: ; preds = %while.end, %entry + %35 = phi float* [ %.pre, %while.end ], [ %0, %entry ] + %arrayidx45 = getelementptr inbounds float, float* %35, i32 %blockSize + %shr47 = lshr i32 %conv, 2 + %cmp49141 = icmp eq i32 %shr47, 0 + br i1 %cmp49141, label %while.end55, label %while.body51.preheader + +while.body51.preheader: ; preds = %if.end + %36 = and i32 %conv, 65532 + %37 = add i32 %36, %blockSize + %scevgep = getelementptr float, float* %35, i32 %37 + br label %while.body51 + +while.body51: ; preds = %while.body51.preheader, %while.body51 + %pTempSrc.1144 = phi float* [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ] + %pTempDest.0143 = phi float* [ %add.ptr53, %while.body51 ], [ %35, %while.body51.preheader ] + %blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ] + %38 = bitcast float* %pTempSrc.1144 to <4 x float>* + %39 = load <4 x float>, <4 x float>* %38, align 4 + %40 = bitcast float* %pTempDest.0143 to <4 x float>* + store <4 x float> %39, <4 x float>* %40, align 4 + %add.ptr52 = getelementptr inbounds float, float* %pTempSrc.1144, i32 4 + %add.ptr53 = getelementptr inbounds float, float* %pTempDest.0143, i32 4 + %dec54 = add nsw i32 %blkCnt.1142, -1 + %cmp49 = icmp eq i32 %dec54, 0 + br i1 %cmp49, label %while.end55.loopexit, label %while.body51 + +while.end55.loopexit: ; preds = %while.body51 + %scevgep156 = getelementptr float, float* %35, i32 %36 + br label %while.end55 + +while.end55: ; preds = %while.end55.loopexit, %if.end + %pTempDest.0.lcssa = phi float* [ %35, %if.end ], [ %scevgep156, %while.end55.loopexit ] + %pTempSrc.1.lcssa = phi float* [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ] + %and56 = and i32 %conv, 3 + %cmp57 = icmp eq i32 %and56, 0 + br i1 %cmp57, label %if.end61, label %if.then59 + +if.then59: ; preds = %while.end55 + %41 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %and56) + %42 = bitcast float* %pTempSrc.1.lcssa to <4 x float>* + %43 = load <4 x float>, <4 x float>* %42, align 4 + %44 = bitcast float* %pTempDest.0.lcssa to <4 x float>* + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %43, <4 x float>* %44, i32 4, <4 x i1> %41) + br label %if.end61 + +if.end61: ; preds = %while.end55, %if.then59 + ret void +} + + +define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* nocapture readonly %pSrc, float* nocapture %pDst, i32 %blockSize) { +; CHECK-LABEL: fir: +; CHECK: @ %bb.0: 
@ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #88 +; CHECK-NEXT: sub sp, #88 +; CHECK-NEXT: cmp r3, #8 +; CHECK-NEXT: blo.w .LBB16_12 +; CHECK-NEXT: @ %bb.1: @ %if.then +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: cmp.w r7, r3, lsr #2 +; CHECK-NEXT: beq.w .LBB16_12 +; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph +; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: lsr.w r8, r3, #2 +; CHECK-NEXT: ldrd r5, r12, [r0, #4] +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w r0, r4, #8 +; CHECK-NEXT: and r10, r0, #7 +; CHECK-NEXT: add.w r7, r0, r0, lsr #29 +; CHECK-NEXT: add.w r0, r10, #1 +; CHECK-NEXT: asrs r6, r7, #3 +; CHECK-NEXT: cmp r6, #1 +; CHECK-NEXT: it gt +; CHECK-NEXT: asrgt r3, r7, #3 +; CHECK-NEXT: add.w r7, r5, r4, lsl #2 +; CHECK-NEXT: sub.w r11, r7, #4 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: rsbs r3, r4, #0 +; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: add.w r3, r12, #32 +; CHECK-NEXT: str r4, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: b .LBB16_4 +; CHECK-NEXT: .LBB16_3: @ %while.end +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: subs.w r8, r8, #1 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: add.w r0, r9, r0, lsl #2 +; CHECK-NEXT: add.w r5, r0, #16 +; CHECK-NEXT: beq.w .LBB16_12 +; CHECK-NEXT: .LBB16_4: @ %while.body +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB16_6 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 +; CHECK-NEXT: vldr s2, [r12, #12] +; CHECK-NEXT: vldrw.u32 q3, [r1], #16 +; CHECK-NEXT: vldr s8, [r12, #28] +; CHECK-NEXT: add.w r9, r5, #32 +; CHECK-NEXT: vldr s0, [r12] +; CHECK-NEXT: vstr s2, [sp, #64] @ 4-byte Spill +; CHECK-NEXT: vmov r6, s8 +; CHECK-NEXT: vldr s2, [r12, #16] +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldr s4, [r12, #20] +; CHECK-NEXT: vldr s6, [r12, #24] +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vldr s5, [r12, #4] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldr s7, [r12, #8] +; CHECK-NEXT: vstrb.8 q3, [r11], #16 +; CHECK-NEXT: vldrw.u32 q2, [r5, #28] +; CHECK-NEXT: vldrw.u32 q4, [r5] +; CHECK-NEXT: vldrw.u32 q5, [r5, #4] +; CHECK-NEXT: vldrw.u32 q3, [r5, #20] +; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r5, #24] +; CHECK-NEXT: vldrw.u32 q6, [r5, #12] +; CHECK-NEXT: vldrw.u32 q7, [r5, #16] +; CHECK-NEXT: vmul.f32 q0, q4, r3 +; CHECK-NEXT: vldrw.u32 q4, [r5, #8] +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: vfma.f32 q0, q5, r3 +; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: vfma.f32 q0, q4, r3 +; CHECK-NEXT: vldr s4, [sp, #64] @ 4-byte Reload +; CHECK-NEXT: vmov r7, s6 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vfma.f32 q0, q6, r3 +; CHECK-NEXT: vfma.f32 q0, q7, r4 +; CHECK-NEXT: vfma.f32 q0, q3, r0 +; CHECK-NEXT: vfma.f32 q0, q2, r7 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vfma.f32 q0, q1, r6 +; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: cmp r0, #16 +; CHECK-NEXT: blo .LBB16_8 +; CHECK-NEXT: @ %bb.5: @ %for.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr.w lr, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: ldr r6, [sp, #20] @ 4-byte Reload 
+; CHECK-NEXT: .LBB16_6: @ %for.body +; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldrw.u32 q1, [r9, #28] +; CHECK-NEXT: vldr s24, [r6] +; CHECK-NEXT: vldr s26, [r6, #4] +; CHECK-NEXT: vldrw.u32 q3, [r9, #4] +; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r9, #20] +; CHECK-NEXT: vldr s28, [r6, #8] +; CHECK-NEXT: vmov r7, s24 +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r9, #24] +; CHECK-NEXT: vldr s25, [r6, #16] +; CHECK-NEXT: vldrw.u32 q5, [r9, #12] +; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r9] +; CHECK-NEXT: vldr s27, [r6, #20] +; CHECK-NEXT: vldrw.u32 q4, [r9, #16] +; CHECK-NEXT: vldr s29, [r6, #24] +; CHECK-NEXT: vldrw.u32 q2, [r9, #8] +; CHECK-NEXT: vldr s31, [r6, #28] +; CHECK-NEXT: vmov r5, s25 +; CHECK-NEXT: vldr s30, [r6, #12] +; CHECK-NEXT: vfma.f32 q0, q1, r7 +; CHECK-NEXT: vmov r7, s26 +; CHECK-NEXT: add.w r9, r9, #32 +; CHECK-NEXT: vfma.f32 q0, q3, r7 +; CHECK-NEXT: vmov r7, s28 +; CHECK-NEXT: vfma.f32 q0, q2, r7 +; CHECK-NEXT: vmov r7, s30 +; CHECK-NEXT: vfma.f32 q0, q5, r7 +; CHECK-NEXT: vmov r3, s27 +; CHECK-NEXT: vfma.f32 q0, q4, r5 +; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov r4, s29 +; CHECK-NEXT: adds r6, #32 +; CHECK-NEXT: vfma.f32 q0, q1, r3 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov r0, s31 +; CHECK-NEXT: vfma.f32 q0, q1, r4 +; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vfma.f32 q0, q1, r0 +; CHECK-NEXT: le lr, .LBB16_6 +; CHECK-NEXT: @ %bb.7: @ %for.end +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: cmp.w r10, #0 +; CHECK-NEXT: bne .LBB16_9 +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r6, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: cmp.w r10, #0 +; CHECK-NEXT: beq.w .LBB16_3 +; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: mov r5, r9 +; CHECK-NEXT: .LBB16_10: @ %while.body76 +; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldr s4, [r6] +; CHECK-NEXT: vldrw.u32 q2, [r5], #4 +; CHECK-NEXT: subs r0, #1 +; CHECK-NEXT: adds r6, #4 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: cmp r0, #1 +; CHECK-NEXT: vfma.f32 q0, q2, r3 +; CHECK-NEXT: bgt .LBB16_10 +; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: add.w r9, r9, r10, lsl #2 +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_12: @ %if.end +; CHECK-NEXT: add sp, #88 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +entry: + %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 + %0 = load float*, float** %pState1, align 4 + %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2 + %1 = load float*, float** %pCoeffs2, align 4 + %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0 + %2 = load i16, i16* %numTaps3, align 4 + %conv = zext i16 %2 to i32 + %cmp = icmp ugt i32 %blockSize, 7 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %shr = lshr i32 %blockSize, 2 + %cmp5217 = 
icmp eq i32 %shr, 0 + br i1 %cmp5217, label %if.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %if.then + %sub = add nsw i32 %conv, -1 + %arrayidx = getelementptr inbounds float, float* %0, i32 %sub + %incdec.ptr = getelementptr inbounds float, float* %1, i32 1 + %incdec.ptr7 = getelementptr inbounds float, float* %1, i32 2 + %incdec.ptr8 = getelementptr inbounds float, float* %1, i32 3 + %incdec.ptr9 = getelementptr inbounds float, float* %1, i32 4 + %incdec.ptr10 = getelementptr inbounds float, float* %1, i32 5 + %incdec.ptr11 = getelementptr inbounds float, float* %1, i32 6 + %incdec.ptr12 = getelementptr inbounds float, float* %1, i32 7 + %sub37 = add nsw i32 %conv, -8 + %div = sdiv i32 %sub37, 8 + %pCoeffsCur.0199 = getelementptr inbounds float, float* %1, i32 8 + %cmp38201 = icmp ugt i16 %2, 15 + %and = and i32 %sub37, 7 + %cmp74210 = icmp eq i32 %and, 0 + %idx.neg = sub nsw i32 0, %conv + %3 = icmp sgt i32 %div, 1 + %smax = select i1 %3, i32 %div, i32 1 + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.end + %blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ] + %pStateCur.0221 = phi float* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ] + %pSamples.0220 = phi float* [ %0, %while.body.lr.ph ], [ %add.ptr83, %while.end ] + %pTempSrc.0219 = phi float* [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ] + %pOutput.0218 = phi float* [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ] + %4 = load float, float* %1, align 4 + %5 = load float, float* %incdec.ptr, align 4 + %6 = load float, float* %incdec.ptr7, align 4 + %7 = load float, float* %incdec.ptr8, align 4 + %8 = load float, float* %incdec.ptr9, align 4 + %9 = load float, float* %incdec.ptr10, align 4 + %10 = load float, float* %incdec.ptr11, align 4 + %11 = load float, float* %incdec.ptr12, align 4 + %12 = bitcast float* %pTempSrc.0219 to <4 x float>* + %13 = load <4 x float>, <4 x float>* %12, align 4 + %14 = bitcast float* %pStateCur.0221 to <4 x float>* + store <4 x float> %13, <4 x float>* %14, align 4 + %add.ptr = getelementptr inbounds float, float* %pStateCur.0221, i32 4 + %add.ptr14 = getelementptr inbounds float, float* %pTempSrc.0219, i32 4 + %15 = bitcast float* %pSamples.0220 to <4 x float>* + %16 = load <4 x float>, <4 x float>* %15, align 4 + %.splatinsert = insertelement <4 x float> undef, float %4, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + %17 = fmul fast <4 x float> %16, %.splat + %arrayidx15 = getelementptr inbounds float, float* %pSamples.0220, i32 1 + %18 = bitcast float* %arrayidx15 to <4 x float>* + %19 = load <4 x float>, <4 x float>* %18, align 4 + %.splatinsert16 = insertelement <4 x float> undef, float %5, i32 0 + %.splat17 = shufflevector <4 x float> %.splatinsert16, <4 x float> undef, <4 x i32> zeroinitializer + %20 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %19, <4 x float> %.splat17, <4 x float> %17) + %arrayidx18 = getelementptr inbounds float, float* %pSamples.0220, i32 2 + %21 = bitcast float* %arrayidx18 to <4 x float>* + %22 = load <4 x float>, <4 x float>* %21, align 4 + %.splatinsert19 = insertelement <4 x float> undef, float %6, i32 0 + %.splat20 = shufflevector <4 x float> %.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer + %23 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %22, <4 x float> %.splat20, <4 x float> %20) + %arrayidx21 = getelementptr inbounds float, float* %pSamples.0220, i32 3 + %24 = bitcast float* 
%arrayidx21 to <4 x float>* + %25 = load <4 x float>, <4 x float>* %24, align 4 + %.splatinsert22 = insertelement <4 x float> undef, float %7, i32 0 + %.splat23 = shufflevector <4 x float> %.splatinsert22, <4 x float> undef, <4 x i32> zeroinitializer + %26 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %25, <4 x float> %.splat23, <4 x float> %23) + %arrayidx24 = getelementptr inbounds float, float* %pSamples.0220, i32 4 + %27 = bitcast float* %arrayidx24 to <4 x float>* + %28 = load <4 x float>, <4 x float>* %27, align 4 + %.splatinsert25 = insertelement <4 x float> undef, float %8, i32 0 + %.splat26 = shufflevector <4 x float> %.splatinsert25, <4 x float> undef, <4 x i32> zeroinitializer + %29 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %28, <4 x float> %.splat26, <4 x float> %26) + %arrayidx27 = getelementptr inbounds float, float* %pSamples.0220, i32 5 + %30 = bitcast float* %arrayidx27 to <4 x float>* + %31 = load <4 x float>, <4 x float>* %30, align 4 + %.splatinsert28 = insertelement <4 x float> undef, float %9, i32 0 + %.splat29 = shufflevector <4 x float> %.splatinsert28, <4 x float> undef, <4 x i32> zeroinitializer + %32 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %31, <4 x float> %.splat29, <4 x float> %29) + %arrayidx30 = getelementptr inbounds float, float* %pSamples.0220, i32 6 + %33 = bitcast float* %arrayidx30 to <4 x float>* + %34 = load <4 x float>, <4 x float>* %33, align 4 + %.splatinsert31 = insertelement <4 x float> undef, float %10, i32 0 + %.splat32 = shufflevector <4 x float> %.splatinsert31, <4 x float> undef, <4 x i32> zeroinitializer + %35 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %34, <4 x float> %.splat32, <4 x float> %32) + %arrayidx33 = getelementptr inbounds float, float* %pSamples.0220, i32 7 + %36 = bitcast float* %arrayidx33 to <4 x float>* + %37 = load <4 x float>, <4 x float>* %36, align 4 + %.splatinsert34 = insertelement <4 x float> undef, float %11, i32 0 + %.splat35 = shufflevector <4 x float> %.splatinsert34, <4 x float> undef, <4 x i32> zeroinitializer + %38 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %37, <4 x float> %.splat35, <4 x float> %35) + %pSamples.1200 = getelementptr inbounds float, float* %pSamples.0220, i32 8 + br i1 %cmp38201, label %for.body, label %for.end + +for.body: ; preds = %while.body, %for.body + %pSamples.1207 = phi float* [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ] + %pCoeffsCur.0206 = phi float* [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ] + %.pn205 = phi float* [ %pCoeffsCur.0206, %for.body ], [ %1, %while.body ] + %i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ] + %vecAcc0.0203 = phi <4 x float> [ %70, %for.body ], [ %38, %while.body ] + %pSamples.0.pn202 = phi float* [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ] + %incdec.ptr40 = getelementptr inbounds float, float* %.pn205, i32 9 + %39 = load float, float* %pCoeffsCur.0206, align 4 + %incdec.ptr41 = getelementptr inbounds float, float* %.pn205, i32 10 + %40 = load float, float* %incdec.ptr40, align 4 + %incdec.ptr42 = getelementptr inbounds float, float* %.pn205, i32 11 + %41 = load float, float* %incdec.ptr41, align 4 + %incdec.ptr43 = getelementptr inbounds float, float* %.pn205, i32 12 + %42 = load float, float* %incdec.ptr42, align 4 + %incdec.ptr44 = getelementptr inbounds float, float* %.pn205, i32 13 + %43 = load float, float* %incdec.ptr43, align 4 + %incdec.ptr45 = getelementptr inbounds float, float* %.pn205, i32 14 + %44 = load float, 
float* %incdec.ptr44, align 4 + %incdec.ptr46 = getelementptr inbounds float, float* %.pn205, i32 15 + %45 = load float, float* %incdec.ptr45, align 4 + %46 = load float, float* %incdec.ptr46, align 4 + %47 = bitcast float* %pSamples.1207 to <4 x float>* + %48 = load <4 x float>, <4 x float>* %47, align 4 + %.splatinsert48 = insertelement <4 x float> undef, float %39, i32 0 + %.splat49 = shufflevector <4 x float> %.splatinsert48, <4 x float> undef, <4 x i32> zeroinitializer + %49 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %48, <4 x float> %.splat49, <4 x float> %vecAcc0.0203) + %arrayidx50 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 9 + %50 = bitcast float* %arrayidx50 to <4 x float>* + %51 = load <4 x float>, <4 x float>* %50, align 4 + %.splatinsert51 = insertelement <4 x float> undef, float %40, i32 0 + %.splat52 = shufflevector <4 x float> %.splatinsert51, <4 x float> undef, <4 x i32> zeroinitializer + %52 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %51, <4 x float> %.splat52, <4 x float> %49) + %arrayidx53 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 10 + %53 = bitcast float* %arrayidx53 to <4 x float>* + %54 = load <4 x float>, <4 x float>* %53, align 4 + %.splatinsert54 = insertelement <4 x float> undef, float %41, i32 0 + %.splat55 = shufflevector <4 x float> %.splatinsert54, <4 x float> undef, <4 x i32> zeroinitializer + %55 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %54, <4 x float> %.splat55, <4 x float> %52) + %arrayidx56 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 11 + %56 = bitcast float* %arrayidx56 to <4 x float>* + %57 = load <4 x float>, <4 x float>* %56, align 4 + %.splatinsert57 = insertelement <4 x float> undef, float %42, i32 0 + %.splat58 = shufflevector <4 x float> %.splatinsert57, <4 x float> undef, <4 x i32> zeroinitializer + %58 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %57, <4 x float> %.splat58, <4 x float> %55) + %arrayidx59 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 12 + %59 = bitcast float* %arrayidx59 to <4 x float>* + %60 = load <4 x float>, <4 x float>* %59, align 4 + %.splatinsert60 = insertelement <4 x float> undef, float %43, i32 0 + %.splat61 = shufflevector <4 x float> %.splatinsert60, <4 x float> undef, <4 x i32> zeroinitializer + %61 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %60, <4 x float> %.splat61, <4 x float> %58) + %arrayidx62 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 13 + %62 = bitcast float* %arrayidx62 to <4 x float>* + %63 = load <4 x float>, <4 x float>* %62, align 4 + %.splatinsert63 = insertelement <4 x float> undef, float %44, i32 0 + %.splat64 = shufflevector <4 x float> %.splatinsert63, <4 x float> undef, <4 x i32> zeroinitializer + %64 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %63, <4 x float> %.splat64, <4 x float> %61) + %arrayidx65 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 14 + %65 = bitcast float* %arrayidx65 to <4 x float>* + %66 = load <4 x float>, <4 x float>* %65, align 4 + %.splatinsert66 = insertelement <4 x float> undef, float %45, i32 0 + %.splat67 = shufflevector <4 x float> %.splatinsert66, <4 x float> undef, <4 x i32> zeroinitializer + %67 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %66, <4 x float> %.splat67, <4 x float> %64) + %arrayidx68 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 15 + %68 = bitcast float* %arrayidx68 to <4 x float>* + %69 = load <4 x float>, <4 x float>* %68, align 4 + 
%.splatinsert69 = insertelement <4 x float> undef, float %46, i32 0 + %.splat70 = shufflevector <4 x float> %.splatinsert69, <4 x float> undef, <4 x i32> zeroinitializer + %70 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %69, <4 x float> %.splat70, <4 x float> %67) + %inc = add nuw nsw i32 %i.0204, 1 + %pCoeffsCur.0 = getelementptr inbounds float, float* %pCoeffsCur.0206, i32 8 + %pSamples.1 = getelementptr inbounds float, float* %pSamples.1207, i32 8 + %exitcond = icmp eq i32 %inc, %smax + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %while.body + %vecAcc0.0.lcssa = phi <4 x float> [ %38, %while.body ], [ %70, %for.body ] + %pCoeffsCur.0.lcssa = phi float* [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ] + %pSamples.1.lcssa = phi float* [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ] + br i1 %cmp74210, label %while.end, label %while.body76 + +while.body76: ; preds = %for.end, %while.body76 + %pCoeffsCur.1214 = phi float* [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ] + %vecAcc0.1213 = phi <4 x float> [ %74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ] + %numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ] + %pSamples.2211 = phi float* [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ] + %incdec.ptr77 = getelementptr inbounds float, float* %pCoeffsCur.1214, i32 1 + %71 = load float, float* %pCoeffsCur.1214, align 4 + %72 = bitcast float* %pSamples.2211 to <4 x float>* + %73 = load <4 x float>, <4 x float>* %72, align 4 + %.splatinsert78 = insertelement <4 x float> undef, float %71, i32 0 + %.splat79 = shufflevector <4 x float> %.splatinsert78, <4 x float> undef, <4 x i32> zeroinitializer + %74 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %73, <4 x float> %.splat79, <4 x float> %vecAcc0.1213) + %incdec.ptr80 = getelementptr inbounds float, float* %pSamples.2211, i32 1 + %dec = add nsw i32 %numCnt.0212, -1 + %cmp74 = icmp sgt i32 %numCnt.0212, 1 + br i1 %cmp74, label %while.body76, label %while.end.loopexit + +while.end.loopexit: ; preds = %while.body76 + %scevgep = getelementptr float, float* %pSamples.1.lcssa, i32 %and + br label %while.end + +while.end: ; preds = %while.end.loopexit, %for.end + %pSamples.2.lcssa = phi float* [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ] + %vecAcc0.1.lcssa = phi <4 x float> [ %vecAcc0.0.lcssa, %for.end ], [ %74, %while.end.loopexit ] + %75 = bitcast float* %pOutput.0218 to <4 x float>* + store <4 x float> %vecAcc0.1.lcssa, <4 x float>* %75, align 4 + %add.ptr81 = getelementptr inbounds float, float* %pOutput.0218, i32 4 + %add.ptr82 = getelementptr inbounds float, float* %pSamples.2.lcssa, i32 4 + %add.ptr83 = getelementptr inbounds float, float* %add.ptr82, i32 %idx.neg + %dec84 = add nsw i32 %blkCnt.0222, -1 + %cmp5 = icmp eq i32 %dec84, 0 + br i1 %cmp5, label %if.end, label %while.body + +if.end: ; preds = %while.end, %if.then, %entry + ret void +} + +declare void @llvm.assume(i1) +declare <4 x i1> @llvm.arm.mve.vctp32(i32) +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) +declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll +++ 
b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll @@ -18,7 +18,6 @@ ; CHECK-NEXT: invoke void @f() ; CHECK-NEXT: to label [[BLOCK3:%.*]] unwind label [[CATCH_DISPATCH:%.*]] ; CHECK: block3: -; CHECK-NEXT: store i32 30, i32* [[SV]] ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: catch.dispatch: ; CHECK-NEXT: [[CS1:%.*]] = catchswitch within none [label %catch] unwind label [[CLEANUP:%.*]] diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll @@ -27,10 +27,9 @@ define void @test14(i32* noalias %P) { ; CHECK-LABEL: @test14( ; CHECK-NEXT: entry: -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br label [[FOR:%.*]] ; CHECK: for: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: br i1 false, label [[FOR]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret void @@ -77,7 +76,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* -; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i1 false) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i1 false) ; CHECK-NEXT: br label [[FOR:%.*]] ; CHECK: for: ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 @@ -281,3 +281,36 @@ ret void } +%struct.hoge = type { i32, i32 } + +@global = external local_unnamed_addr global %struct.hoge*, align 8 + +define void @widget(i8* %tmp) { +; CHECK-LABEL: @widget( +; CHECK-NEXT: bb: +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[TMP:%.*]], i8* nonnull align 16 undef, i64 64, i1 false) +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP2:%.*]] = load %struct.hoge*, %struct.hoge** @global, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[TMP2]], i64 undef, i32 1 +; CHECK-NEXT: store i32 0, i32* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load %struct.hoge*, %struct.hoge** @global, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HOGE]], %struct.hoge* [[TMP4]], i64 undef, i32 1 +; CHECK-NEXT: store i32 10, i32* [[TMP5]], align 4 +; CHECK-NEXT: br label [[BB1]] +; +bb: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %tmp, i8* nonnull align 16 undef, i64 64, i1 false) + br label %bb1 + +bb1: ; preds = %bb1, %bb + %tmp2 = load %struct.hoge*, %struct.hoge** @global, align 8 + %tmp3 = getelementptr inbounds %struct.hoge, %struct.hoge* %tmp2, i64 undef, i32 1 + store i32 0, i32* %tmp3, align 4 + %tmp4 = load %struct.hoge*, %struct.hoge** @global, align 8 + %tmp5 = getelementptr inbounds %struct.hoge, %struct.hoge* %tmp4, i64 undef, i32 1 + store i32 10, i32* %tmp5, align 4 + br label %bb1 +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll @@ 
-33,13 +33,11 @@ ; CHECK-LABEL: @test5( ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: -; CHECK-NEXT: store i32 1, i32* [[P]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: ret void ; br i1 true, label %bb1, label %bb2 @@ -58,13 +56,12 @@ ; CHECK-LABEL: @test8( ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: store i32 1, i32* [[Q:%.*]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: ret void ; br i1 true, label %bb1, label %bb2 @@ -115,7 +112,6 @@ ; CHECK: bb1: ; CHECK-NEXT: br i1 [[C2:%.*]], label [[BB2:%.*]], label [[BB3]] ; CHECK: bb2: -; CHECK-NEXT: store i32 -1, i32* [[PTR:%.*]], align 4 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: br label [[BB4:%.*]] @@ -126,7 +122,7 @@ ; CHECK-NEXT: i32 2, label [[BB7:%.*]] ; CHECK-NEXT: ] ; CHECK: bb5: -; CHECK-NEXT: store i32 0, i32* [[PTR]], align 4 +; CHECK-NEXT: store i32 0, i32* [[PTR:%.*]], align 4 ; CHECK-NEXT: br label [[BB8]] ; CHECK: bb6: ; CHECK-NEXT: store i32 1, i32* [[PTR]], align 4 @@ -173,3 +169,34 @@ bb8: ; preds = %bb7, %bb6, %bb5, %bb4 br label %bb4 } + + +declare void @fn1_test11() +declare void @fn2_test11() + +define void @test11(i1 %c, i8** %ptr.1) { +; CHECK-LABEL: @test11( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @fn2_test11() #0 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: store i8* null, i8** [[PTR_1:%.*]], align 8 +; CHECK-NEXT: tail call void @fn2_test11() #0 +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %if.then, label %exit + +if.then: ; preds = %entry + tail call void @fn2_test11() #1 + br label %exit + +exit: + store i8* null, i8** %ptr.1, align 8 + tail call void @fn2_test11() #1 + ret void +} + +attributes #1 = { nounwind } diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll @@ -127,7 +127,7 @@ ; CHECK: bb1: ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: -; CHECK-NEXT: ret void +; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: store i32 2, i32* [[P]] ; CHECK-NEXT: ret void @@ -142,8 +142,109 @@ bb1: br label %bb3 bb2: - ret void + br label %bb3 bb3: store i32 2, i32* %P ret void } + +define void @test10(i32* %P) { +; CHECK-LABEL: @test10( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 0, i32* %P + br label %bb3 +bb2: + ret void +bb3: + ret void +} + + +define void @test11() { +; CHECK-LABEL: @test11( +; CHECK-NEXT: [[P:%.*]] = alloca i32 +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 
0, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: ret void +; + %P = alloca i32 + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 0, i32* %P + br label %bb3 +bb2: + ret void +bb3: + ret void +} + + +define void @test12(i32* %P) { +; CHECK-LABEL: @test12( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 1, i32* %P + br label %bb3 +bb2: + store i32 1, i32* %P + ret void +bb3: + ret void +} + + +define void @test13(i32* %P) { +; CHECK-LABEL: @test13( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 1, i32* %P + br label %bb3 +bb2: + store i32 1, i32* %P + br label %bb3 +bb3: + ret void +} diff --git a/llvm/test/Transforms/InstCombine/align-attr.ll b/llvm/test/Transforms/InstCombine/align-attr.ll --- a/llvm/test/Transforms/InstCombine/align-attr.ll +++ b/llvm/test/Transforms/InstCombine/align-attr.ll @@ -20,7 +20,7 @@ ; CHECK-LABEL: @foo2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[V:%.*]] = call i32* @func1(i32* [[A:%.*]]) -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[V]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 32 ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/call-returned.ll b/llvm/test/Transforms/InstCombine/call-returned.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/call-returned.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -instcombine -expensive-combines=0 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-OFF +; RUN: opt -S -instcombine -expensive-combines=1 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-ON + +declare i32 @passthru_i32(i32 returned) +declare i8* @passthru_p8(i8* returned) + +define i32 @returned_const_int_arg() { +; CHECK-LABEL: @returned_const_int_arg( +; CHECK-NEXT: [[X:%.*]] = call i32 @passthru_i32(i32 42) +; CHECK-NEXT: ret i32 42 +; + %x = call i32 @passthru_i32(i32 42) + ret i32 %x +} + +define i8* @returned_const_ptr_arg() { +; CHECK-LABEL: @returned_const_ptr_arg( +; CHECK-NEXT: [[X:%.*]] = call i8* @passthru_p8(i8* null) +; CHECK-NEXT: ret i8* null +; + %x = call i8* @passthru_p8(i8* null) + ret i8* %x +} + +define i32 @returned_var_arg(i32 %arg) { +; CHECK-LABEL: @returned_var_arg( +; CHECK-NEXT: [[X:%.*]] = call i32 @passthru_i32(i32 [[ARG:%.*]]) +; CHECK-NEXT: ret i32 [[ARG]] +; + %x = call i32 @passthru_i32(i32 %arg) + ret i32 %x +} + +define i32 @returned_const_int_arg_musttail(i32 %arg) { +; CHECK-LABEL: @returned_const_int_arg_musttail( +; CHECK-NEXT: [[X:%.*]] = musttail call i32 @passthru_i32(i32 42) +; CHECK-NEXT: ret i32 [[X]] +; + %x = musttail call i32 @passthru_i32(i32 42) + ret i32 %x +} + +define i32 @returned_var_arg_musttail(i32 %arg) { +; CHECK-LABEL: @returned_var_arg_musttail( +; CHECK-NEXT: 
[[X:%.*]] = musttail call i32 @passthru_i32(i32 [[ARG:%.*]]) +; CHECK-NEXT: ret i32 [[X]] +; + %x = musttail call i32 @passthru_i32(i32 %arg) + ret i32 %x +} diff --git a/llvm/test/Transforms/InstCombine/expensive-combines.ll b/llvm/test/Transforms/InstCombine/expensive-combines.ll --- a/llvm/test/Transforms/InstCombine/expensive-combines.ll +++ b/llvm/test/Transforms/InstCombine/expensive-combines.ll @@ -16,7 +16,7 @@ ; ; EXPENSIVE-OFF-LABEL: @test( ; EXPENSIVE-OFF-NEXT: [[CALL:%.*]] = call i32 @passthru(i32 0) -; EXPENSIVE-OFF-NEXT: call void @sink(i32 [[CALL]]) +; EXPENSIVE-OFF-NEXT: call void @sink(i32 0) ; EXPENSIVE-OFF-NEXT: ret void ; %call = call i32 @passthru(i32 0) diff --git a/llvm/test/Transforms/InstCombine/fortify-folding.ll b/llvm/test/Transforms/InstCombine/fortify-folding.ll --- a/llvm/test/Transforms/InstCombine/fortify-folding.ll +++ b/llvm/test/Transforms/InstCombine/fortify-folding.ll @@ -82,7 +82,7 @@ define i8* @test_strcat() { ; CHECK-LABEL: @test_strcat( ; CHECK-NEXT: [[STRCAT:%.*]] = call i8* @strcat(i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @a, i64 0, i64 0), i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @b, i64 0, i64 0)) -; CHECK-NEXT: ret i8* [[STRCAT]] +; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i64 0, i64 0) ; %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0 %src = getelementptr inbounds [60 x i8], [60 x i8]* @b, i32 0, i32 0 @@ -126,7 +126,7 @@ define i8* @test_strncat() { ; CHECK-LABEL: @test_strncat( ; CHECK-NEXT: [[STRNCAT:%.*]] = call i8* @strncat(i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @a, i64 0, i64 0), i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @b, i64 0, i64 0), i64 22) -; CHECK-NEXT: ret i8* [[STRNCAT]] +; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i64 0, i64 0) ; %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0 %src = getelementptr inbounds [60 x i8], [60 x i8]* @b, i32 0, i32 0 diff --git a/llvm/test/Transforms/InstCombine/strcpy_chk-1.ll b/llvm/test/Transforms/InstCombine/strcpy_chk-1.ll --- a/llvm/test/Transforms/InstCombine/strcpy_chk-1.ll +++ b/llvm/test/Transforms/InstCombine/strcpy_chk-1.ll @@ -53,7 +53,7 @@ define i8* @test_simplify4() { ; CHECK-LABEL: @test_simplify4( ; CHECK-NEXT: [[STRCPY:%.*]] = call i8* @strcpy(i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @b, i32 0, i32 0)) -; CHECK-NEXT: ret i8* [[STRCPY]] +; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0) ; %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0 %src = getelementptr inbounds [60 x i8], [60 x i8]* @b, i32 0, i32 0 diff --git a/llvm/test/Transforms/InstCombine/strncpy_chk-1.ll b/llvm/test/Transforms/InstCombine/strncpy_chk-1.ll --- a/llvm/test/Transforms/InstCombine/strncpy_chk-1.ll +++ b/llvm/test/Transforms/InstCombine/strncpy_chk-1.ll @@ -39,7 +39,7 @@ define i8* @test_simplify3() { ; CHECK-LABEL: @test_simplify3( ; CHECK-NEXT: [[STRNCPY:%.*]] = call i8* @strncpy(i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @b, i32 0, i32 0), i32 12) -; CHECK-NEXT: ret i8* [[STRNCPY]] +; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8], [60 x i8]* 
@a, i32 0, i32 0) ; %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0 %src = getelementptr inbounds [60 x i8], [60 x i8]* @b, i32 0, i32 0 diff --git a/llvm/test/Transforms/InstCombine/unused-nonnull.ll b/llvm/test/Transforms/InstCombine/unused-nonnull.ll --- a/llvm/test/Transforms/InstCombine/unused-nonnull.ll +++ b/llvm/test/Transforms/InstCombine/unused-nonnull.ll @@ -12,13 +12,8 @@ ; CHECK-SAME: (i32 [[ARGC:%.*]], i8** nocapture readnone [[ARGV:%.*]]) local_unnamed_addr #0 ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = icmp slt i32 [[ARGC]], 2 -; CHECK-NEXT: br i1 [[TMP0]], label [[DONE:%.*]], label [[DO_WORK:%.*]] -; CHECK: do_work: -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @compute(i8* undef, i32 [[ARGC]]) -; CHECK-NEXT: br label [[DONE]] -; CHECK: done: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1]], [[DO_WORK]] ] -; CHECK-NEXT: ret i32 [[RETVAL]] +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP0]], i32 0, i32 [[ARGC]] +; CHECK-NEXT: ret i32 [[SPEC_SELECT]] ; entry: %0 = getelementptr inbounds i8*, i8** %argv, i32 0 diff --git a/llvm/test/Transforms/InstSimplify/call.ll b/llvm/test/Transforms/InstSimplify/call.ll --- a/llvm/test/Transforms/InstSimplify/call.ll +++ b/llvm/test/Transforms/InstSimplify/call.ll @@ -978,6 +978,10 @@ ret <2 x double> %r } +; We handle the "returned" attribute only in InstCombine, because the fact +; that this simplification may replace one call with another may cause issues +; for call graph passes. + declare i32 @passthru_i32(i32 returned) declare i8* @passthru_p8(i8* returned) diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-alloca.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-alloca.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-alloca.ll @@ -0,0 +1,94 @@ +;RUN: opt -mergesimilarfunc -mergesimilarfunc-level=all -S < %s | FileCheck %s +; +; Test whether mergefunc merges allocas of different sizes correctly +; + +target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32" + +%struct.A = type { i32, i32 } +%struct.B = type { i32, i32, i32 } + +; Function Attrs: nounwind optsize +define void @f1() #0 { +; CHECK-LABEL: @f1__merged( +; CHECK: alloca %struct.A +; CHECK: alloca %struct.B +entry: + %a = alloca %struct.A, align 4 + %0 = bitcast %struct.A* %a to i8* + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + ret void +} + +; Function Attrs: optsize +declare void @externalFun(i8*) #1 + +; Function Attrs: nounwind optsize +define void @f2() #0 { +entry: + %a = alloca %struct.B, align 4 + %0 = bitcast %struct.B* %a to i8* + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void 
@externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + ret void +} + +; Function Attrs: nounwind optsize +define void @f3() #0 { +entry: + %a = alloca i8, align 1 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + ret void +} + +attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind optsize } + diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info-2.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info-2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info-2.ll @@ -0,0 +1,101 @@ +; RUN: opt -S -mergesimilarfunc -mergesimilarfunc-diff-min-insts=5 < %s | FileCheck %s +; This used to fail with assertion in CloneFunction +; REQUIRES: asserts +; CHECK-LABEL: @foo__merged( + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios8.0.0" + +%struct.wibble = type { %struct.wibble.0*, %struct.wibble* } +%struct.wibble.0 = type { i64*, i8*, i64*, i64*, %struct.eggs*, %struct.wibble* } +%struct.eggs = type { %struct.wombat*, %struct.eggs* } +%struct.wombat = type { i8*, %struct.blam*, %struct.blam* } +%struct.blam = type { i8*, %struct.blam* } +%struct.snork = type { %struct.bar*, %struct.snork* } +%struct.bar = type { i64*, i8* } + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0 + +; Function Attrs: minsize nounwind optsize ssp uwtable +define hidden void @foo(%struct.wibble* %arg) #1 align 2 !dbg !4 { +bb: + %tmp = alloca %struct.wibble*, align 8 + %tmp2 = load %struct.wibble*, %struct.wibble** %tmp, align 8, !dbg !12 + %tmp3 = icmp ne %struct.wibble* %tmp2, null, !dbg !13 + br i1 %tmp3, label %bb4, label %bb13, !dbg !14 + +bb4: ; preds = %bb + call void @foo.1() #3, !dbg !26 + unreachable + +bb13: ; preds = %bb + ret void, !dbg !27 +} + +; Function Attrs: minsize nounwind optsize ssp uwtable +declare hidden void @foo.1() #1 align 2 + +; Function Attrs: minsize nounwind optsize ssp uwtable +define void @quux() unnamed_addr #1 align 2 !dbg !28 { +bb: + ret void +} + +; Function Attrs: minsize nounwind optsize ssp uwtable +define hidden void @baz(%struct.snork* %arg) #1 align 2 !dbg !30 { +bb: + %tmp = alloca %struct.snork*, align 8 + %tmp2 = load %struct.snork*, %struct.snork** %tmp, align 8, !dbg !31 + %tmp3 = icmp ne 
%struct.snork* %tmp2, null, !dbg !32 + br i1 %tmp3, label %bb4, label %bb13, !dbg !33 + +bb4: ; preds = %bb + call void @blam() #3, !dbg !42 + unreachable + +bb13: ; preds = %bb + ret void, !dbg !43 +} + +; Function Attrs: minsize nounwind optsize ssp uwtable +declare hidden void @blam() #1 align 2 + +attributes #0 = { argmemonly nounwind } +attributes #1 = { minsize nounwind optsize ssp uwtable } +attributes #2 = { nounwind } +attributes #3 = { minsize optsize } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "(based on LLVM 5.0.0)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "foo.cpp", directory: "/") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = distinct !DISubprogram(name: "delete", scope: !5, file: !5, line: 31, type: !6, isLocal: false, isDefinition: true, scopeLine: 32, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!5 = !DIFile(filename: "foo.h", directory: "/") +!6 = !DISubroutineType(types: !7) +!7 = !{} +!9 = !{!"any pointer", !10, i64 0} +!10 = !{!"omnipotent char", !11, i64 0} +!11 = !{!"Simple C++ TBAA"} +!12 = !DILocation(line: 33, column: 11, scope: !4) +!13 = !DILocation(line: 33, column: 16, scope: !4) +!14 = !DILocation(line: 33, column: 5, scope: !4) +!21 = !{!"_ZTSN", !9, i64 0, !9, i64 8} +!23 = !DILocation(line: 37, column: 25, scope: !4) +!24 = !DILocation(line: 37, column: 31, scope: !4) +!25 = !{!21, !9, i64 0} +!26 = !DILocation(line: 37, column: 7, scope: !4) +!27 = !DILocation(line: 41, column: 3, scope: !4) +!28 = distinct !DISubprogram(name: "~destruct", scope: !29, file: !29, line: 31, type: !6, isLocal: false, isDefinition: true, scopeLine: 32, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!29 = !DIFile(filename: "bar.h", directory: "/") +!30 = distinct !DISubprogram(name: "delete", scope: !5, file: !5, line: 31, type: !6, isLocal: false, isDefinition: true, scopeLine: 32, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!31 = !DILocation(line: 33, column: 11, scope: !30) +!32 = !DILocation(line: 33, column: 16, scope: !30) +!33 = !DILocation(line: 33, column: 5, scope: !30) +!40 = !DILocation(line: 37, column: 25, scope: !30) +!41 = !DILocation(line: 37, column: 31, scope: !30) +!42 = !DILocation(line: 37, column: 7, scope: !30) +!43 = !DILocation(line: 41, column: 3, scope: !30) diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info.ll @@ -0,0 +1,243 @@ +; This used to fail the verifier with the following error: +; "dbg attachment points at wrong subprogram for function" +; RUN: opt -S -mergesimilarfunc < %s | FileCheck %s +; REQUIRES: asserts +; CHECK-LABEL: @bar__merged( +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" + +%struct.str_type = type { i8*, i32, i32 } + +; Function Attrs: nounwind optsize +define i32 @bar(i8* %b) #0 !dbg !14 { +entry: + %retval = alloca i32, align 4 + %b.addr = alloca i8*, align 4 + %res = alloca i32, align 4 + %ee = alloca %struct.str_type*, align 4 + %cleanup.dest.slot = alloca i32 + store i8* %b, i8** %b.addr, align 4, !tbaa !31 + call void @llvm.dbg.declare(metadata i8** %b.addr, metadata !18, metadata !35), !dbg !36 + %0 = 
bitcast i32* %res to i8*, !dbg !37 + call void @llvm.lifetime.start(i64 4, i8* %0) #4, !dbg !37 + call void @llvm.dbg.declare(metadata i32* %res, metadata !19, metadata !35), !dbg !38 + %1 = bitcast %struct.str_type** %ee to i8*, !dbg !39 + call void @llvm.lifetime.start(i64 4, i8* %1) #4, !dbg !39 + call void @llvm.dbg.declare(metadata %struct.str_type** %ee, metadata !20, metadata !35), !dbg !40 + %2 = load i8*, i8** %b.addr, align 4, !dbg !41, !tbaa !31 + %3 = bitcast i8* %2 to %struct.str_type*, !dbg !42 + store %struct.str_type* %3, %struct.str_type** %ee, align 4, !dbg !40, !tbaa !31 + %4 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !43, !tbaa !31 + %set = getelementptr inbounds %struct.str_type, %struct.str_type* %4, i32 0, i32 1, !dbg !45 + %5 = load i32, i32* %set, align 4, !dbg !45, !tbaa !46 + %tobool = icmp ne i32 %5, 0, !dbg !43 + br i1 %tobool, label %if.end4, label %if.then, !dbg !49 + +if.then: ; preds = %entry + %6 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !50, !tbaa !31 + %set1 = getelementptr inbounds %struct.str_type, %struct.str_type* %6, i32 0, i32 1, !dbg !52 + store i32 1, i32* %set1, align 4, !dbg !53, !tbaa !46 + %7 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !54, !tbaa !31 + %x = getelementptr inbounds %struct.str_type, %struct.str_type* %7, i32 0, i32 0, !dbg !55 + %8 = load i8*, i8** %x, align 4, !dbg !55, !tbaa !56 + %call = call i32 @foo(i8* %8) #5, !dbg !57 + store i32 %call, i32* %res, align 4, !dbg !58, !tbaa !59 + %9 = load i32, i32* %res, align 4, !dbg !60, !tbaa !59 + %tobool2 = icmp ne i32 %9, 0, !dbg !60 + br i1 %tobool2, label %if.then3, label %if.end, !dbg !62 + +if.then3: ; preds = %if.then + store i32 1, i32* %retval, align 4, !dbg !63 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup, !dbg !63 + +if.end: ; preds = %if.then + br label %if.end4, !dbg !64 + +if.end4: ; preds = %if.end, %entry + store i32 0, i32* %retval, align 4, !dbg !65 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup, !dbg !65 + +cleanup: ; preds = %if.end4, %if.then3 + %10 = bitcast %struct.str_type** %ee to i8*, !dbg !66 + call void @llvm.lifetime.end(i64 4, i8* %10) #4, !dbg !66 + %11 = bitcast i32* %res to i8*, !dbg !66 + call void @llvm.lifetime.end(i64 4, i8* %11) #4, !dbg !66 + %12 = load i32, i32* %retval, align 4, !dbg !66 + ret i32 %12, !dbg !66 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #2 + +; Function Attrs: optsize +declare i32 @foo(i8*) #3 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #2 + +; Function Attrs: nounwind optsize +define i32 @bar1(i8* %b) #0 !dbg !21 { +entry: + %retval = alloca i32, align 4 + %b.addr = alloca i8*, align 4 + %res = alloca i32, align 4 + %ee = alloca %struct.str_type*, align 4 + %cleanup.dest.slot = alloca i32 + store i8* %b, i8** %b.addr, align 4, !tbaa !31 + call void @llvm.dbg.declare(metadata i8** %b.addr, metadata !23, metadata !35), !dbg !67 + %0 = bitcast i32* %res to i8*, !dbg !68 + call void @llvm.lifetime.start(i64 4, i8* %0) #4, !dbg !68 + call void @llvm.dbg.declare(metadata i32* %res, metadata !24, metadata !35), !dbg !69 + %1 = bitcast %struct.str_type** %ee to i8*, !dbg !70 + call void @llvm.lifetime.start(i64 4, i8* %1) #4, !dbg !70 + call void @llvm.dbg.declare(metadata %struct.str_type** %ee, metadata !25, metadata 
!35), !dbg !71 + %2 = load i8*, i8** %b.addr, align 4, !dbg !72, !tbaa !31 + %3 = bitcast i8* %2 to %struct.str_type*, !dbg !73 + store %struct.str_type* %3, %struct.str_type** %ee, align 4, !dbg !71, !tbaa !31 + %4 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !74, !tbaa !31 + %get = getelementptr inbounds %struct.str_type, %struct.str_type* %4, i32 0, i32 2, !dbg !76 + %5 = load i32, i32* %get, align 4, !dbg !76, !tbaa !77 + %tobool = icmp ne i32 %5, 0, !dbg !74 + br i1 %tobool, label %if.end4, label %if.then, !dbg !78 + +if.then: ; preds = %entry + %6 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !79, !tbaa !31 + %get1 = getelementptr inbounds %struct.str_type, %struct.str_type* %6, i32 0, i32 2, !dbg !81 + store i32 1, i32* %get1, align 4, !dbg !82, !tbaa !77 + %7 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !83, !tbaa !31 + %x = getelementptr inbounds %struct.str_type, %struct.str_type* %7, i32 0, i32 0, !dbg !84 + %8 = load i8*, i8** %x, align 4, !dbg !84, !tbaa !56 + %call = call i32 @foo(i8* %8) #5, !dbg !85 + store i32 %call, i32* %res, align 4, !dbg !86, !tbaa !59 + %9 = load i32, i32* %res, align 4, !dbg !87, !tbaa !59 + %tobool2 = icmp ne i32 %9, 0, !dbg !87 + br i1 %tobool2, label %if.then3, label %if.end, !dbg !89 + +if.then3: ; preds = %if.then + store i32 1, i32* %retval, align 4, !dbg !90 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup, !dbg !90 + +if.end: ; preds = %if.then + br label %if.end4, !dbg !91 + +if.end4: ; preds = %if.end, %entry + store i32 0, i32* %retval, align 4, !dbg !92 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup, !dbg !92 + +cleanup: ; preds = %if.end4, %if.then3 + %10 = bitcast %struct.str_type** %ee to i8*, !dbg !93 + call void @llvm.lifetime.end(i64 4, i8* %10) #4, !dbg !93 + %11 = bitcast i32* %res to i8*, !dbg !93 + call void @llvm.lifetime.end(i64 4, i8* %11) #4, !dbg !93 + %12 = load i32, i32* %retval, align 4, !dbg !93 + ret i32 %12, !dbg !93 +} + +attributes #0 = { nounwind optsize } +attributes #1 = { nounwind readnone } +attributes #2 = { argmemonly nounwind } +attributes #3 = { optsize } +attributes #4 = { nounwind } +attributes #5 = { optsize } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!26, !27} +!llvm.ident = !{!28} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Clang $LLVM_VERSION_MAJOR.$LLVM_VERSION_MINOR (based on LLVM 3.9.0)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !3) +!1 = !DIFile(filename: "test.i", directory: "/local/mnt/") +!2 = !{} +!3 = !{!4} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 32, align: 32) +!5 = !DIDerivedType(tag: DW_TAG_typedef, name: "str_type", file: !1, line: 8, baseType: !6) +!6 = !DICompositeType(tag: DW_TAG_structure_type, name: "str_type", file: !1, line: 3, size: 96, align: 32, elements: !7) +!7 = !{!8, !10, !12} +!8 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !6, file: !1, line: 5, baseType: !9, size: 32, align: 32) +!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 32, align: 32) +!10 = !DIDerivedType(tag: DW_TAG_member, name: "set", scope: !6, file: !1, line: 6, baseType: !11, size: 32, align: 32, offset: 32) +!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!12 = !DIDerivedType(tag: DW_TAG_member, name: "get", scope: !6, file: !1, line: 7, baseType: !11, size: 32, align: 32, offset: 64) +!14 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, 
line: 10, type: !15, isLocal: false, isDefinition: true, scopeLine: 10, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !17) +!15 = !DISubroutineType(types: !16) +!16 = !{!11, !9} +!17 = !{!18, !19, !20} +!18 = !DILocalVariable(name: "b", arg: 1, scope: !14, file: !1, line: 10, type: !9) +!19 = !DILocalVariable(name: "res", scope: !14, file: !1, line: 11, type: !11) +!20 = !DILocalVariable(name: "ee", scope: !14, file: !1, line: 12, type: !4) +!21 = distinct !DISubprogram(name: "bar1", scope: !1, file: !1, line: 24, type: !15, isLocal: false, isDefinition: true, scopeLine: 24, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !22) +!22 = !{!23, !24, !25} +!23 = !DILocalVariable(name: "b", arg: 1, scope: !21, file: !1, line: 24, type: !9) +!24 = !DILocalVariable(name: "res", scope: !21, file: !1, line: 25, type: !11) +!25 = !DILocalVariable(name: "ee", scope: !21, file: !1, line: 26, type: !4) +!26 = !{i32 2, !"Dwarf Version", i32 4} +!27 = !{i32 2, !"Debug Info Version", i32 3} +!28 = !{!"Clang $LLVM_VERSION_MAJOR.$LLVM_VERSION_MINOR (based on LLVM 3.9.0)"} +!31 = !{!32, !32, i64 0} +!32 = !{!"any pointer", !33, i64 0} +!33 = !{!"omnipotent char", !34, i64 0} +!34 = !{!"Simple C/C++ TBAA"} +!35 = !DIExpression() +!36 = !DILocation(line: 10, column: 23, scope: !14) +!37 = !DILocation(line: 11, column: 3, scope: !14) +!38 = !DILocation(line: 11, column: 7, scope: !14) +!39 = !DILocation(line: 12, column: 3, scope: !14) +!40 = !DILocation(line: 12, column: 13, scope: !14) +!41 = !DILocation(line: 12, column: 31, scope: !14) +!42 = !DILocation(line: 12, column: 19, scope: !14) +!43 = !DILocation(line: 14, column: 8, scope: !44) +!44 = distinct !DILexicalBlock(scope: !14, file: !1, line: 14, column: 7) +!45 = !DILocation(line: 14, column: 12, scope: !44) +!46 = !{!47, !48, i64 4} +!47 = !{!"str_type", !32, i64 0, !48, i64 4, !48, i64 8} +!48 = !{!"int", !33, i64 0} +!49 = !DILocation(line: 14, column: 7, scope: !14) +!50 = !DILocation(line: 15, column: 5, scope: !51) +!51 = distinct !DILexicalBlock(scope: !44, file: !1, line: 14, column: 18) +!52 = !DILocation(line: 15, column: 9, scope: !51) +!53 = !DILocation(line: 15, column: 13, scope: !51) +!54 = !DILocation(line: 17, column: 16, scope: !51) +!55 = !DILocation(line: 17, column: 20, scope: !51) +!56 = !{!47, !32, i64 0} +!57 = !DILocation(line: 17, column: 11, scope: !51) +!58 = !DILocation(line: 17, column: 9, scope: !51) +!59 = !{!48, !48, i64 0} +!60 = !DILocation(line: 18, column: 9, scope: !61) +!61 = distinct !DILexicalBlock(scope: !51, file: !1, line: 18, column: 9) +!62 = !DILocation(line: 18, column: 9, scope: !51) +!63 = !DILocation(line: 19, column: 7, scope: !61) +!64 = !DILocation(line: 20, column: 3, scope: !51) +!65 = !DILocation(line: 21, column: 3, scope: !14) +!66 = !DILocation(line: 22, column: 1, scope: !14) +!67 = !DILocation(line: 24, column: 24, scope: !21) +!68 = !DILocation(line: 25, column: 3, scope: !21) +!69 = !DILocation(line: 25, column: 7, scope: !21) +!70 = !DILocation(line: 26, column: 3, scope: !21) +!71 = !DILocation(line: 26, column: 13, scope: !21) +!72 = !DILocation(line: 26, column: 31, scope: !21) +!73 = !DILocation(line: 26, column: 19, scope: !21) +!74 = !DILocation(line: 28, column: 8, scope: !75) +!75 = distinct !DILexicalBlock(scope: !21, file: !1, line: 28, column: 7) +!76 = !DILocation(line: 28, column: 12, scope: !75) +!77 = !{!47, !48, i64 8} +!78 = !DILocation(line: 28, column: 7, scope: !21) +!79 = !DILocation(line: 29, column: 5, scope: !80) +!80 = distinct 
!DILexicalBlock(scope: !75, file: !1, line: 28, column: 17) +!81 = !DILocation(line: 29, column: 9, scope: !80) +!82 = !DILocation(line: 29, column: 13, scope: !80) +!83 = !DILocation(line: 31, column: 16, scope: !80) +!84 = !DILocation(line: 31, column: 20, scope: !80) +!85 = !DILocation(line: 31, column: 11, scope: !80) +!86 = !DILocation(line: 31, column: 9, scope: !80) +!87 = !DILocation(line: 32, column: 9, scope: !88) +!88 = distinct !DILexicalBlock(scope: !80, file: !1, line: 32, column: 9) +!89 = !DILocation(line: 32, column: 9, scope: !80) +!90 = !DILocation(line: 33, column: 7, scope: !88) +!91 = !DILocation(line: 34, column: 3, scope: !80) +!92 = !DILocation(line: 35, column: 3, scope: !21) +!93 = !DILocation(line: 36, column: 1, scope: !21) diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent-template.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent-template.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent-template.ll @@ -0,0 +1,138 @@ +; RUN: opt -S -mergesimilarfunc %s -o - | FileCheck %s + +; CHECK: define linkonce_odr void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* nocapture readnone %this, %struct.FooStruct* %arg0) #1 align 2 { +; CHECK-NEXT: entry: +; CHECK-NEXT: %field27 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0, i32 0, i32 2 +; CHECK-NEXT: %0 = load i8, i8* %field27, align 1 +; CHECK-NEXT: %lnot8 = icmp eq i8 %0, 0 +; CHECK-NEXT: br i1 %lnot8, label %for.body, label %for.end + +; CHECK: for.body: ; preds = %for.body, %entry +; CHECK-NEXT: %arg0.addr.09 = phi %struct.FooStruct* [ %2, %for.body ], [ %arg0, %entry ] +; CHECK-NEXT: %field1 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 1 +; CHECK-NEXT: %1 = load %struct.FooStruct*, %struct.FooStruct** %field1, align 4 +; CHECK-NEXT: tail call void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* %this, %struct.FooStruct* %1) #2 +; CHECK-NEXT: %field0 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 0 +; CHECK-NEXT: %2 = load %struct.FooStruct*, %struct.FooStruct** %field0, align 4 +; CHECK-NEXT: %3 = bitcast %struct.FooStruct* %arg0.addr.09 to i8* +; CHECK-NEXT: tail call void @_Z4bar0Pv(i8* %3) #3 +; CHECK-NEXT: tail call void @_Z4bar1Pv(i8* %3) #3 +; CHECK-NEXT: %field2 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %2, i32 0, i32 2 +; CHECK-NEXT: %4 = load i8, i8* %field2, align 1 +; CHECK-NEXT: %lnot = icmp eq i8 %4, 0 +; CHECK-NEXT: br i1 %lnot, label %for.body, label %for.end + +; CHECK: for.end: ; preds = %for.body, %entry +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define linkonce_odr void @_ZN11FooTemplateIiE7method1EP9FooStruct(%class.FooTemplate* nocapture readnone, %struct.FooStruct*) #1 align 2 { +; CHECK-NEXT: %3 = bitcast %class.FooTemplate* %0 to %class.FooTemplate.0* +; CHECK-NEXT: tail call void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* nocapture readnone %3, %struct.FooStruct* %1) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: attributes #0 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +; CHECK-NEXT: attributes #1 = { nounwind optsize ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" 
"no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +; CHECK-NEXT: attributes #2 = { optsize } +; CHECK-NEXT: attributes #3 = { nounwind optsize } + +target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32" + +%class.Foo = type { %class.FooTemplate, %class.FooTemplate.0 } +%class.FooTemplate = type { i8 } +%class.FooTemplate.0 = type { i8 } +%struct.FooStruct = type { %struct.FooStruct*, %struct.FooStruct*, i8 } + +@_ZN3FooD1Ev = alias void(%class.Foo*), void (%class.Foo*)* @_ZN3FooD2Ev + +declare %struct.FooStruct* @_ZNK11FooTemplateIPvE7method2Ev(%class.FooTemplate.0*) #1 +declare void @_Z4bar0Pv(i8*) #1 +declare void @_Z4bar1Pv(i8*) #1 +declare %struct.FooStruct* @_ZNK11FooTemplateIiE7method2Ev(%class.FooTemplate*) #1 + +; Function Attrs: nounwind optsize ssp +define void @_ZN3FooD2Ev(%class.Foo* %this) unnamed_addr #0 align 2 { +entry: + %bar_things = getelementptr inbounds %class.Foo, %class.Foo* %this, i32 0, i32 0 + tail call void @_ZN11FooTemplateIiE7method0Ev(%class.FooTemplate* %bar_things) #2 + %baz_things = getelementptr inbounds %class.Foo, %class.Foo* %this, i32 0, i32 1 + tail call void @_ZN11FooTemplateIPvE7method0Ev(%class.FooTemplate.0* %baz_things) #2 + ret void +} + +; Function Attrs: nounwind optsize ssp +define linkonce_odr void @_ZN11FooTemplateIiE7method0Ev(%class.FooTemplate* %this) #0 align 2 { +entry: + %call = tail call %struct.FooStruct* @_ZNK11FooTemplateIiE7method2Ev(%class.FooTemplate* %this) #3 + tail call void @_ZN11FooTemplateIiE7method1EP9FooStruct(%class.FooTemplate* %this, %struct.FooStruct* %call) #2 + ret void +} + +; Function Attrs: nounwind optsize ssp +define linkonce_odr void @_ZN11FooTemplateIPvE7method0Ev(%class.FooTemplate.0* %this) #0 align 2 { +entry: + %call = tail call %struct.FooStruct* @_ZNK11FooTemplateIPvE7method2Ev(%class.FooTemplate.0* %this) #3 + tail call void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* %this, %struct.FooStruct* %call) #2 + ret void +} + +; Function Attrs: nounwind optsize ssp +define linkonce_odr void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* nocapture readnone %this, %struct.FooStruct* %arg0) #0 align 2 { +entry: + %field27 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0, i32 0, i32 2 + %0 = load i8, i8* %field27, align 1 + %lnot8 = icmp eq i8 %0, 0 + br i1 %lnot8, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %arg0.addr.09 = phi %struct.FooStruct* [ %2, %for.body ], [ %arg0, %entry ] + %field1 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 1 + %1 = load %struct.FooStruct*, %struct.FooStruct** %field1, align 4 + tail call void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* %this, %struct.FooStruct* %1) #2 + %field0 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 0 + %2 = load %struct.FooStruct*, %struct.FooStruct** %field0, align 4 + %3 = bitcast %struct.FooStruct* %arg0.addr.09 to i8* + tail call void @_Z4bar0Pv(i8* %3) #3 + tail call void @_Z4bar1Pv(i8* %3) #3 + %field2 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %2, i32 0, i32 2 + %4 = load i8, i8* %field2, align 1 + %lnot = icmp eq i8 %4, 0 + br i1 %lnot, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + +; Function Attrs: nounwind optsize ssp +define linkonce_odr void 
@_ZN11FooTemplateIiE7method1EP9FooStruct(%class.FooTemplate* nocapture readnone %this, %struct.FooStruct* %arg0) #0 align 2 { +entry: + %field27 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0, i32 0, i32 2 + %0 = load i8, i8* %field27, align 1 + %lnot8 = icmp eq i8 %0, 0 + br i1 %lnot8, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %arg0.addr.09 = phi %struct.FooStruct* [ %2, %for.body ], [ %arg0, %entry ] + %field1 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 1 + %1 = load %struct.FooStruct*, %struct.FooStruct** %field1, align 4 + tail call void @_ZN11FooTemplateIiE7method1EP9FooStruct(%class.FooTemplate* %this, %struct.FooStruct* %1) #2 + %field0 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 0 + %2 = load %struct.FooStruct*, %struct.FooStruct** %field0, align 4 + %3 = bitcast %struct.FooStruct* %arg0.addr.09 to i8* + tail call void @_Z4bar0Pv(i8* %3) #3 + tail call void @_Z4bar1Pv(i8* %3) #3 + %field2 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %2, i32 0, i32 2 + %4 = load i8, i8* %field2, align 1 + %lnot = icmp eq i8 %4, 0 + br i1 %lnot, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + +attributes #0 = { nounwind optsize ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { optsize } +attributes #3 = { nounwind optsize } + diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent.ll @@ -0,0 +1,226 @@ +; RUN: opt -mergesimilarfunc -S %s -o - | FileCheck %s +; +; CHECK: define i8* @foo_a(%struct.a_type* %arg0) #1 { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label %for.cond + +; CHECK: for.cond: ; preds = %for.inc24, %entry +; CHECK-NEXT: %i.0 = phi i8 [ 0, %entry ], [ %inc25, %for.inc24 ] +; CHECK-NEXT: %ptr0.0 = phi i8* [ null, %entry ], [ %ptr0.3, %for.inc24 ] +; CHECK-NEXT: %conv = zext i8 %i.0 to i32 +; CHECK-NEXT: %cmp = icmp slt i32 %conv, 16 +; CHECK-NEXT: br i1 %cmp, label %for.body, label %for.end26 + +; CHECK: for.body: ; preds = %for.cond +; CHECK-NEXT: %call = call i8* @bar4(i32 %conv) #2 +; CHECK-NEXT: %call4 = call i8* @bar0(i32 %conv) #2 +; CHECK-NEXT: %0 = bitcast i8* %call4 to %struct.a_type* +; CHECK-NEXT: %call5 = call i32 @bar1(i8* %call) #2 +; CHECK-NEXT: %tobool = icmp ne i32 %call5, 0 +; CHECK-NEXT: br i1 %tobool, label %for.cond6, label %for.inc24 + +; CHECK: for.cond6: ; preds = %for.inc, %for.body +; CHECK-NEXT: %k.0 = phi i8 [ %inc, %for.inc ], [ 0, %for.body ] +; CHECK-NEXT: %ptr0.1 = phi i8* [ %ptr0.2, %for.inc ], [ %call, %for.body ] +; CHECK-NEXT: %idxprom = zext i8 %k.0 to i32 +; CHECK-NEXT: %field4 = getelementptr inbounds %struct.a_type, %struct.a_type* %arg0, i32 0, i32 4 +; CHECK-NEXT: %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* %field4, i32 0, i32 %idxprom +; CHECK-NEXT: %1 = load i8*, i8** %arrayidx, align 4 +; CHECK-NEXT: %cmp7 = 
icmp ne i8* %1, null +; CHECK-NEXT: br i1 %cmp7, label %land.rhs, label %for.inc24 + +; CHECK: land.rhs: ; preds = %for.cond6 +; CHECK-NEXT: %field410 = getelementptr inbounds %struct.a_type, %struct.a_type* %0, i32 0, i32 4 +; CHECK-NEXT: %arrayidx11 = getelementptr inbounds [2 x i8*], [2 x i8*]* %field410, i32 0, i32 %idxprom +; CHECK-NEXT: %2 = load i8*, i8** %arrayidx11, align 4 +; CHECK-NEXT: %cmp12 = icmp ne i8* %2, null +; CHECK-NEXT: br i1 %cmp12, label %for.body14, label %for.inc24 + +; CHECK: for.body14: ; preds = %land.rhs +; CHECK-NEXT: %3 = bitcast %struct.a_type* %0 to i8* +; CHECK-NEXT: %4 = bitcast %struct.a_type* %arg0 to i8* +; CHECK-NEXT: %call15 = call i32 @bar2(i8* %3, i8* %4) #2 +; CHECK-NEXT: %tobool16 = icmp ne i32 %call15, 0 +; CHECK-NEXT: br i1 %tobool16, label %if.then17, label %for.inc + +; CHECK: if.then17: ; preds = %for.body14 +; CHECK-NEXT: %call18 = call i32 @bar3(i8* %3, i8* %4) #2 +; CHECK-NEXT: %tobool19 = icmp ne i32 %call18, 0 +; CHECK-NEXT: br i1 %tobool19, label %if.then20, label %for.inc + +; CHECK: if.then20: ; preds = %if.then17 +; CHECK-NEXT: br label %for.inc + +; CHECK: for.inc: ; preds = %if.then20, %if.then17, %for.body14 +; CHECK-NEXT: %ptr0.2 = phi i8* [ null, %if.then20 ], [ %ptr0.1, %if.then17 ], [ null, %for.body14 ] +; CHECK-NEXT: %inc = add i8 %k.0, 1 +; CHECK-NEXT: br label %for.cond6 + +; CHECK: for.inc24: ; preds = %land.rhs, %for.cond6, %for.body +; CHECK-NEXT: %ptr0.3 = phi i8* [ %ptr0.1, %land.rhs ], [ %ptr0.1, %for.cond6 ], [ null, %for.body ] +; CHECK-NEXT: %inc25 = add i8 %i.0, 1 +; CHECK-NEXT: br label %for.cond + +; CHECK: for.end26: ; preds = %for.cond +; CHECK-NEXT: ret i8* %ptr0.0 +; CHECK-NEXT: } + +; CHECK: ; Function Attrs: nounwind optsize ssp +; CHECK-NEXT: define i8* @foo_b(%struct.b_type*) #1 { +; CHECK-NEXT: %2 = bitcast %struct.b_type* %0 to %struct.a_type* +; CHECK-NEXT: %3 = tail call i8* @foo_a(%struct.a_type* %2) +; CHECK-NEXT: ret i8* %3 +; CHECK-NEXT: } + +; CHECK: attributes #0 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +; CHECK-NEXT: attributes #1 = { nounwind optsize ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +; CHECK-NEXT: attributes #2 = { optsize } + +target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32" + +%struct.a_type = type { i32, i32, i32, i8*, [2 x i8*] } +%struct.b_type = type { i32, i32, i32, i8*, [2 x i8*], i32 } + +; Function Attrs: optsize +declare i8* @bar4(i32) #1 +declare i8* @bar0(i32) #1 +declare i32 @bar1(i8*) #1 +declare i32 @bar2(i8*, i8*) #1 +declare i32 @bar3(i8*, i8*) #1 + +; Function Attrs: nounwind optsize ssp +define i8* @foo_a(%struct.a_type* %arg0) #0 { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc24, %entry + %i.0 = phi i8 [ 0, %entry ], [ %inc25, %for.inc24 ] + %ptr0.0 = phi i8* [ null, %entry ], [ %ptr0.3, %for.inc24 ] + %conv = zext i8 %i.0 to i32 + %cmp = icmp slt i32 %conv, 16 + br i1 %cmp, label %for.body, label %for.end26 + +for.body: ; preds = %for.cond + %call = call i8* @bar4(i32 %conv) #2 + %call4 = call i8* @bar0(i32 %conv) #2 + %0 = bitcast i8* %call4 to %struct.a_type* + %call5 = call i32 @bar1(i8* %call) #2 + %tobool = icmp ne i32 %call5, 0 + br 
i1 %tobool, label %for.cond6, label %for.inc24 + +for.cond6: ; preds = %for.body, %for.inc + %k.0 = phi i8 [ %inc, %for.inc ], [ 0, %for.body ] + %ptr0.1 = phi i8* [ %ptr0.2, %for.inc ], [ %call, %for.body ] + %idxprom = zext i8 %k.0 to i32 + %field4 = getelementptr inbounds %struct.a_type, %struct.a_type* %arg0, i32 0, i32 4 + %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* %field4, i32 0, i32 %idxprom + %1 = load i8*, i8** %arrayidx, align 4 + %cmp7 = icmp ne i8* %1, null + br i1 %cmp7, label %land.rhs, label %for.inc24 + +land.rhs: ; preds = %for.cond6 + %field410 = getelementptr inbounds %struct.a_type, %struct.a_type* %0, i32 0, i32 4 + %arrayidx11 = getelementptr inbounds [2 x i8*], [2 x i8*]* %field410, i32 0, i32 %idxprom + %2 = load i8*, i8** %arrayidx11, align 4 + %cmp12 = icmp ne i8* %2, null + br i1 %cmp12, label %for.body14, label %for.inc24 + +for.body14: ; preds = %land.rhs + %3 = bitcast %struct.a_type* %0 to i8* + %4 = bitcast %struct.a_type* %arg0 to i8* + %call15 = call i32 @bar2(i8* %3, i8* %4) #2 + %tobool16 = icmp ne i32 %call15, 0 + br i1 %tobool16, label %if.then17, label %for.inc + +if.then17: ; preds = %for.body14 + %call18 = call i32 @bar3(i8* %3, i8* %4) #2 + %tobool19 = icmp ne i32 %call18, 0 + br i1 %tobool19, label %if.then20, label %for.inc + +if.then20: ; preds = %if.then17 + br label %for.inc + +for.inc: ; preds = %for.body14, %if.then17, %if.then20 + %ptr0.2 = phi i8* [ null, %if.then20 ], [ %ptr0.1, %if.then17 ], [ null, %for.body14 ] + %inc = add i8 %k.0, 1 + br label %for.cond6 + +for.inc24: ; preds = %for.body, %for.cond6, %land.rhs + %ptr0.3 = phi i8* [ %ptr0.1, %land.rhs ], [ %ptr0.1, %for.cond6 ], [ null, %for.body ] + %inc25 = add i8 %i.0, 1 + br label %for.cond + +for.end26: ; preds = %for.cond + ret i8* %ptr0.0 +} + +; Function Attrs: nounwind optsize ssp +define i8* @foo_b(%struct.b_type* %arg0) #0 { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc24, %entry + %i.0 = phi i8 [ 0, %entry ], [ %inc25, %for.inc24 ] + %ptr0.0 = phi i8* [ null, %entry ], [ %ptr0.3, %for.inc24 ] + %conv = zext i8 %i.0 to i32 + %cmp = icmp slt i32 %conv, 16 + br i1 %cmp, label %for.body, label %for.end26 + +for.body: ; preds = %for.cond + %call = call i8* @bar4(i32 %conv) #2 + %call4 = call i8* @bar0(i32 %conv) #2 + %0 = bitcast i8* %call4 to %struct.b_type* + %call5 = call i32 @bar1(i8* %call) #2 + %tobool = icmp ne i32 %call5, 0 + br i1 %tobool, label %for.cond6, label %for.inc24 + +for.cond6: ; preds = %for.body, %for.inc + %k.0 = phi i8 [ %inc, %for.inc ], [ 0, %for.body ] + %ptr0.1 = phi i8* [ %ptr0.2, %for.inc ], [ %call, %for.body ] + %idxprom = zext i8 %k.0 to i32 + %field4 = getelementptr inbounds %struct.b_type, %struct.b_type* %arg0, i32 0, i32 4 + %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* %field4, i32 0, i32 %idxprom + %1 = load i8*, i8** %arrayidx, align 4 + %cmp7 = icmp ne i8* %1, null + br i1 %cmp7, label %land.rhs, label %for.inc24 + +land.rhs: ; preds = %for.cond6 + %field410 = getelementptr inbounds %struct.b_type, %struct.b_type* %0, i32 0, i32 4 + %arrayidx11 = getelementptr inbounds [2 x i8*], [2 x i8*]* %field410, i32 0, i32 %idxprom + %2 = load i8*, i8** %arrayidx11, align 4 + %cmp12 = icmp ne i8* %2, null + br i1 %cmp12, label %for.body14, label %for.inc24 + +for.body14: ; preds = %land.rhs + %3 = bitcast %struct.b_type* %0 to i8* + %4 = bitcast %struct.b_type* %arg0 to i8* + %call15 = call i32 @bar2(i8* %3, i8* %4) #2 + %tobool16 = icmp ne i32 %call15, 0 + br i1 %tobool16, label %if.then17, label %for.inc + 
+if.then17: ; preds = %for.body14 + %call18 = call i32 @bar3(i8* %3, i8* %4) #2 + %tobool19 = icmp ne i32 %call18, 0 + br i1 %tobool19, label %if.then20, label %for.inc + +if.then20: ; preds = %if.then17 + br label %for.inc + +for.inc: ; preds = %for.body14, %if.then17, %if.then20 + %ptr0.2 = phi i8* [ null, %if.then20 ], [ %ptr0.1, %if.then17 ], [ null, %for.body14 ] + %inc = add i8 %k.0, 1 + br label %for.cond6 + +for.inc24: ; preds = %for.body, %for.cond6, %land.rhs + %ptr0.3 = phi i8* [ %ptr0.1, %land.rhs ], [ %ptr0.1, %for.cond6 ], [ null, %for.body ] + %inc25 = add i8 %i.0, 1 + br label %for.cond + +for.end26: ; preds = %for.cond + ret i8* %ptr0.0 +} + +attributes #0 = { nounwind optsize ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { optsize } + diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-noinline.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-noinline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-noinline.ll @@ -0,0 +1,25 @@ +; RUN: opt -S -mergesimilarfunc -mergesimilarfunc-level=all < %s | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +declare void @stuff() + +; CHECK-LABEL: @f0( +define void @f0(i64 %p0) { +entry: + call void @stuff() + call void @stuff() + call void @stuff() + ret void +} + +; CHECK-LABEL: @f1( +; CHECK: call void @f0{{.*}} #[[ATTR:[0-9]+]] +; CHECK: attributes #[[ATTR]] = { {{.*}}noinline +define void @f1(i64 %p0) { +entry: + call void @stuff() + call void @stuff() + call void @stuff() + ret void +} + diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-ret.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-ret.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-ret.ll @@ -0,0 +1,207 @@ +; Check that the ret instructions are merged correctly. A bug caused +; an incorrect merge and a verifier failure for this input. 
+; +; RUN: opt -S -mergesimilarfunc < %s | FileCheck %s +; +; CHECK-LABEL: define internal %0* @LLVMGetReturnType__merged +; CHECK: phi %0* [ +; CHECK-NEXT: ret %0* +; CHECK-NEXT: } +; + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" + +%0 = type opaque +%1 = type { %2*, i32, i32, %1** } +%2 = type { %3* } +%3 = type opaque +%4 = type { %1 } +%5 = type opaque +%6 = type { i32 (...)**, %1*, %7*, i8, i8, i16, i32 } +%7 = type { %6*, %7*, %8 } +%8 = type { i32 } +%9 = type { %10, %1*, i32, %12, i32, i8, %18* } +%10 = type { %11 } +%11 = type { %6 } +%12 = type { %13 } +%13 = type { %14 } +%14 = type { %15 } +%15 = type { %16 } +%16 = type { %17 } +%17 = type { i32, i32, i8* } +%18 = type <{ %2*, %19, %30, %70, %79, %87, %12, %93*, %94, %98, %12, %12, %12, i8*, %102, i8, [3 x i8] }> +%19 = type { %20 } +%20 = type { %21, %26* } +%21 = type { %22 } +%22 = type { %23 } +%23 = type { %24 } +%24 = type { %25, %26* } +%25 = type { %26* } +%26 = type <{ %27, [3 x i8], %24, i8, [3 x i8] }> +%27 = type <{ %9, %12, %28*, %12, %12, i8 }> +%28 = type <{ %29*, i8, [3 x i8] }> +%29 = type opaque +%30 = type { %31 } +%31 = type { %32, %37* } +%32 = type { %33 } +%33 = type { %34 } +%34 = type { %35 } +%35 = type { %36, %37* } +%36 = type { %37* } +%37 = type { %27, %35, %38, %60, %93*, %68, %4* } +%38 = type { %39 } +%39 = type { %40, %44* } +%40 = type { %41 } +%41 = type { %42 } +%42 = type { %43 } +%43 = type { %44* } +%44 = type { %6, %45, i32, %47, %37* } +%45 = type { %46 } +%46 = type { %43, %44* } +%47 = type { %48 } +%48 = type { %49, %53* } +%49 = type { %50 } +%50 = type { %51 } +%51 = type { %52 } +%52 = type { %53* } +%53 = type { %11, %54, %44*, %56 } +%54 = type { %55 } +%55 = type { %52, %53* } +%56 = type { %57 } +%57 = type { %58 } +%58 = type { %59* } +%59 = type { i8, i8, i16, i32 } +%60 = type { %61 } +%61 = type { %62, %66* } +%62 = type { %63 } +%63 = type { %64 } +%64 = type { %65 } +%65 = type { %66* } +%66 = type { %6, %67, %37* } +%67 = type { %65, %66* } +%68 = type { %69* } +%69 = type opaque +%70 = type { %71 } +%71 = type { %72, %77* } +%72 = type { %73 } +%73 = type { %74 } +%74 = type { %75 } +%75 = type { %76, %77* } +%76 = type { %77* } +%77 = type { %78, %75 } +%78 = type { %9 } +%79 = type { %80 } +%80 = type { %81, %86* } +%81 = type { %82 } +%82 = type { %83 } +%83 = type { %84 } +%84 = type { %85, %86* } +%85 = type { %86* } +%86 = type { %78, %84 } +%87 = type { %88 } +%88 = type { %89, %92* } +%89 = type { %90 } +%90 = type { %91, %92* } +%91 = type { %92* } +%92 = type { %90, %12, %18*, i8* } +%93 = type opaque +%94 = type <{ %95, %97, [3 x i8] }> +%95 = type { %96**, i32, i32, i32, i32 } +%96 = type { i32 } +%97 = type { i8 } +%98 = type { %99 } +%99 = type { %100 } +%100 = type { %101* } +%101 = type opaque +%102 = type { i8, i32, i8, [3 x i8], %103, %111, %12, %118, i8* } +%103 = type { %104, %110 } +%104 = type { %105 } +%105 = type { %106 } +%106 = type <{ %107, %108 }> +%107 = type { i8*, i8*, i8* } +%108 = type { %109 } +%109 = type { [1 x i8] } +%110 = type { [7 x %108] } +%111 = type { %112, %117 } +%112 = type { %113 } +%113 = type { %114 } +%114 = type { %107, %115 } +%115 = type { %116 } +%116 = type { [8 x i8] } +%117 = type { [15 x %115] } +%118 = type { %119, %124 } +%119 = type { %120 } +%120 = type { %121 } +%121 = type { %107, %122 } +%122 = type { %123 } +%123 = type { [16 x i8] } +%124 = type { [7 x %122] } + +; Function 
Attrs: argmemonly nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: optsize +define %0* @LLVMGetReturnType(%0*) #1 { + %2 = alloca %1*, align 4 + %3 = bitcast %1** %2 to i8* + call void @llvm.lifetime.start(i64 4, i8* %3) + %4 = bitcast %1** %2 to %0** + store %0* %0, %0** %4, align 4 + %5 = call zeroext i1 @_ZN4llvm13isa_impl_wrapINS_12FunctionTypeEKPNS_4TypeEPKS2_E4doitERS4_(%1** nonnull dereferenceable(4) %2) #3 + %6 = bitcast %0* %0 to %4* + br i1 %5, label %8, label %7 + +;